From bcef542d5b5411c2111a605dfa288461a2159b3a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 15 Apr 2025 16:25:58 +0200 Subject: [PATCH 01/55] pageserver: don't rewrite invisible layers during ancestor compaction (#11580) ## Problem Shard ancestor compaction can be very expensive following shard splits of large tenants. We currently rewrite garbage layers after shard splits as well, which can be a significant amount of data. Touches https://github.com/neondatabase/cloud/issues/22532. ## Summary of changes Don't rewrite invisible layers after shard splits. --- pageserver/src/tenant/timeline/compaction.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 3d5f11aeb9..5d5149e2d4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -56,7 +56,8 @@ use crate::tenant::storage_layer::batch_split_writer::{ use crate::tenant::storage_layer::filter_iterator::FilterIterator; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ - AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, + AsLayerDesc, LayerVisibilityHint, PersistentLayerDesc, PersistentLayerKey, + ValueReconstructState, }; use crate::tenant::tasks::log_compaction_error; use crate::tenant::timeline::{ @@ -1348,12 +1349,19 @@ impl Timeline { continue; } + // We do not yet implement rewrite of delta layers. if layer_desc.is_delta() { - // We do not yet implement rewrite of delta layers debug!(%layer, "Skipping rewrite of delta layer"); continue; } + // We don't bother rewriting layers that aren't visible, since these won't be needed by + // reads and will likely be garbage collected soon. + if layer.visibility() != LayerVisibilityHint::Visible { + debug!(%layer, "Skipping rewrite of invisible layer"); + continue; + } + // Only rewrite layers if their generations differ. This guarantees: // - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one // - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage From 983d56502bb84d18288cc6498a258896e858a38c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 15 Apr 2025 16:26:29 +0200 Subject: [PATCH 02/55] pageserver: reduce shard ancestor rewrite threshold to 30% (#11582) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem When doing power-of-two shard splits (i.e. 4 → 8 → 16), we end up rewriting all layers since half of the pages will be local due to striping. This causes a lot of resource usage when splitting large tenants. ## Summary of changes Drop the threshold of local/total pages to 30%, to reduce the amount of layer rewrites after splits. --- pageserver/src/tenant/timeline/compaction.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5d5149e2d4..76c153d60f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -70,6 +70,13 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. 
const COMPACTION_DELTA_THRESHOLD: usize = 5; +/// Ratio of shard-local pages below which we trigger shard ancestor layer rewrites. 0.3 means that +/// <= 30% of layer pages must belong to the descendant shard to rewrite the layer. +/// +/// We choose a value < 0.5 to avoid rewriting all visible layers every time we do a power-of-two +/// shard split, which gets expensive for large tenants. +const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3; + #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub struct GcCompactionJobId(pub usize); @@ -1330,14 +1337,15 @@ impl Timeline { continue; } - // Don't bother re-writing a layer unless it will at least halve its size + // Only rewrite a layer if we can reclaim significant space. if layer_local_page_count != u32::MAX - && layer_local_page_count > layer_raw_page_count / 2 + && layer_local_page_count as f64 / layer_raw_page_count as f64 + <= ANCESTOR_COMPACTION_REWRITE_THRESHOLD { debug!(%layer, - "layer is already mostly local ({}/{}), not rewriting", - layer_local_page_count, - layer_raw_page_count + "layer has a large share of local pages \ + ({layer_local_page_count}/{layer_raw_page_count} > \ + {ANCESTOR_COMPACTION_REWRITE_THRESHOLD}), not rewriting", ); } From 0f7c2cc382af8bad3d796a485b125f9eacd3cd8b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 15 Apr 2025 16:08:05 +0100 Subject: [PATCH 03/55] CI(release): add time to RC PR branch names (#11547) ## Problem We can't have more than one open release PR created on the same day (due to non-unique enough branch names). ## Summary of changes - Add time (hours and minutes) to RC PR branch names - Also make sure we use UTC for releases --- .github/workflows/_create-release-pr.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index bfbb45e30b..f96ed7d69b 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -53,10 +53,13 @@ jobs: || inputs.component-name == 'Compute' && 'release-compute' }} run: | - today=$(date +'%Y-%m-%d') - echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT} - echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT} - echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT} + now_date=$(date -u +'%Y-%m-%d') + now_time=$(date -u +'%H-%M-%Z') + { + echo "title=${COMPONENT_NAME} release ${now_date}" + echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}" + echo "release-branch=${RELEASE_BRANCH}" + } | tee -a ${GITHUB_OUTPUT} - name: Configure git run: | From 931f8c43003275a4f90c8ca1b439e92455881830 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 15 Apr 2025 11:16:16 -0400 Subject: [PATCH 04/55] fix(pageserver): check if cancelled before waiting logical size (2/2) (#11575) ## Problem close https://github.com/neondatabase/neon/issues/11486, proceeding https://github.com/neondatabase/neon/pull/11531 ## Summary of changes This patch fixes the rest 50% of instability of `test_create_churn_during_restart`. During tenant warmup, we'll request logical size; however, if the startup gets cancelled, we won't be able to spawn the initial logical size calculation task that sets the `cancel_wait_for_background_loop_concurrency_limit_semaphore`. Therefore, we check `cancelled` before proceeding to get `cancel_wait_for_background_loop_concurrency_limit_semaphore`. 
There will still be a race if the timeline shutdown happens after L5710 and before L5711, but it should be enough to reduce the flakiness of the test. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c27a4b62da..613834dc88 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5702,6 +5702,12 @@ impl Timeline { return; } + if self.cancel.is_cancelled() { + // We already requested stopping the tenant, so we cannot wait for the logical size + // calculation to complete given the task might have been already cancelled. + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore From c5115518e93022e3231a312444201545af10f245 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 15 Apr 2025 17:29:15 +0200 Subject: [PATCH 05/55] remove temp file from repo (#11586) ## Problem In https://github.com/neondatabase/neon/pull/11409 we added temp file to the repo. ## Summary of changes Remove temp file from the repo. --- explained_queries.sql | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 explained_queries.sql diff --git a/explained_queries.sql b/explained_queries.sql deleted file mode 100644 index e69de29bb2..0000000000 From eadb05f78e70ad4e1eb474e6ae3194d840fe99c5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Apr 2025 12:27:49 -0500 Subject: [PATCH 06/55] Teach neon_local to pass the Authorization header to compute_ctl (#11490) This allows us to remove hacks in the compute_ctl authorization middleware which allowed for bypasses of auth checks. Fixes: https://github.com/neondatabase/neon/issues/11316 Signed-off-by: Tristan Partin --- Cargo.lock | 14 +++- Cargo.toml | 2 + .../src/http/middleware/authorize.rs | 6 +- control_plane/Cargo.toml | 5 ++ control_plane/src/bin/neon_local.rs | 18 +++++ control_plane/src/endpoint.rs | 77 +++++++++++++++---- control_plane/src/local_env.rs | 33 +++++++- control_plane/src/storage_controller.rs | 17 ++-- libs/compute_api/src/responses.rs | 4 +- libs/http-utils/Cargo.toml | 1 + libs/http-utils/src/endpoint.rs | 3 +- libs/utils/Cargo.toml | 1 + libs/utils/src/auth.rs | 32 +++++--- pageserver/Cargo.toml | 1 + pageserver/src/page_service.rs | 3 +- safekeeper/Cargo.toml | 1 + safekeeper/src/handler.rs | 3 +- workspace_hack/Cargo.toml | 1 + 18 files changed, 178 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5d2cdcea27..5c9170b7de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1416,6 +1416,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", + "base64 0.13.1", "camino", "clap", "comfy-table", @@ -1425,10 +1426,13 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.30", + "jsonwebtoken", "nix 0.27.1", "once_cell", "pageserver_api", "pageserver_client", + "pem", + "pkcs8 0.10.2", "postgres_backend", "postgres_connection", "regex", @@ -1437,6 +1441,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sha2", "storage_broker", "thiserror 1.0.69", "tokio", @@ -2817,6 +2822,7 @@ dependencies = [ "hyper 0.14.30", "itertools 0.10.5", "jemalloc_pprof", + "jsonwebtoken", "metrics", "once_cell", "pprof", @@ -4269,6 +4275,7 @@ dependencies = [ "hyper 0.14.30", "indoc", "itertools 0.10.5", + "jsonwebtoken", "md5", "metrics", "nix 0.27.1", @@ -5685,9 +5692,9 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.13" +version 
= "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", @@ -5988,6 +5995,7 @@ dependencies = [ "humantime", "hyper 0.14.30", "itertools 0.10.5", + "jsonwebtoken", "metrics", "once_cell", "pageserver_api", @@ -7872,6 +7880,7 @@ dependencies = [ "metrics", "nix 0.27.1", "once_cell", + "pem", "pin-project-lite", "postgres_connection", "pprof", @@ -8460,6 +8469,7 @@ dependencies = [ "once_cell", "p256 0.13.2", "parquet", + "pkcs8 0.10.2", "prettyplease", "proc-macro2", "prost 0.13.3", diff --git a/Cargo.toml b/Cargo.toml index d957fa9070..8fac3bb46c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -141,7 +141,9 @@ parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } +pem = "3.0.3" pin-project-lite = "0.2" +pkcs8 = "0.10.2" pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index f221752c38..f1137de0ab 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -54,8 +54,8 @@ impl AsyncAuthorizeRequest for Authorize { Box::pin(async move { let request_id = request.extract_parts::().await.unwrap(); - // TODO: Remove this stanza after teaching neon_local and the - // regression tests to use a JWT + JWKS. + // TODO(tristan957): Remove this stanza after teaching neon_local + // and the regression tests to use a JWT + JWKS. 
// // https://github.com/neondatabase/neon/issues/11316 if cfg!(feature = "testing") { @@ -112,6 +112,8 @@ impl Authorize { token: &str, validation: &Validation, ) -> Result> { + debug_assert!(!jwks.keys.is_empty()); + debug!("verifying token {}", token); for jwk in jwks.keys.iter() { diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 162c49ec7c..a0ea216d9c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,13 +6,17 @@ license.workspace = true [dependencies] anyhow.workspace = true +base64.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true futures.workspace = true humantime.workspace = true +jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true +pem.workspace = true +pkcs8.workspace = true humantime-serde.workspace = true hyper0.workspace = true regex.workspace = true @@ -20,6 +24,7 @@ reqwest = { workspace = true, features = ["blocking", "json"] } scopeguard.workspace = true serde.workspace = true serde_json.workspace = true +sha2.workspace = true thiserror.workspace = true toml.workspace = true toml_edit.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index db9715dc62..950b264163 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -552,6 +552,7 @@ enum EndpointCmd { Start(EndpointStartCmdArgs), Reconfigure(EndpointReconfigureCmdArgs), Stop(EndpointStopCmdArgs), + GenerateJwt(EndpointGenerateJwtCmdArgs), } #[derive(clap::Args)] @@ -699,6 +700,13 @@ struct EndpointStopCmdArgs { mode: String, } +#[derive(clap::Args)] +#[clap(about = "Generate a JWT for an endpoint")] +struct EndpointGenerateJwtCmdArgs { + #[clap(help = "Postgres endpoint id")] + endpoint_id: String, +} + #[derive(clap::Subcommand)] #[clap(about = "Manage neon_local branch name mappings")] enum MappingsCmd { @@ -1528,6 +1536,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; endpoint.stop(&args.mode, args.destroy)?; } + EndpointCmd::GenerateJwt(args) => { + let endpoint_id = &args.endpoint_id; + let endpoint = cplane + .endpoints + .get(endpoint_id) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; + let jwt = endpoint.generate_jwt()?; + + println!("{jwt}"); + } } Ok(()) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 2fa7a62f8f..0fe6975a6e 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -42,22 +42,29 @@ use std::path::PathBuf; use std::process::Command; use std::str::FromStr; use std::sync::Arc; -use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; +use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; -use compute_api::requests::ConfigurationRequest; +use compute_api::requests::{ComputeClaims, ConfigurationRequest}; use compute_api::responses::{ - ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, + ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig, }; use compute_api::spec::{ Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role, }; +use jsonwebtoken::jwk::{ + AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations, + OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse, +}; use nix::sys::signal::{Signal, kill}; use 
pageserver_api::shard::ShardStripeSize; +use pem::Pem; +use pkcs8::der::Decode; use reqwest::header::CONTENT_TYPE; use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -82,6 +89,7 @@ pub struct EndpointConf { drop_subscriptions_before_start: bool, features: Vec, cluster: Option, + compute_ctl_config: ComputeCtlConfig, } // @@ -137,6 +145,36 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } + /// Create a JSON Web Key Set. This ideally matches the way we create a JWKS + /// from the production control plane. + fn create_jwks_from_pem(pem: Pem) -> Result { + let document = pkcs8::Document::from_der(&pem.into_contents())?; + + let mut hasher = Sha256::new(); + hasher.update(&document); + let key_hash = hasher.finalize(); + + Ok(JwkSet { + keys: vec![Jwk { + common: CommonParameters { + public_key_use: Some(PublicKeyUse::Signature), + key_operations: Some(vec![KeyOperations::Verify]), + key_algorithm: Some(KeyAlgorithm::EdDSA), + key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)), + x509_url: None::, + x509_chain: None::>, + x509_sha1_fingerprint: None::, + x509_sha256_fingerprint: None::, + }, + algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters { + key_type: OctetKeyPairType::OctetKeyPair, + curve: EllipticCurve::Ed25519, + x: base64::encode_config(&document, base64::URL_SAFE_NO_PAD), + }), + }], + }) + } + #[allow(clippy::too_many_arguments)] pub fn new_endpoint( &mut self, @@ -154,6 +192,10 @@ impl ComputeControlPlane { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1); + let compute_ctl_config = ComputeCtlConfig { + jwks: Self::create_jwks_from_pem(self.env.read_public_key()?)?, + tls: None::, + }; let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port), @@ -181,6 +223,7 @@ impl ComputeControlPlane { reconfigure_concurrency: 1, features: vec![], cluster: None, + compute_ctl_config: compute_ctl_config.clone(), }); ep.create_endpoint_dir()?; @@ -200,6 +243,7 @@ impl ComputeControlPlane { reconfigure_concurrency: 1, features: vec![], cluster: None, + compute_ctl_config, })?, )?; std::fs::write( @@ -242,7 +286,6 @@ impl ComputeControlPlane { /////////////////////////////////////////////////////////////////////////////// -#[derive(Debug)] pub struct Endpoint { /// used as the directory name endpoint_id: String, @@ -271,6 +314,9 @@ pub struct Endpoint { features: Vec, // Cluster settings cluster: Option, + + /// The compute_ctl config for the endpoint's compute. + compute_ctl_config: ComputeCtlConfig, } #[derive(PartialEq, Eq)] @@ -333,6 +379,7 @@ impl Endpoint { drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, cluster: conf.cluster, + compute_ctl_config: conf.compute_ctl_config, }) } @@ -580,6 +627,13 @@ impl Endpoint { Ok(safekeeper_connstrings) } + /// Generate a JWT with the correct claims. 
+ pub fn generate_jwt(&self) -> Result { + self.env.generate_auth_token(&ComputeClaims { + compute_id: self.endpoint_id.clone(), + }) + } + #[allow(clippy::too_many_arguments)] pub async fn start( &self, @@ -706,7 +760,7 @@ impl Endpoint { ComputeConfig { spec: Some(spec), - compute_ctl_config: ComputeCtlConfig::default(), + compute_ctl_config: self.compute_ctl_config.clone(), } }; @@ -774,16 +828,7 @@ impl Endpoint { ]) // TODO: It would be nice if we generated compute IDs with the same // algorithm as the real control plane. - .args([ - "--compute-id", - &format!( - "compute-{}", - SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() - ), - ]) + .args(["--compute-id", &self.endpoint_id]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); @@ -881,6 +926,7 @@ impl Endpoint { self.external_http_address.port() ), ) + .bearer_auth(self.generate_jwt()?) .send() .await?; @@ -957,6 +1003,7 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") + .bearer_auth(self.generate_jwt()?) .body( serde_json::to_string(&ConfigurationRequest { spec, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index fa10abe91a..b7906e5f81 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,6 +12,7 @@ use std::{env, fs}; use anyhow::{Context, bail}; use clap::ValueEnum; +use pem::Pem; use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -56,6 +57,7 @@ pub struct LocalEnv { // used to issue tokens during e.g pg start pub private_key_path: PathBuf, + /// Path to environment's public key pub public_key_path: PathBuf, pub broker: NeonBroker, @@ -758,11 +760,11 @@ impl LocalEnv { // this function is used only for testing purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &S) -> anyhow::Result { - let private_key_path = self.get_private_key_path(); - let key_data = fs::read(private_key_path)?; - encode_from_key_file(claims, &key_data) + let key = self.read_private_key()?; + encode_from_key_file(claims, &key) } + /// Get the path to the private key. pub fn get_private_key_path(&self) -> PathBuf { if self.private_key_path.is_absolute() { self.private_key_path.to_path_buf() @@ -771,6 +773,29 @@ impl LocalEnv { } } + /// Get the path to the public key. + pub fn get_public_key_path(&self) -> PathBuf { + if self.public_key_path.is_absolute() { + self.public_key_path.to_path_buf() + } else { + self.base_data_dir.join(&self.public_key_path) + } + } + + /// Read the contents of the private key file. + pub fn read_private_key(&self) -> anyhow::Result { + let private_key_path = self.get_private_key_path(); + let pem = pem::parse(fs::read(private_key_path)?)?; + Ok(pem) + } + + /// Read the contents of the public key file. + pub fn read_public_key(&self) -> anyhow::Result { + let public_key_path = self.get_public_key_path(); + let pem = pem::parse(fs::read(public_key_path)?)?; + Ok(pem) + } + /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. 
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { let base_path = base_path(); @@ -956,6 +981,7 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow String::from_utf8_lossy(&keygen_output.stderr) ); } + // Extract the public key from the private key file // // openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem @@ -972,6 +998,7 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow String::from_utf8_lossy(&keygen_output.stderr) ); } + Ok(()) } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index a4b56ae5c0..62ad5fa8d6 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -18,6 +18,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; +use pem::Pem; use postgres_backend::AuthType; use reqwest::{Certificate, Method}; use serde::de::DeserializeOwned; @@ -34,8 +35,8 @@ use crate::local_env::{LocalEnv, NeonStorageControllerConf}; pub struct StorageController { env: LocalEnv, - private_key: Option>, - public_key: Option, + private_key: Option, + public_key: Option, client: reqwest::Client, config: NeonStorageControllerConf, @@ -116,7 +117,9 @@ impl StorageController { AuthType::Trust => (None, None), AuthType::NeonJWT => { let private_key_path = env.get_private_key_path(); - let private_key = fs::read(private_key_path).expect("failed to read private key"); + let private_key = + pem::parse(fs::read(private_key_path).expect("failed to read private key")) + .expect("failed to parse PEM file"); // If pageserver auth is enabled, this implicitly enables auth for this service, // using the same credentials. @@ -138,9 +141,13 @@ impl StorageController { .expect("Empty key dir") .expect("Error reading key dir"); - std::fs::read_to_string(dent.path()).expect("Can't read public key") + pem::parse(std::fs::read_to_string(dent.path()).expect("Can't read public key")) + .expect("Failed to parse PEM file") } else { - std::fs::read_to_string(&public_key_path).expect("Can't read public key") + pem::parse( + std::fs::read_to_string(&public_key_path).expect("Can't read public key"), + ) + .expect("Failed to parse PEM file") }; (Some(private_key), Some(public_key)) } diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 353949736b..b7d6b7ca34 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -160,7 +160,7 @@ pub struct CatalogObjects { pub databases: Vec, } -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct ComputeCtlConfig { /// Set of JSON web keys that the compute can use to authenticate /// communication from the control plane. 
@@ -179,7 +179,7 @@ impl Default for ComputeCtlConfig { } } -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct TlsConfig { pub key_path: String, pub cert_path: String, diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 5f6578f76e..ab9380089b 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -14,6 +14,7 @@ futures.workspace = true hyper0.workspace = true itertools.workspace = true jemalloc_pprof.workspace = true +jsonwebtoken.workspace = true once_cell.workspace = true pprof.workspace = true regex.workspace = true diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index 5588f6d87e..64147f2dd0 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -8,6 +8,7 @@ use bytes::{Bytes, BytesMut}; use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName}; use hyper::http::HeaderValue; use hyper::{Body, Method, Request, Response}; +use jsonwebtoken::TokenData; use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter}; use once_cell::sync::Lazy; use pprof::ProfilerGuardBuilder; @@ -618,7 +619,7 @@ pub fn auth_middleware( })?; let token = parse_token(header_value)?; - let data = auth.decode(token).map_err(|err| { + let data: TokenData = auth.decode(token).map_err(|err| { warn!("Authentication error: {err}"); // Rely on From for ApiError impl err diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index fd2fa63fd0..7b1dc56071 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -29,6 +29,7 @@ futures = { workspace = true } jsonwebtoken.workspace = true nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true +pem.workspace = true pin-project-lite.workspace = true regex.workspace = true serde.workspace = true diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index db4fc5685c..de3a964d23 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -11,7 +11,8 @@ use camino::Utf8Path; use jsonwebtoken::{ Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, }; -use serde::{Deserialize, Serialize}; +use pem::Pem; +use serde::{Deserialize, Serialize, de::DeserializeOwned}; use crate::id::TenantId; @@ -73,7 +74,10 @@ impl SwappableJwtAuth { pub fn swap(&self, jwt_auth: JwtAuth) { self.0.swap(Arc::new(jwt_auth)); } - pub fn decode(&self, token: &str) -> std::result::Result, AuthError> { + pub fn decode( + &self, + token: &str, + ) -> std::result::Result, AuthError> { self.0.load().decode(token) } } @@ -148,7 +152,10 @@ impl JwtAuth { /// The function tries the stored decoding keys in succession, /// and returns the first yielding a successful result. /// If there is no working decoding key, it returns the last error. 
- pub fn decode(&self, token: &str) -> std::result::Result, AuthError> { + pub fn decode( + &self, + token: &str, + ) -> std::result::Result, AuthError> { let mut res = None; for decoding_key in &self.decoding_keys { res = Some(decode(token, decoding_key, &self.validation)); @@ -173,8 +180,8 @@ impl std::fmt::Debug for JwtAuth { } // this function is used only for testing purposes in CLI e g generate tokens during init -pub fn encode_from_key_file(claims: &S, key_data: &[u8]) -> Result { - let key = EncodingKey::from_ed_pem(key_data)?; +pub fn encode_from_key_file(claims: &S, pem: &Pem) -> Result { + let key = EncodingKey::from_ed_der(pem.contents()); Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?) } @@ -188,13 +195,13 @@ mod tests { // // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem - const TEST_PUB_KEY_ED25519: &[u8] = br#" + const TEST_PUB_KEY_ED25519: &str = r#" -----BEGIN PUBLIC KEY----- MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w= -----END PUBLIC KEY----- "#; - const TEST_PRIV_KEY_ED25519: &[u8] = br#" + const TEST_PRIV_KEY_ED25519: &str = r#" -----BEGIN PRIVATE KEY----- MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH -----END PRIVATE KEY----- @@ -222,9 +229,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH // Check it can be validated with the public key let auth = JwtAuth::new(vec![ - DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(), ]); - let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; + let claims_from_token: Claims = auth.decode(encoded_eddsa).unwrap().claims; assert_eq!(claims_from_token, expected_claims); } @@ -235,13 +242,14 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH scope: Scope::Tenant, }; - let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); + let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap(); + let encoded = encode_from_key_file(&claims, &pem).unwrap(); // decode it back let auth = JwtAuth::new(vec![ - DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(), ]); - let decoded = auth.decode(&encoded).unwrap(); + let decoded: TokenData = auth.decode(&encoded).unwrap(); assert_eq!(decoded.claims, claims); } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 74f3fce6e5..5c5bab0642 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -35,6 +35,7 @@ humantime.workspace = true humantime-serde.workspace = true hyper0.workspace = true itertools.workspace = true +jsonwebtoken.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7a62d8049b..560ac75f4a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -15,6 +15,7 @@ use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use futures::FutureExt; use itertools::Itertools; +use jsonwebtoken::TokenData; use once_cell::sync::OnceCell; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, @@ -2837,7 +2838,7 @@ where ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present - let data = self + let data: TokenData = self 
.auth .as_ref() .unwrap() diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 965aa7504b..a0ba69aa34 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -27,6 +27,7 @@ humantime.workspace = true http.workspace = true hyper0.workspace = true itertools.workspace = true +jsonwebtoken.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 5ca3d1b7c2..b54bee8bfb 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -6,6 +6,7 @@ use std::str::{self, FromStr}; use std::sync::Arc; use anyhow::Context; +use jsonwebtoken::TokenData; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_backend::{PostgresBackend, QueryError}; @@ -278,7 +279,7 @@ impl postgres_backend::Handler .auth .as_ref() .expect("auth_type is configured but .auth of handler is missing"); - let data = auth + let data: TokenData = auth .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?) .map_err(|e| QueryError::Unauthorized(e.0))?; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b548a2a88a..2c37cebc27 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -70,6 +70,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } +pkcs8 = { version = "0.10", default-features = false, features = ["pem", "std"] } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } From cd9ad757975277a3ce20065c34d209747086be52 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Apr 2025 14:12:34 -0500 Subject: [PATCH 07/55] Remove compute_ctl authorization bypass on localhost (#11597) For whatever reason, this never worked in production computes anyway. 
Signed-off-by: Tristan Partin --- compute_tools/src/http/middleware/authorize.rs | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index f1137de0ab..e6c3269b15 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -1,7 +1,7 @@ -use std::{collections::HashSet, net::SocketAddr}; +use std::collections::HashSet; use anyhow::{Result, anyhow}; -use axum::{RequestExt, body::Body, extract::ConnectInfo}; +use axum::{RequestExt, body::Body}; use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, @@ -64,19 +64,6 @@ impl AsyncAuthorizeRequest for Authorize { return Ok(request); } - let connect_info = request - .extract_parts::>() - .await - .unwrap(); - - // In the event the request is coming from the loopback interface, - // allow all requests - if connect_info.ip().is_loopback() { - warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface"); - - return Ok(request); - } - let TypedHeader(Authorization(bearer)) = request .extract_parts::>>() .await From 35170656fe4c1c636c5b6715651b7f9064ecdb3e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Apr 2025 22:13:12 +0300 Subject: [PATCH 08/55] Allocate WalProposerConn using TopMemoryAllocator (#11577) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1744659631698609 `WalProposerConn` is allocated using current memory context which life time is not long enough. ## Summary of changes Allocate `WalProposerConn` using `TopMemoryContext`. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/walproposer_pg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 9c34c90002..a061639815 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -890,7 +890,7 @@ libpqwp_connect_start(char *conninfo) * palloc will exit on failure though, so there's not much we could do if * it *did* fail. */ - conn = palloc(sizeof(WalProposerConn)); + conn = (WalProposerConn*)MemoryContextAllocZero(TopMemoryContext, sizeof(WalProposerConn)); conn->pg_conn = pg_conn; conn->is_nonblocking = false; /* connections always start in blocking * mode */ From aa19f10e7e958fbe0e0641f2e8c5952ce3be44b3 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:50:28 -0400 Subject: [PATCH 09/55] fix(test): allow shutdown warning in preempt tests (#11600) ## Problem test_gc_compaction_preempt is still flaky ## Summary of changes - allow shutdown warning logs Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 84d37de9f1..001ddcdcb0 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -199,6 +199,8 @@ def test_pageserver_gc_compaction_preempt( conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy() env = neon_env_builder.init_start(initial_tenant_conf=conf) + env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + tenant_id = env.initial_tenant timeline_id = env.initial_timeline From 96b46365e4520d650acc415f454decd6eed80971 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 16 Apr 2025 11:26:47 +0100 Subject: [PATCH 10/55] tests: attach final metrics to allure report (#11604) ## Problem Metrics are saved in https://github.com/neondatabase/neon/pull/11559, but the file is not matched by the attachment regex. ## Summary of changes Make attachment regex match the metrics file. --- test_runner/fixtures/neon_fixtures.py | 7 +------ test_runner/fixtures/utils.py | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 10bbb7020b..13bd74e05d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -947,8 +947,6 @@ class NeonEnvBuilder: continue if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name): continue - if FINAL_METRICS_FILE_NAME == test_file.name: - continue log.debug(f"Removing large database {test_file} file") test_file.unlink() elif test_entry.is_dir(): @@ -2989,7 +2987,7 @@ class NeonPageserver(PgProtocol, LogUtils): return metrics = self.http_client().get_metrics_str() - metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME + metrics_snapshot_path = self.workdir / "final_metrics.txt" with open(metrics_snapshot_path, "w") as f: f.write(metrics) @@ -5156,9 +5154,6 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile( r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)" ) -FINAL_METRICS_FILE_NAME: str = "final_metrics.txt" - - SKIP_DIRS = frozenset( ( "pg_wal", diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 13c2d320d1..0d7345cc82 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -258,7 +258,7 @@ def get_scale_for_db(size_mb: int) -> int: ATTACHMENT_NAME_REGEX: re.Pattern[str] = re.compile( - r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)" + r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)|final_metrics.txt" ) From b4e26a6284b8dedda229c3d087d706e62e19bb59 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 16 Apr 2025 15:34:18 +0300 Subject: [PATCH 11/55] Set last-written LSN as part of smgr_end_unlogged_build() (#11584) This way, the callers don't need to do it, reducing the footprint of changes we've had to made to various index AM's build functions. 
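For readability — the pgvector and rum hunks below are patches applied to other patches — here is a rough sketch of the unlogged-build call sequence an index AM ends up with after this change. It is an illustration only: the include list, the `smgr_start_unlogged_build()` call, and the helper name are assumptions based on the surrounding patches, not part of this diff.

```c
#include "postgres.h"
#include "access/xloginsert.h"  /* log_newpage_range() */
#include "storage/bufmgr.h"     /* RelationGetNumberOfBlocks() */
#include "utils/rel.h"          /* RelationNeedsWAL(), RelationGetSmgr() */

/*
 * Hypothetical helper showing the sequence; real AMs inline this in their
 * build functions (see the pgvector/rum patches below).
 */
static void
build_unlogged_then_wal_log(Relation index)
{
#ifdef NEON_SMGR
	/* assumed: start of the existing unlogged-build protocol */
	smgr_start_unlogged_build(RelationGetSmgr(index));
#endif

	/* ... populate the index without WAL-logging ... */

#ifdef NEON_SMGR
	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
#endif

	/* WAL-log the finished relation in one go */
	if (RelationNeedsWAL(index))
		log_newpage_range(index, MAIN_FORKNUM,
						  0, RelationGetNumberOfBlocks(index), true);

#ifdef NEON_SMGR
	/*
	 * Drops the local copy; with this patch it also updates the
	 * last-written LSN cache, so the AM no longer needs to call the
	 * set_lwlsn_* hooks itself.
	 */
	smgr_end_unlogged_build(RelationGetSmgr(index));
#endif
}
```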
--- compute/patches/pgvector.patch | 27 ++++++--------------------- compute/patches/rum.patch | 18 +++--------------- pgxn/neon/pagestore_smgr.c | 25 +++++++++++++++++++++++-- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 8 files changed, 40 insertions(+), 46 deletions(-) diff --git a/compute/patches/pgvector.patch b/compute/patches/pgvector.patch index 6fe3d073ed..6a203489fd 100644 --- a/compute/patches/pgvector.patch +++ b/compute/patches/pgvector.patch @@ -15,7 +15,7 @@ index 7a4b88c..56678af 100644 HEADERS = src/halfvec.h src/sparsevec.h src/vector.h diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index b667478..dc95d89 100644 +index b667478..1298aa1 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c @@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) @@ -36,7 +36,7 @@ index b667478..dc95d89 100644 /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); -@@ -1100,12 +1108,39 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, +@@ -1100,13 +1108,25 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif @@ -48,32 +48,17 @@ index b667478..dc95d89 100644 BuildGraph(buildstate, forkNum); -- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) +#ifdef NEON_SMGR + smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + -+ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) { + if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true); -+#ifdef NEON_SMGR -+ { -+#if PG_VERSION_NUM >= 160000 -+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; -+#else -+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; -+#endif -+ if (set_lwlsn_block_range_hook) -+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, -+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); -+ if (set_lwlsn_relation_hook) -+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM); -+ } -+#endif -+ } -+ + +#ifdef NEON_SMGR + smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif - ++ FreeBuildState(buildstate); } + diff --git a/compute/patches/rum.patch b/compute/patches/rum.patch index 5bc5d739b3..b45afe2874 100644 --- a/compute/patches/rum.patch +++ b/compute/patches/rum.patch @@ -1,5 +1,5 @@ diff --git a/src/ruminsert.c b/src/ruminsert.c -index 255e616..7a2240f 100644 +index 255e616..1c6edb7 100644 --- a/src/ruminsert.c +++ b/src/ruminsert.c @@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) @@ -24,24 +24,12 @@ index 255e616..7a2240f 100644 /* * Write index to xlog */ -@@ -713,6 +721,22 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) +@@ -713,6 +721,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) UnlockReleaseBuffer(buffer); } +#ifdef NEON_SMGR -+ { -+#if PG_VERSION_NUM >= 160000 -+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; -+#else -+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; -+#endif -+ if (set_lwlsn_block_range_hook) -+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); -+ if (set_lwlsn_relation_hook) -+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM); -+ -+ smgr_end_unlogged_build(index->rd_smgr); -+ } ++ 
smgr_end_unlogged_build(index->rd_smgr); +#endif + /* diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ef6bd038bb..9fe085c558 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -2040,7 +2040,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) /* * neon_end_unlogged_build() -- Finish an unlogged rel build. * - * Call this after you have finished WAL-logging an relation that was + * Call this after you have finished WAL-logging a relation that was * first populated without WAL-logging. * * This removes the local copy of the rel, since it's now been fully @@ -2059,14 +2059,35 @@ neon_end_unlogged_build(SMgrRelation reln) if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { + XLogRecPtr recptr; + BlockNumber nblocks; + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + /* + * Update the last-written LSN cache. + * + * The relation is still on local disk so we can get the size by + * calling mdnblocks() directly. For the LSN, GetXLogInsertRecPtr() is + * very conservative. If we could assume that this function is called + * from the same backend that WAL-logged the contents, we could use + * XactLastRecEnd here. But better safe than sorry. + */ + nblocks = mdnblocks(reln, MAIN_FORKNUM); + recptr = GetXLogInsertRecPtr(); + + neon_set_lwlsn_block_range(recptr, + InfoFromNInfoB(rinfob), + MAIN_FORKNUM, 0, nblocks); + neon_set_lwlsn_relation(recptr, + InfoFromNInfoB(rinfob), + MAIN_FORKNUM); + /* Make the relation look permanent again */ reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; /* Remove local copy */ - rinfob = InfoBFromSMgrRel(reln); for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a0391901a2..d3c9d61fb7 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a0391901a2af13aa029b905272a5b2024133c926 +Subproject commit d3c9d61fb7a362a165dac7060819dd9d6ad68c28 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aeb292eeac..8ecb12f21d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aeb292eeace9072e07071254b6ffc7a74007d4d2 +Subproject commit 8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d56e79cd5d..37496f87b5 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d56e79cd5d6136c159b1d8d98acb7981d4b69364 +Subproject commit 37496f87b5324af53c56127e278ee5b1e8435253 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 66114c23bc..eab3a37834 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8 +Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66 diff --git a/vendor/revisions.json b/vendor/revisions.json index d7eddf42b7..90d878d0f7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.4", - "66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8" + "eab3a37834cac6ec0719bf817ac918a201712d66" ], "v16": [ "16.8", - "d56e79cd5d6136c159b1d8d98acb7981d4b69364" + "37496f87b5324af53c56127e278ee5b1e8435253" ], "v15": [ "15.12", - "aeb292eeace9072e07071254b6ffc7a74007d4d2" + "8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3" ], "v14": [ "14.17", - "a0391901a2af13aa029b905272a5b2024133c926" + "d3c9d61fb7a362a165dac7060819dd9d6ad68c28" ] } From 
edc11253b65e12a10843711bd88ad277511396d7 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Apr 2025 07:51:48 -0500 Subject: [PATCH 12/55] Fix neon_local public key parsing when create compute JWKS (#11602) Finally figured out the right incantation. I had had this in my original go, but due to some refactoring and apparently missed testing, I committed a mistake. The reason this doesn't currently break anything is that we bypass the authorization middleware when the "testing" cargo feature is enabled. Signed-off-by: Tristan Partin --- Cargo.lock | 3 +-- Cargo.toml | 2 +- control_plane/Cargo.toml | 2 +- control_plane/src/endpoint.rs | 14 ++++++++------ workspace_hack/Cargo.toml | 1 - 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c9170b7de..7ab9378853 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1432,7 +1432,6 @@ dependencies = [ "pageserver_api", "pageserver_client", "pem", - "pkcs8 0.10.2", "postgres_backend", "postgres_connection", "regex", @@ -1442,6 +1441,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "spki 0.7.3", "storage_broker", "thiserror 1.0.69", "tokio", @@ -8469,7 +8469,6 @@ dependencies = [ "once_cell", "p256 0.13.2", "parquet", - "pkcs8 0.10.2", "prettyplease", "proc-macro2", "prost 0.13.3", diff --git a/Cargo.toml b/Cargo.toml index 8fac3bb46c..9d7904a787 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,7 +143,6 @@ parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pem = "3.0.3" pin-project-lite = "0.2" -pkcs8 = "0.10.2" pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency @@ -176,6 +175,7 @@ signal-hook = "0.3" smallvec = "1.11" smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" +spki = "0.7.3" strum = "0.26" strum_macros = "0.26" "subtle" = "2.5.0" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index a0ea216d9c..92f0071bac 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -16,7 +16,6 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pem.workspace = true -pkcs8.workspace = true humantime-serde.workspace = true hyper0.workspace = true regex.workspace = true @@ -25,6 +24,7 @@ scopeguard.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true +spki.workspace = true thiserror.workspace = true toml.workspace = true toml_edit.workspace = true diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 0fe6975a6e..b569b0fb8e 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -60,11 +60,12 @@ use jsonwebtoken::jwk::{ use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use pem::Pem; -use pkcs8::der::Decode; use reqwest::header::CONTENT_TYPE; use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; +use spki::der::Decode; +use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef}; use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -147,11 +148,12 @@ impl ComputeControlPlane { /// Create a JSON Web Key Set. This ideally matches the way we create a JWKS /// from the production control plane. 
- fn create_jwks_from_pem(pem: Pem) -> Result { - let document = pkcs8::Document::from_der(&pem.into_contents())?; + fn create_jwks_from_pem(pem: &Pem) -> Result { + let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?; + let public_key = spki.subject_public_key.raw_bytes(); let mut hasher = Sha256::new(); - hasher.update(&document); + hasher.update(public_key); let key_hash = hasher.finalize(); Ok(JwkSet { @@ -169,7 +171,7 @@ impl ComputeControlPlane { algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters { key_type: OctetKeyPairType::OctetKeyPair, curve: EllipticCurve::Ed25519, - x: base64::encode_config(&document, base64::URL_SAFE_NO_PAD), + x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD), }), }], }) @@ -193,7 +195,7 @@ impl ComputeControlPlane { let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1); let compute_ctl_config = ComputeCtlConfig { - jwks: Self::create_jwks_from_pem(self.env.read_public_key()?)?, + jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?, tls: None::, }; let ep = Arc::new(Endpoint { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 2c37cebc27..b548a2a88a 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -70,7 +70,6 @@ num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } -pkcs8 = { version = "0.10", default-features = false, features = ["pem", "std"] } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } From 2a464261574aa141790564e6a9feb3ca11d63287 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 16 Apr 2025 15:42:22 +0200 Subject: [PATCH 13/55] Update neon GUCs with new default settings (#11595) Staging and prod both have these settings configured like this, so let's update this so we can eventually drop the overrides in prod. --- pgxn/neon/communicator.c | 2 +- pgxn/neon/libpagestore.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 932034e22e..db3e053321 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -95,7 +95,7 @@ static uint32 local_request_counter; * Various settings related to prompt (fast) handling of PageStream responses * at any CHECK_FOR_INTERRUPTS point. 
*/ -int readahead_getpage_pull_timeout_ms = 0; +int readahead_getpage_pull_timeout_ms = 50; static int PS_TIMEOUT_ID = 0; static bool timeout_set = false; static bool timeout_signaled = false; diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index dfabb6919e..19c14511bd 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -75,7 +75,7 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int neon_protocol_version = 2; +int neon_protocol_version = 3; static int neon_compute_mode = 0; static int max_reconnect_attempts = 60; @@ -1432,7 +1432,7 @@ pg_init_libpagestore(void) "PageStream connection when we have pages which " "were read ahead but not yet received.", &readahead_getpage_pull_timeout_ms, - 0, 0, 5 * 60 * 1000, + 50, 0, 5 * 60 * 1000, PGC_USERSET, GUC_UNIT_MS, NULL, NULL, NULL); @@ -1440,7 +1440,7 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 2, /* use protocol version 2 */ + 3, /* use protocol version 3 */ 2, /* min */ 3, /* max */ PGC_SU_BACKEND, From 00eeff9b8d170c925821d051a32f9d38fba0ca69 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 16 Apr 2025 16:41:02 +0200 Subject: [PATCH 14/55] pageserver: add `compaction_shard_ancestor` to disable shard ancestor compaction (#11608) ## Problem Splits of large tenants (several TB) can cause a huge amount of shard ancestor compaction work, which can overload Pageservers. Touches https://github.com/neondatabase/cloud/issues/22532. ## Summary of changes Add a setting `compaction_shard_ancestor` (default `true`) to disable shard ancestor compaction on a per-tenant basis. --- control_plane/src/pageserver.rs | 5 +++++ libs/pageserver_api/src/config.rs | 4 ++++ libs/pageserver_api/src/models.rs | 13 +++++++++++++ pageserver/src/tenant/timeline.rs | 8 ++++++++ pageserver/src/tenant/timeline/compaction.rs | 3 +-- test_runner/regress/test_attach_tenant_config.py | 1 + 6 files changed, 32 insertions(+), 2 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5c985e6dc8..b9257a27bf 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -413,6 +413,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + compaction_shard_ancestor: settings + .remove("compaction_shard_ancestor") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_shard_ancestor' as a bool")?, compaction_l0_first: settings .remove("compaction_l0_first") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 53b68afb0f..e734b07c38 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -379,6 +379,8 @@ pub struct TenantConfigToml { /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// If true, enable shard ancestor compaction (enabled by default). + pub compaction_shard_ancestor: bool, /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0 /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true. 
pub compaction_l0_first: bool, @@ -677,6 +679,7 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true; // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could @@ -734,6 +737,7 @@ impl Default for TenantConfigToml { compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR, compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST, compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, l0_flush_delay_threshold: None, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f491ed10e1..ea5456e04b 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -526,6 +526,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_shard_ancestor: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_l0_first: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_l0_semaphore: FieldPatch, @@ -615,6 +617,9 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub compaction_shard_ancestor: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub compaction_l0_first: Option, @@ -724,6 +729,7 @@ impl TenantConfig { mut compaction_threshold, mut compaction_upper_limit, mut compaction_algorithm, + mut compaction_shard_ancestor, mut compaction_l0_first, mut compaction_l0_semaphore, mut l0_flush_delay_threshold, @@ -772,6 +778,9 @@ impl TenantConfig { .compaction_upper_limit .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch + .compaction_shard_ancestor + .apply(&mut compaction_shard_ancestor); patch.compaction_l0_first.apply(&mut compaction_l0_first); patch .compaction_l0_semaphore @@ -860,6 +869,7 @@ impl TenantConfig { compaction_threshold, compaction_upper_limit, compaction_algorithm, + compaction_shard_ancestor, compaction_l0_first, compaction_l0_semaphore, l0_flush_delay_threshold, @@ -920,6 +930,9 @@ impl TenantConfig { .as_ref() .unwrap_or(&global_conf.compaction_algorithm) .clone(), + compaction_shard_ancestor: self + .compaction_shard_ancestor + .unwrap_or(global_conf.compaction_shard_ancestor), compaction_l0_first: self .compaction_l0_first .unwrap_or(global_conf.compaction_l0_first), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 613834dc88..bc54c85119 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2702,6 +2702,14 @@ impl Timeline { .clone() } + pub fn get_compaction_shard_ancestor(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .compaction_shard_ancestor + .unwrap_or(self.conf.default_tenant_conf.compaction_shard_ancestor) + } + fn get_eviction_policy(&self) -> EvictionPolicy { let tenant_conf = self.tenant_conf.load(); tenant_conf diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 76c153d60f..92b24a73c9 
100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1239,8 +1239,7 @@ impl Timeline { let partition_count = self.partitioning.read().0.0.parts.len(); // 4. Shard ancestor compaction - - if self.shard_identity.count >= ShardCount::new(2) { + if self.get_compaction_shard_ancestor() && self.shard_identity.count >= ShardCount::new(2) { // Limit the number of layer rewrites to the number of partitions: this means its // runtime should be comparable to a full round of image layer creations, rather than // being potentially much longer. diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 9b6930695c..ee408e3c65 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -155,6 +155,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_algorithm": { "kind": "tiered", }, + "compaction_shard_ancestor": False, "eviction_policy": { "kind": "LayerAccessThreshold", "period": "20s", From 46100717ad4a7d9d7844233933833320b003db9f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 16 Apr 2025 17:38:10 +0200 Subject: [PATCH 15/55] pageserver: add `VectoredBlob::raw_with_header` (#11607) ## Problem To avoid recompressing page images during layer filtering, we need access to the raw header and data from vectored reads such that we can pass them through to the target layer. Touches #11562. ## Summary of changes Adds `VectoredBlob::raw_with_header()` to return a raw view of the header+data, and updates `read()` to track it. Also adds `blob_io::Header` with header metadata and decode logic, to reuse for tests and assertions. This isn't yet widely used. --- pageserver/src/tenant/blob_io.rs | 57 +++++++++++++++++ pageserver/src/tenant/vectored_blob_io.rs | 77 +++++++++++------------ 2 files changed, 93 insertions(+), 41 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index abeaa166a4..d1dd105b13 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -37,6 +37,63 @@ pub struct CompressionInfo { pub compressed_size: Option, } +/// A blob header, with header+data length and compression info. +/// +/// TODO: use this more widely, and add an encode() method too. +/// TODO: document the header format. +#[derive(Clone, Copy, Default)] +pub struct Header { + pub header_len: usize, + pub data_len: usize, + pub compression_bits: u8, +} + +impl Header { + /// Decodes a header from a byte slice. + pub fn decode(bytes: &[u8]) -> Result { + let Some(&first_header_byte) = bytes.first() else { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "zero-length blob header", + )); + }; + + // If the first bit is 0, this is just a 1-byte length prefix up to 128 bytes. + if first_header_byte < 0x80 { + return Ok(Self { + header_len: 1, // by definition + data_len: first_header_byte as usize, + compression_bits: BYTE_UNCOMPRESSED, + }); + } + + // Otherwise, this is a 4-byte header containing compression information and length. + const HEADER_LEN: usize = 4; + let mut header_buf: [u8; HEADER_LEN] = bytes[0..HEADER_LEN].try_into().map_err(|_| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("blob header too short: {bytes:?}"), + ) + })?; + + // TODO: verify the compression bits and convert to an enum. 
+ let compression_bits = header_buf[0] & LEN_COMPRESSION_BIT_MASK; + header_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + let data_len = u32::from_be_bytes(header_buf) as usize; + + Ok(Self { + header_len: HEADER_LEN, + data_len, + compression_bits, + }) + } + + /// Returns the total header+data length. + pub fn total_len(&self) -> usize { + self.header_len + self.data_len + } +} + impl BlockCursor<'_> { /// Read a blob into a new buffer. pub async fn read_blob( diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 166917d674..8e535a55d7 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -26,7 +26,7 @@ use utils::lsn::Lsn; use utils::vec_map::VecMap; use crate::context::RequestContext; -use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; +use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, Header}; use crate::virtual_file::{self, IoBufferMut, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. @@ -111,18 +111,20 @@ impl From for BufView<'_> { pub struct VectoredBlob { /// Blob metadata. pub meta: BlobMeta, - /// Start offset. - start: usize, + /// Header start offset. + header_start: usize, + /// Data start offset. + data_start: usize, /// End offset. end: usize, - /// Compression used on the the blob. + /// Compression used on the data, extracted from the header. compression_bits: u8, } impl VectoredBlob { /// Reads a decompressed view of the blob. pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result, std::io::Error> { - let view = buf.view(self.start..self.end); + let view = buf.view(self.data_start..self.end); match self.compression_bits { BYTE_UNCOMPRESSED => Ok(view), @@ -140,13 +142,19 @@ impl VectoredBlob { std::io::ErrorKind::InvalidData, format!( "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", - self.meta.key, self.meta.lsn, self.start, self.end + self.meta.key, self.meta.lsn, self.data_start, self.end ), ); Err(error) } } } + + /// Returns the raw blob including header. + #[allow(unused)] + pub(crate) fn raw_with_header<'a>(&self, buf: &BufView<'a>) -> BufView<'a> { + buf.view(self.header_start..self.end) + } } impl std::fmt::Display for VectoredBlob { @@ -154,7 +162,7 @@ impl std::fmt::Display for VectoredBlob { write!( f, "{}@{}, {}..{}", - self.meta.key, self.meta.lsn, self.start, self.end + self.meta.key, self.meta.lsn, self.data_start, self.end ) } } @@ -493,50 +501,28 @@ impl<'a> VectoredBlobReader<'a> { let blobs_at = read.blobs_at.as_slice(); - let start_offset = read.start; - - let mut metas = Vec::with_capacity(blobs_at.len()); + let mut blobs = Vec::with_capacity(blobs_at.len()); // Blobs in `read` only provide their starting offset. The end offset // of a blob is implicit: the start of the next blob if one exists // or the end of the read. - for (blob_start, meta) in blobs_at { - let blob_start_in_buf = blob_start - start_offset; - let first_len_byte = buf[blob_start_in_buf as usize]; + for (blob_start, meta) in blobs_at.iter().copied() { + let header_start = (blob_start - read.start) as usize; + let header = Header::decode(&buf[header_start..])?; + let data_start = header_start + header.header_len; + let end = data_start + header.data_len; + let compression_bits = header.compression_bits; - // Each blob is prefixed by a header containing its size and compression information. - // Extract the size and skip that header to find the start of the data. 
- // The size can be 1 or 4 bytes. The most significant bit is 0 in the - // 1 byte case and 1 in the 4 byte case. - let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 { - (1, first_len_byte as u64, BYTE_UNCOMPRESSED) - } else { - let mut blob_size_buf = [0u8; 4]; - let offset_in_buf = blob_start_in_buf as usize; - - blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); - blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; - - let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - ( - 4, - u32::from_be_bytes(blob_size_buf) as u64, - compression_bits, - ) - }; - - let start = (blob_start_in_buf + size_length) as usize; - let end = start + blob_size as usize; - - metas.push(VectoredBlob { - start, + blobs.push(VectoredBlob { + header_start, + data_start, end, - meta: *meta, + meta, compression_bits, }); } - Ok(VectoredBlobsBuf { buf, blobs: metas }) + Ok(VectoredBlobsBuf { buf, blobs }) } } @@ -997,6 +983,15 @@ mod tests { &read_buf[..], "mismatch for idx={idx} at offset={offset}" ); + + // Check that raw_with_header returns a valid header. + let raw = read_blob.raw_with_header(&view); + let header = Header::decode(&raw)?; + if !compression || header.header_len == 1 { + assert_eq!(header.compression_bits, BYTE_UNCOMPRESSED); + } + assert_eq!(raw.len(), header.total_len()); + buf = result.buf; } Ok(()) From 7747a9619f2c2b5c91e9f7581e7f1c8adf70b035 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 16 Apr 2025 16:55:11 +0100 Subject: [PATCH 16/55] compute: fix copy-paste typo for neon GUC parameters check (#11610) fix for commit [5063151](https://github.com/neondatabase/neon/commit/50631512710d8c5fd9c4c681d7882e16f4df93f3) --- pgxn/neon/libpagestore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 19c14511bd..64d38e7913 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -1362,7 +1362,7 @@ pg_init_libpagestore(void) "", PGC_POSTMASTER, 0, /* no flags required */ - check_neon_id, NULL, NULL); + NULL, NULL, NULL); DefineCustomStringVariable("neon.branch_id", "Neon branch_id the server is running on", NULL, @@ -1370,7 +1370,7 @@ pg_init_libpagestore(void) "", PGC_POSTMASTER, 0, /* no flags required */ - check_neon_id, NULL, NULL); + NULL, NULL, NULL); DefineCustomStringVariable("neon.endpoint_id", "Neon endpoint_id the server is running on", NULL, @@ -1378,7 +1378,7 @@ pg_init_libpagestore(void) "", PGC_POSTMASTER, 0, /* no flags required */ - check_neon_id, NULL, NULL); + NULL, NULL, NULL); DefineCustomIntVariable("neon.stripe_size", "sharding stripe size", From 0e00faf52891f36db29b471a5a7ad89b8a123b21 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 16 Apr 2025 17:31:23 +0100 Subject: [PATCH 17/55] tests: stability fixes for `test_migration_to_cold_secondary` (#11606) 1. Compute may generate WAL on shutdown. The test assumes that after shutdown, no further ingest happens. Tweak the compute shutdown to make the assumption true. 2. Assertion of local layer count post cold migration is not right since we may have downloaded layers due to ingest. Remove it. 
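(Aside on the blob header format from PATCH 15 above.) The new `blob_io::Header::decode` distinguishes a 1-byte length prefix from a 4-byte header whose first byte also carries the compression bits. Below is a minimal, self-contained sketch of that layout; the concrete constant values (the `0xf0` mask and `0x80` "uncompressed" marker) are assumptions for illustration, and the struct is not the pageserver's actual type.

```rust
// Standalone sketch of the 1-byte vs. 4-byte blob header parsed by Header::decode.
// Constant values are illustrative assumptions, not copied from the crate.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const BYTE_UNCOMPRESSED: u8 = 0x80;

struct BlobHeader {
    header_len: usize,
    data_len: usize,
    compression_bits: u8,
}

fn decode_blob_header(bytes: &[u8]) -> Option<BlobHeader> {
    let &first = bytes.first()?;
    if first < 0x80 {
        // Short form: a single length byte, blob is stored uncompressed.
        return Some(BlobHeader {
            header_len: 1,
            data_len: first as usize,
            compression_bits: BYTE_UNCOMPRESSED,
        });
    }
    // Long form: 4 bytes; the compression bits live in the masked part of byte 0,
    // the remaining bits are a big-endian length.
    let mut buf: [u8; 4] = bytes.get(0..4)?.try_into().ok()?;
    let compression_bits = buf[0] & LEN_COMPRESSION_BIT_MASK;
    buf[0] &= !LEN_COMPRESSION_BIT_MASK;
    Some(BlobHeader {
        header_len: 4,
        data_len: u32::from_be_bytes(buf) as usize,
        compression_bits,
    })
}

fn main() {
    let short = decode_blob_header(&[0x7f]).unwrap();
    assert_eq!((short.header_len, short.data_len), (1, 127));

    let long = decode_blob_header(&[0x80, 0x00, 0x01, 0x00]).unwrap();
    assert_eq!((long.header_len, long.data_len), (4, 256));
    assert_eq!(long.compression_bits, BYTE_UNCOMPRESSED);
}
```

This is also why `raw_with_header()` can hand bytes straight to another layer: header plus data form one contiguous `header_len + data_len` span, which `Header::total_len()` reports.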
Closes https://github.com/neondatabase/neon/issues/11587 --- test_runner/regress/test_pageserver_secondary.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index d48e731394..3aa0c63979 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -14,6 +14,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, NeonPageserver, StorageControllerMigrationConfig, + flush_ep_to_pageserver, ) from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( @@ -997,10 +998,6 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_heatmap_upload(tenant_id) heatmap_after_migration = timeline_heatmap(timeline_id) - local_layers = ps_secondary.list_layers(tenant_id, timeline_id) - # We download 1 layer per second and give up within 5 seconds. - assert len(local_layers) < 10 - after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") @@ -1038,9 +1035,14 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): .value ) - workload.stop() assert before == after + # Stop the endpoint and wait until any finally written WAL propagates to + # the pageserver and is uploaded to remote storage. + flush_ep_to_pageserver(env, workload.endpoint(), tenant_id, timeline_id) + ps_secondary.http_client().timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + workload.stop() + # Now simulate the case where a child timeline is archived, parent layers # are evicted and the child is unarchived. When the child is unarchived, # itself and the parent update their heatmaps to contain layers needed by the From 4af0b9b3877b27f18b00712d0aff1e56475ed3ec Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 16 Apr 2025 19:10:15 +0200 Subject: [PATCH 18/55] pageserver: don't recompress images in `ImageLayerInner::filter()` (#11592) ## Problem During shard ancestor compaction, we currently recompress all page images as we move them into a new layer file. This is expensive and unnecessary. Resolves #11562. Requires #11607. ## Summary of changes Pass through compressed page images in `ImageLayerInner::filter()`. --- pageserver/src/tenant/blob_io.rs | 28 ++++++++ .../src/tenant/storage_layer/image_layer.rs | 69 +++++++++++++++++-- pageserver/src/tenant/vectored_blob_io.rs | 1 - 3 files changed, 93 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index d1dd105b13..3483a9f31e 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -446,6 +446,34 @@ impl BlobWriter { }; (srcbuf, res.map(|_| (offset, compression_info))) } + + /// Writes a raw blob containing both header and data, returning its offset. + pub(crate) async fn write_blob_raw( + &mut self, + raw_with_header: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result) { + // Verify the header, to ensure we don't write invalid/corrupt data. 
+ let header = match Header::decode(&raw_with_header) { + Ok(header) => header, + Err(err) => return (raw_with_header, Err(err)), + }; + if raw_with_header.len() != header.total_len() { + let header_total_len = header.total_len(); + let raw_len = raw_with_header.len(); + return ( + raw_with_header, + Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("header length mismatch: {header_total_len} != {raw_len}"), + )), + ); + } + + let offset = self.offset; + let (raw_with_header, result) = self.write_all(raw_with_header, ctx).await; + (raw_with_header, result.map(|_| offset)) + } } impl BlobWriter { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3744d615f2..c2de20b5b3 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -559,11 +559,12 @@ impl ImageLayerInner { let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { - let img_buf = meta.read(&view).await?; - + // Just read the raw header+data and pass it through to the target layer, without + // decoding and recompressing it. + let raw = meta.raw_with_header(&view); key_count += 1; writer - .put_image(meta.meta.key, img_buf.into_bytes(), ctx) + .put_image_raw(meta.meta.key, raw.into_bytes(), ctx) .await .context(format!("Storing key {}", meta.meta.key))?; } @@ -853,6 +854,41 @@ impl ImageLayerWriterInner { Ok(()) } + /// + /// Write the next image to the file, as a raw blob header and data. + /// + /// The page versions must be appended in blknum order. + /// + async fn put_image_raw( + &mut self, + key: Key, + raw_with_header: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + ensure!(self.key_range.contains(&key)); + + // NB: we don't update the (un)compressed metrics, since we can't determine them without + // decompressing the image. This seems okay. + self.num_keys += 1; + + let (_, res) = self + .blob_writer + .write_blob_raw(raw_with_header.slice_len(), ctx) + .await; + let offset = res?; + + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + self.tree.append(&keybuf, offset)?; + + #[cfg(feature = "testing")] + { + self.last_written_key = key; + } + + Ok(()) + } + /// /// Finish writing the image layer. /// @@ -888,7 +924,13 @@ impl ImageLayerWriterInner { crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED .inc_by(self.uncompressed_bytes_eligible); crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen); - crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + + // NB: filter() may pass through raw pages from a different layer, without looking at + // whether these are compressed or not. We don't track metrics for these, so avoid + // increasing `COMPRESSION_IMAGE_OUTPUT_BYTES` in this case too. + if self.uncompressed_bytes > 0 { + crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + }; let mut file = self.blob_writer.into_inner(); @@ -1034,6 +1076,25 @@ impl ImageLayerWriter { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } + /// + /// Write the next value to the file, as a raw header and data. This allows passing through a + /// raw, potentially compressed image from a different layer file without recompressing it. + /// + /// The page versions must be appended in blknum order. 
+ /// + pub async fn put_image_raw( + &mut self, + key: Key, + raw_with_header: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_image_raw(key, raw_with_header, ctx) + .await + } + /// Estimated size of the image layer. pub(crate) fn estimated_size(&self) -> u64 { let inner = self.inner.as_ref().unwrap(); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 8e535a55d7..f9a44fe4ca 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -151,7 +151,6 @@ impl VectoredBlob { } /// Returns the raw blob including header. - #[allow(unused)] pub(crate) fn raw_with_header<'a>(&self, buf: &BufView<'a>) -> BufView<'a> { buf.view(self.header_start..self.end) } From c0022361458fc4b674eeb9000f7e6f63186156d5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Apr 2025 12:54:51 -0500 Subject: [PATCH 19/55] Remove compute_ctl authorization bypass if testing feature was enable (#11596) We want to exercise the authorization middleware in our regression tests. Signed-off-by: Tristan Partin --- compute_tools/src/http/extract/mod.rs | 1 + .../src/http/middleware/authorize.rs | 14 +------- control_plane/src/bin/neon_local.rs | 2 +- docker-compose/README.md | 11 +++++- .../compute_wrapper/private-key.pem | 3 ++ docker-compose/compute_wrapper/public-key.der | Bin 0 -> 44 bytes docker-compose/compute_wrapper/public-key.pem | 3 ++ .../var/db/postgres/configs/config.json | 14 +++++++- test_runner/fixtures/endpoint/http.py | 34 +++++++++++++++--- test_runner/fixtures/neon_cli.py | 12 +++++++ test_runner/fixtures/neon_fixtures.py | 9 +++-- 11 files changed, 80 insertions(+), 23 deletions(-) create mode 100644 docker-compose/compute_wrapper/private-key.pem create mode 100644 docker-compose/compute_wrapper/public-key.der create mode 100644 docker-compose/compute_wrapper/public-key.pem diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs index 589681cfe2..93319c36c8 100644 --- a/compute_tools/src/http/extract/mod.rs +++ b/compute_tools/src/http/extract/mod.rs @@ -6,4 +6,5 @@ pub(crate) mod request_id; pub(crate) use json::Json; pub(crate) use path::Path; pub(crate) use query::Query; +#[allow(unused)] pub(crate) use request_id::RequestId; diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index e6c3269b15..2d0f411d7a 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -13,7 +13,7 @@ use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; use tower_http::auth::AsyncAuthorizeRequest; use tracing::{debug, warn}; -use crate::http::{JsonResponse, extract::RequestId}; +use crate::http::JsonResponse; #[derive(Clone, Debug)] pub(in crate::http) struct Authorize { @@ -52,18 +52,6 @@ impl AsyncAuthorizeRequest for Authorize { let validation = self.validation.clone(); Box::pin(async move { - let request_id = request.extract_parts::().await.unwrap(); - - // TODO(tristan957): Remove this stanza after teaching neon_local - // and the regression tests to use a JWT + JWKS. 
- // - // https://github.com/neondatabase/neon/issues/11316 - if cfg!(feature = "testing") { - warn!(%request_id, "Skipping compute_ctl authorization check"); - - return Ok(request); - } - let TypedHeader(Authorization(bearer)) = request .extract_parts::>>() .await diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 950b264163..1ff4295438 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1544,7 +1544,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; let jwt = endpoint.generate_jwt()?; - println!("{jwt}"); + print!("{jwt}"); } } diff --git a/docker-compose/README.md b/docker-compose/README.md index 648e4ca030..8232027adc 100644 --- a/docker-compose/README.md +++ b/docker-compose/README.md @@ -1,4 +1,3 @@ - # Example docker compose configuration The configuration in this directory is used for testing Neon docker images: it is @@ -8,3 +7,13 @@ you can experiment with a miniature Neon system, use `cargo neon` rather than co This configuration does not start the storage controller, because the controller needs a way to reconfigure running computes, and no such thing exists in this setup. +## Generating the JWKS for a compute + +```shell +openssl genpkey -algorithm Ed25519 -out private-key.pem +openssl pkey -in private-key.pem -pubout -out public-key.pem +openssl pkey -pubin -inform pem -in public-key.pem -pubout -outform der -out public-key.der +key="$(xxd -plain -cols 32 -s -32 public-key.der)" +key_id="$(printf '%s' "$key" | sha256sum | awk '{ print $1 }' | basenc --base64url --wrap=0)" +x="$(printf '%s' "$key" | basenc --base64url --wrap=0)" +``` diff --git a/docker-compose/compute_wrapper/private-key.pem b/docker-compose/compute_wrapper/private-key.pem new file mode 100644 index 0000000000..9bfbfebe27 --- /dev/null +++ b/docker-compose/compute_wrapper/private-key.pem @@ -0,0 +1,3 @@ +-----BEGIN PRIVATE KEY----- +MC4CAQAwBQYDK2VwBCIEIOmnRbzt2AJ0d+S3aU1hiYOl/tXpvz1FmWBfwHYBgOma +-----END PRIVATE KEY----- diff --git a/docker-compose/compute_wrapper/public-key.der b/docker-compose/compute_wrapper/public-key.der new file mode 100644 index 0000000000000000000000000000000000000000..1b25e50055797b2543cc766196c327f693aa0cfa GIT binary patch literal 44 zcmV+{0Mq|4Dli2G11n{410etnjT)ErG;RQQ``6SOPq#d!r?JUBJ1kA)IL9E~mwKH# CPZ3@K literal 0 HcmV?d00001 diff --git a/docker-compose/compute_wrapper/public-key.pem b/docker-compose/compute_wrapper/public-key.pem new file mode 100644 index 0000000000..344450cb3d --- /dev/null +++ b/docker-compose/compute_wrapper/public-key.pem @@ -0,0 +1,3 @@ +-----BEGIN PUBLIC KEY----- +MCowBQYDK2VwAyEADY0al/U0bgB3+9fUGk+3PKWnsck9OyxN5DjHIN6Xep0= +-----END PUBLIC KEY----- diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json index 3ddf96512a..21caf3800c 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json +++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json @@ -142,7 +142,19 @@ }, "compute_ctl_config": { "jwks": { - "keys": [] + "keys": [ + { + "use": "sig", + "key_ops": [ + "verify" + ], + "alg": "EdDSA", + "kid": "ZGIxMzAzOGY0YWQwODk2ODU1MTk1NzMxMDFkYmUyOWU2NzZkOWNjNjMyMGRkZGJjOWY0MjdjYWVmNzE1MjUyOAo=", + "kty": "OKP", + "crv": "Ed25519", + "x": "MGQ4ZDFhOTdmNTM0NmUwMDc3ZmJkN2Q0MWE0ZmI3M2NhNWE3YjFjOTNkM2IyYzRkZTQzOGM3MjBkZTk3N2E5ZAo=" + } + ] 
} } } diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 4073ebc3b9..652c38f5c3 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,33 +1,59 @@ from __future__ import annotations import urllib.parse +from typing import TYPE_CHECKING, final import requests from requests.adapters import HTTPAdapter +from requests.auth import AuthBase +from typing_extensions import override from fixtures.log_helper import log +if TYPE_CHECKING: + from requests import PreparedRequest + +@final +class BearerAuth(AuthBase): + """ + Auth implementation for bearer authorization in HTTP requests through the + requests HTTP client library. + """ + + def __init__(self, jwt: str): + self.__jwt = jwt + + @override + def __call__(self, request: PreparedRequest) -> PreparedRequest: + request.headers["Authorization"] = "Bearer " + self.__jwt + return request + + +@final class EndpointHttpClient(requests.Session): def __init__( self, external_port: int, internal_port: int, + jwt: str, ): super().__init__() self.external_port: int = external_port self.internal_port: int = internal_port + self.auth = BearerAuth(jwt) self.mount("http://", HTTPAdapter()) def dbs_and_roles(self): - res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles") + res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth) res.raise_for_status() return res.json() def database_schema(self, database: str): res = self.get( - f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}" + f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", + auth=self.auth, ) res.raise_for_status() return res.text @@ -58,13 +84,13 @@ class EndpointHttpClient(requests.Session): # Current compute status. def status(self): - res = self.get(f"http://localhost:{self.external_port}/status") + res = self.get(f"http://localhost:{self.external_port}/status", auth=self.auth) res.raise_for_status() return res.json() # Compute startup-related metrics. def metrics_json(self): - res = self.get(f"http://localhost:{self.external_port}/metrics.json") + res = self.get(f"http://localhost:{self.external_port}/metrics.json", auth=self.auth) res.raise_for_status() return res.json() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 5f5626fb98..80852b610b 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -535,6 +535,18 @@ class NeonLocalCli(AbstractNeonCli): res.check_returncode() return res + def endpoint_generate_jwt(self, endpoint_id: str) -> str: + """ + Generate a JWT for making requests to the endpoint's external HTTP + server. + """ + args = ["endpoint", "generate-jwt", endpoint_id] + + cmd = self.raw_cli(args) + cmd.check_returncode() + + return cmd.stdout + def endpoint_start( self, endpoint_id: str, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 13bd74e05d..e70ddc8e66 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4110,13 +4110,14 @@ class Endpoint(PgProtocol, LogUtils): # try and stop the same process twice, as stop() is called by test teardown and # potentially by some __del__ chains in other threads. 
self._running = threading.Semaphore(0) + self.__jwt: str | None = None - def http_client( - self, auth_token: str | None = None, retries: Retry | None = None - ) -> EndpointHttpClient: + def http_client(self, retries: Retry | None = None) -> EndpointHttpClient: + assert self.__jwt is not None return EndpointHttpClient( external_port=self.external_http_port, internal_port=self.internal_http_port, + jwt=self.__jwt, ) def create( @@ -4200,6 +4201,8 @@ class Endpoint(PgProtocol, LogUtils): self.config(config_lines) + self.__jwt = self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id) + return self def start( From fc233794f62d8161f92152bfd4ab483005cb744b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 16 Apr 2025 19:37:17 +0100 Subject: [PATCH 20/55] fix(proxy): make sure that sql-over-http is TLS aware (#11612) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed that while auth-broker -> local-proxy is TLS aware, and TCP proxy -> postgres is TLS aware, HTTP proxy -> postgres is not 😅 --- proxy/src/serverless/backend.rs | 4 +++- proxy/src/serverless/conn_pool.rs | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index e40aa024a8..13058f08f1 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -561,8 +561,10 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(compute_config.timeout); + let mk_tls = + crate::tls::postgres_rustls::MakeRustlsConnect::new(compute_config.tls.clone()); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let res = config.connect(postgres_client::NoTls).await; + let res = config.connect(mk_tls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 516d474a11..409056a6a9 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -6,7 +6,7 @@ use std::task::{Poll, ready}; use futures::Future; use futures::future::poll_fn; use postgres_client::AsyncMessage; -use postgres_client::tls::NoTlsStream; +use postgres_client::tls::MakeTlsConnect; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; @@ -26,6 +26,9 @@ use super::conn_pool_lib::{ use crate::context::RequestContext; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::Metrics; +use crate::tls::postgres_rustls::MakeRustlsConnect; + +type TlsStream = >::Stream; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -58,7 +61,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: postgres_client::Connection, + mut connection: postgres_client::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { From cf2e695f497d2a4417622d2d396c8850f639c6a1 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 16 Apr 2025 14:51:48 -0400 Subject: [PATCH 21/55] feat(pageserver): gc-compaction meta statistics (#11601) ## Problem We currently only have gc-compaction statistics for each single sub-compaction job. ## Summary of changes Add meta statistics across all sub-compaction jobs scheduled. 
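To make the meta-statistics lifecycle concrete, here is a simplified, standalone analogue of the bookkeeping described above: sub-compaction jobs report success or failure into a shared struct, `finalize()` computes the wall-clock duration, and the aggregate is emitted as JSON once the trailing notify step runs. This is a sketch assuming `serde`/`serde_json` are available, not the pageserver's `GcCompactionMetaStatistics` itself.

```rust
use std::time::SystemTime;

use serde::Serialize;

/// Simplified stand-in for the meta-statistics added above; field names follow
/// the patch, but this is a sketch rather than the pageserver's actual type.
#[derive(Default, Serialize)]
struct MetaStats {
    total_sub_compaction_jobs: usize,
    succeeded_sub_compaction_jobs: usize,
    failed_sub_compaction_jobs: usize,
    duration_secs: f64,
    #[serde(skip)]
    start_time: Option<SystemTime>,
}

impl MetaStats {
    /// Compute the wall-clock duration once the last sub-job has reported in.
    fn finalize(&mut self) {
        if let Some(start) = self.start_time {
            if let Ok(elapsed) = SystemTime::now().duration_since(start) {
                self.duration_secs = elapsed.as_secs_f64();
            }
        }
    }
}

fn main() {
    let mut stats = MetaStats {
        total_sub_compaction_jobs: 3,
        start_time: Some(SystemTime::now()),
        ..Default::default()
    };

    // Each sub-compaction job bumps the success or failure counter as it finishes.
    stats.succeeded_sub_compaction_jobs += 2;
    stats.failed_sub_compaction_jobs += 1;

    // When the trailing notify item runs, the aggregate is finalized and logged as JSON.
    stats.finalize();
    println!("{}", serde_json::to_string(&stats).unwrap());
}
```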
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 121 ++++++++++++++++++- 1 file changed, 117 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 92b24a73c9..ff85a33055 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,7 +7,7 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::{Duration, Instant, SystemTime}; use super::layer_manager::LayerManager; use super::{ @@ -77,7 +77,7 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5; /// shard split, which gets expensive for large tenants. const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3; -#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +#[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize)] pub struct GcCompactionJobId(pub usize); impl std::fmt::Display for GcCompactionJobId { @@ -105,6 +105,43 @@ pub enum GcCompactionQueueItem { Notify(GcCompactionJobId, Option), } +/// Statistics for gc-compaction meta jobs, which contains several sub compaction jobs. +#[derive(Debug, Clone, Serialize, Default)] +pub struct GcCompactionMetaStatistics { + /// The total number of sub compaction jobs. + pub total_sub_compaction_jobs: usize, + /// The total number of sub compaction jobs that failed. + pub failed_sub_compaction_jobs: usize, + /// The total number of sub compaction jobs that succeeded. + pub succeeded_sub_compaction_jobs: usize, + /// The layer size before compaction. + pub before_compaction_layer_size: u64, + /// The layer size after compaction. + pub after_compaction_layer_size: u64, + /// The start time of the meta job. + pub start_time: Option, + /// The end time of the meta job. + pub end_time: Option, + /// The duration of the meta job. + pub duration_secs: f64, + /// The id of the meta job. + pub meta_job_id: GcCompactionJobId, + /// The LSN below which the layers are compacted, used to compute the statistics. + pub below_lsn: Lsn, +} + +impl GcCompactionMetaStatistics { + fn finalize(&mut self) { + let end_time = SystemTime::now(); + if let Some(start_time) = self.start_time { + if let Ok(duration) = end_time.duration_since(start_time) { + self.duration_secs = duration.as_secs_f64(); + } + } + self.end_time = Some(end_time); + } +} + impl GcCompactionQueueItem { pub fn into_compact_info_resp( self, @@ -142,6 +179,7 @@ struct GcCompactionQueueInner { queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, guards: HashMap, last_id: GcCompactionJobId, + meta_statistics: Option, } impl GcCompactionQueueInner { @@ -173,6 +211,7 @@ impl GcCompactionQueue { queued: VecDeque::new(), guards: HashMap::new(), last_id: GcCompactionJobId(0), + meta_statistics: None, }), consumer_lock: tokio::sync::Mutex::new(()), } @@ -357,6 +396,23 @@ impl GcCompactionQueue { Ok(()) } + async fn collect_layer_below_lsn( + &self, + timeline: &Arc, + lsn: Lsn, + ) -> Result { + let guard = timeline.layers.read().await; + let layer_map = guard.layer_map()?; + let layers = layer_map.iter_historic_layers().collect_vec(); + let mut size = 0; + for layer in layers { + if layer.lsn_range.start <= lsn { + size += layer.file_size(); + } + } + Ok(size) + } + /// Notify the caller the job has finished and unblock GC. 
fn notify_and_unblock(&self, id: GcCompactionJobId) { info!("compaction job id={} finished", id); @@ -366,6 +422,16 @@ impl GcCompactionQueue { let _ = tx.send(()); } } + if let Some(ref meta_statistics) = guard.meta_statistics { + if meta_statistics.meta_job_id == id { + if let Ok(stats) = serde_json::to_string(&meta_statistics) { + info!( + "gc-compaction meta statistics for job id = {}: {}", + id, stats + ); + } + } + } } fn clear_running_job(&self) { @@ -405,7 +471,11 @@ impl GcCompactionQueue { let mut pending_tasks = Vec::new(); // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN. - let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max(); + let expected_l2_lsn = jobs + .iter() + .map(|job| job.compact_lsn_range.end) + .max() + .unwrap(); for job in jobs { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. @@ -430,9 +500,13 @@ impl GcCompactionQueue { if !auto { pending_tasks.push(GcCompactionQueueItem::Notify(id, None)); } else { - pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn)); + pending_tasks.push(GcCompactionQueueItem::Notify(id, Some(expected_l2_lsn))); } + let layer_size = self + .collect_layer_below_lsn(timeline, expected_l2_lsn) + .await?; + { let mut guard = self.inner.lock().unwrap(); let mut tasks = Vec::new(); @@ -444,7 +518,16 @@ impl GcCompactionQueue { for item in tasks { guard.queued.push_front(item); } + guard.meta_statistics = Some(GcCompactionMetaStatistics { + meta_job_id: id, + start_time: Some(SystemTime::now()), + before_compaction_layer_size: layer_size, + below_lsn: expected_l2_lsn, + total_sub_compaction_jobs: jobs_len, + ..Default::default() + }); } + info!( "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len @@ -573,6 +656,10 @@ impl GcCompactionQueue { Err(err) => { warn!(%err, "failed to run gc-compaction subcompaction job"); self.clear_running_job(); + let mut guard = self.inner.lock().unwrap(); + if let Some(ref mut meta_statistics) = guard.meta_statistics { + meta_statistics.failed_sub_compaction_jobs += 1; + } return Err(err); } }; @@ -582,8 +669,34 @@ impl GcCompactionQueue { // we need to clean things up before returning from the function. yield_for_l0 = true; } + { + let mut guard = self.inner.lock().unwrap(); + if let Some(ref mut meta_statistics) = guard.meta_statistics { + meta_statistics.succeeded_sub_compaction_jobs += 1; + } + } } GcCompactionQueueItem::Notify(id, l2_lsn) => { + let below_lsn = { + let mut guard = self.inner.lock().unwrap(); + if let Some(ref mut meta_statistics) = guard.meta_statistics { + meta_statistics.below_lsn + } else { + Lsn::INVALID + } + }; + let layer_size = if below_lsn != Lsn::INVALID { + self.collect_layer_below_lsn(timeline, below_lsn).await? 
+ } else { + 0 + }; + { + let mut guard = self.inner.lock().unwrap(); + if let Some(ref mut meta_statistics) = guard.meta_statistics { + meta_statistics.after_compaction_layer_size = layer_size; + meta_statistics.finalize(); + } + } self.notify_and_unblock(id); if let Some(l2_lsn) = l2_lsn { let current_l2_lsn = timeline From b9b25e13a06cb1caa42a0e183dfd03d9109d579a Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Wed, 16 Apr 2025 22:03:23 +0300 Subject: [PATCH 22/55] feat(proxy): Return prefixed errors to testodrome (#11561) Testodrome measures uptime based on the failed requests and errors. In case of testodrome request we send back error based on the service. This will help us distinguish error types in testodrome and rely on the uptime SLI. --- proxy/src/binary/pg_sni_router.rs | 2 +- proxy/src/console_redirect_proxy.rs | 4 +- proxy/src/context/mod.rs | 10 +-- proxy/src/proxy/handshake.rs | 6 +- proxy/src/proxy/mod.rs | 9 +- proxy/src/serverless/mod.rs | 13 +-- proxy/src/serverless/websocket.rs | 1 - proxy/src/stream.rs | 122 ++++++++++++++++++++++------ 8 files changed, 116 insertions(+), 51 deletions(-) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 1aa290399c..aef5c9383e 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -258,7 +258,7 @@ async fn ssl_handshake( "unexpected startup packet, rejecting connection" ); stream - .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User, None) .await? } } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 1156545f34..0f2c3def0d 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -222,7 +222,7 @@ pub(crate) async fn handle_client( { Ok(auth_result) => auth_result, Err(e) => { - return stream.throw_error(e).await?; + return stream.throw_error(e, Some(ctx)).await?; } }; @@ -238,7 +238,7 @@ pub(crate) async fn handle_client( config.wake_compute_retry_config, &config.connect_to_compute, ) - .or_else(|e| stream.throw_error(e)) + .or_else(|e| stream.throw_error(e, Some(ctx))) .await?; let cancellation_handler_clone = Arc::clone(&cancellation_handler); diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 7c1a6206c1..5f649d2b21 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -63,7 +63,7 @@ struct RequestContextInner { success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, - testodrome_query_id: Option, + testodrome_query_id: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. 
@@ -219,7 +219,7 @@ impl RequestContext { for option in options_str.split_whitespace() { if option.starts_with("neon_query_id:") { if let Some(value) = option.strip_prefix("neon_query_id:") { - this.set_testodrome_id(value.to_string()); + this.set_testodrome_id(value.into()); break; } } @@ -272,7 +272,7 @@ impl RequestContext { .set_user_agent(user_agent); } - pub(crate) fn set_testodrome_id(&self, query_id: String) { + pub(crate) fn set_testodrome_id(&self, query_id: SmolStr) { self.0 .try_lock() .expect("should not deadlock") @@ -378,7 +378,7 @@ impl RequestContext { .accumulated() } - pub(crate) fn get_testodrome_id(&self) -> Option { + pub(crate) fn get_testodrome_id(&self) -> Option { self.0 .try_lock() .expect("should not deadlock") @@ -447,7 +447,7 @@ impl RequestContextInner { self.user = Some(user); } - fn set_testodrome_id(&mut self, query_id: String) { + fn set_testodrome_id(&mut self, query_id: SmolStr) { self.testodrome_query_id = Some(query_id); } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 2582e4c069..c05031ad97 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -196,7 +196,11 @@ pub(crate) async fn handshake( // OR we didn't provide it at all (for dev purposes). if tls.is_some() { return stream - .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .throw_error_str( + ERR_INSECURE_CONNECTION, + crate::error::ErrorKind::User, + None, + ) .await?; } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 2e7d332a8b..cf331b8bc0 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -329,7 +329,7 @@ pub(crate) async fn handle_client( let user_info = match result { Ok(user_info) => user_info, - Err(e) => stream.throw_error(e).await?, + Err(e) => stream.throw_error(e, Some(ctx)).await?, }; let user = user_info.get_user().to_owned(); @@ -349,7 +349,10 @@ pub(crate) async fn handle_client( let app = params.get("application_name"); let params_span = tracing::info_span!("", ?user, ?db, ?app); - return stream.throw_error(e).instrument(params_span).await?; + return stream + .throw_error(e, Some(ctx)) + .instrument(params_span) + .await?; } }; @@ -374,7 +377,7 @@ pub(crate) async fn handle_client( config.wake_compute_retry_config, &config.connect_to_compute, ) - .or_else(|e| stream.throw_error(e)) + .or_else(|e| stream.throw_error(e, Some(ctx))) .await?; let cancellation_handler_clone = Arc::clone(&cancellation_handler); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 9c11f32083..6f24ad3dec 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -434,17 +434,6 @@ async fn request_handler( .map(Into::into), ); - let testodrome_id = request - .headers() - .get("X-Neon-Query-ID") - .and_then(|value| value.to_str().ok()) - .map(|s| s.to_string()); - - if let Some(query_id) = testodrome_id { - info!(parent: &ctx.span(), "testodrome query ID: {query_id}"); - ctx.set_testodrome_id(query_id); - } - let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); @@ -491,7 +480,7 @@ async fn request_handler( if let Some(query_id) = testodrome_id { info!(parent: &ctx.span(), "testodrome query ID: {query_id}"); - ctx.set_testodrome_id(query_id); + ctx.set_testodrome_id(query_id.into()); } sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 01d37d0eec..7fc48105c5 100644 --- a/proxy/src/serverless/websocket.rs +++ 
b/proxy/src/serverless/websocket.rs @@ -157,7 +157,6 @@ pub(crate) async fn serve_websocket( match res { Err(e) => { - // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); Err(e.into()) } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index ace27a7284..360550b0ac 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -6,11 +6,13 @@ use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; use rustls::ServerConfig; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; +use crate::control_plane::messages::ColdStartInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::Metrics; use crate::tls::TlsServerEndPoint; @@ -100,6 +102,44 @@ impl ReportableError for ReportedError { } } +#[derive(Serialize, Deserialize, Debug)] +enum ErrorTag { + #[serde(rename = "proxy")] + Proxy, + #[serde(rename = "compute")] + Compute, + #[serde(rename = "client")] + Client, + #[serde(rename = "controlplane")] + ControlPlane, + #[serde(rename = "other")] + Other, +} + +impl From for ErrorTag { + fn from(error_kind: ErrorKind) -> Self { + match error_kind { + ErrorKind::User => Self::Client, + ErrorKind::ClientDisconnect => Self::Client, + ErrorKind::RateLimit => Self::Proxy, + ErrorKind::ServiceRateLimit => Self::Proxy, // considering rate limit as proxy error for SLI + ErrorKind::Quota => Self::Proxy, + ErrorKind::Service => Self::Proxy, + ErrorKind::ControlPlane => Self::ControlPlane, + ErrorKind::Postgres => Self::Other, + ErrorKind::Compute => Self::Compute, + } + } +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(rename_all = "snake_case")] +struct ProbeErrorData { + tag: ErrorTag, + msg: String, + cold_start_info: Option, +} + impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. pub(crate) fn write_message_noflush( @@ -125,26 +165,54 @@ impl PqStream { Ok(self) } - /// Write the error message using [`Self::write_message`], then re-throw it. + /// Writes message with the given error kind to the stream. + /// Used only for probe queries + async fn write_format_message( + &mut self, + msg: &str, + error_kind: ErrorKind, + ctx: Option<&crate::context::RequestContext>, + ) -> String { + let formatted_msg = match ctx { + Some(ctx) if ctx.get_testodrome_id().is_some() => { + serde_json::to_string(&ProbeErrorData { + tag: ErrorTag::from(error_kind), + msg: msg.to_string(), + cold_start_info: Some(ctx.cold_start_info()), + }) + .unwrap_or_default() + } + _ => msg.to_string(), + }; + + // already error case, ignore client IO error + self.write_message(&BeMessage::ErrorResponse(&formatted_msg, None)) + .await + .inspect_err(|e| debug!("write_message failed: {e}")) + .ok(); + + formatted_msg + } + + /// Write the error message using [`Self::write_format_message`], then re-throw it. /// Allowing string literals is safe under the assumption they might not contain any runtime info. /// This method exists due to `&str` not implementing `Into`. + /// If `ctx` is provided and has testodrome_id set, error messages will be prefixed according to error kind. 
pub async fn throw_error_str( &mut self, msg: &'static str, error_kind: ErrorKind, + ctx: Option<&crate::context::RequestContext>, ) -> Result { - // TODO: only log this for actually interesting errors - tracing::info!( - kind = error_kind.to_metric_label(), - msg, - "forwarding error to user" - ); + self.write_format_message(msg, error_kind, ctx).await; - // already error case, ignore client IO error - self.write_message(&BeMessage::ErrorResponse(msg, None)) - .await - .inspect_err(|e| debug!("write_message failed: {e}")) - .ok(); + if error_kind != ErrorKind::RateLimit && error_kind != ErrorKind::User { + tracing::info!( + kind = error_kind.to_metric_label(), + msg, + "forwarding error to user" + ); + } Err(ReportedError { source: anyhow::anyhow!(msg), @@ -152,26 +220,28 @@ impl PqStream { }) } - /// Write the error message using [`Self::write_message`], then re-throw it. + /// Write the error message using [`Self::write_format_message`], then re-throw it. /// Trait [`UserFacingError`] acts as an allowlist for error types. - pub(crate) async fn throw_error(&mut self, error: E) -> Result + /// If `ctx` is provided and has testodrome_id set, error messages will be prefixed according to error kind. + pub(crate) async fn throw_error( + &mut self, + error: E, + ctx: Option<&crate::context::RequestContext>, + ) -> Result where E: UserFacingError + Into, { let error_kind = error.get_error_kind(); let msg = error.to_string_client(); - tracing::info!( - kind=error_kind.to_metric_label(), - error=%error, - msg, - "forwarding error to user" - ); - - // already error case, ignore client IO error - self.write_message(&BeMessage::ErrorResponse(&msg, None)) - .await - .inspect_err(|e| debug!("write_message failed: {e}")) - .ok(); + self.write_format_message(&msg, error_kind, ctx).await; + if error_kind != ErrorKind::RateLimit && error_kind != ErrorKind::User { + tracing::info!( + kind=error_kind.to_metric_label(), + error=%error, + msg, + "forwarding error to user", + ); + } Err(ReportedError { source: anyhow::anyhow!(error), From ec9079f483a174bead40c38624af8a505458cd83 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Apr 2025 20:05:21 +0000 Subject: [PATCH 23/55] Allow unwrap() in tests when clippy::unwrap_used is denied (#11616) ## Problem The proxy denies using `unwrap()`s in regular code, but we want to use it in test code and so have to allow it for each test block. ## Summary of changes Set `allow-unwrap-in-tests = true` in clippy.toml and remove all exceptions. 
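A small sketch of what this buys in practice: with `allow-unwrap-in-tests = true` in `clippy.toml`, a crate that denies `clippy::unwrap_used` (shown below via a crate-level attribute, purely for illustration) no longer needs per-module `#[expect(...)]` escapes in its test code.

```rust
// Illustrative only: the lint is enabled here with a crate attribute; how the
// proxy actually enables it does not change the clippy.toml behaviour.
#![deny(clippy::unwrap_used)]

pub fn parse_port(s: &str) -> Option<u16> {
    s.parse().ok()
}

#[cfg(test)]
mod tests {
    use super::parse_port;

    #[test]
    fn parses_valid_port() {
        // With allow-unwrap-in-tests = true, clippy skips unwrap_used in test
        // code, so no #[expect(clippy::unwrap_used)] is needed on the module.
        assert_eq!(parse_port("5432").unwrap(), 5432);
    }
}
```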
--- clippy.toml | 2 ++ proxy/src/auth/backend/jwt.rs | 1 - proxy/src/auth/credentials.rs | 1 - proxy/src/cache/endpoints.rs | 1 - proxy/src/cache/project_info.rs | 1 - proxy/src/context/parquet.rs | 1 - proxy/src/intern.rs | 1 - proxy/src/logging.rs | 1 - proxy/src/protocol2.rs | 1 - proxy/src/proxy/copy_bidirectional.rs | 1 - proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/rate_limiter/leaky_bucket.rs | 2 +- proxy/src/rate_limiter/limit_algorithm/aimd.rs | 1 - proxy/src/rate_limiter/limiter.rs | 1 - proxy/src/sasl/messages.rs | 1 - proxy/src/scram/messages.rs | 1 - proxy/src/scram/mod.rs | 1 - proxy/src/scram/secret.rs | 1 - proxy/src/serverless/conn_pool.rs | 1 - proxy/src/serverless/json.rs | 1 - proxy/src/serverless/local_conn_pool.rs | 1 - proxy/src/serverless/sql_over_http.rs | 1 - proxy/src/serverless/websocket.rs | 1 - proxy/src/url.rs | 1 - proxy/src/usage_metrics.rs | 1 - 25 files changed, 4 insertions(+), 24 deletions(-) diff --git a/clippy.toml b/clippy.toml index 4c0c04f9a1..408232488c 100644 --- a/clippy.toml +++ b/clippy.toml @@ -12,3 +12,5 @@ disallowed-macros = [ # cannot disallow this, because clippy finds used from tokio macros #"tokio::pin", ] + +allow-unwrap-in-tests = true diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 942f1e13d1..44a6a42665 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -776,7 +776,6 @@ impl From<&jose_jwk::Key> for KeyType { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::future::IntoFuture; use std::net::SocketAddr; diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c1b7718e4f..c55af325e3 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -253,7 +253,6 @@ fn project_name_valid(name: &str) -> bool { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use ComputeUserInfoParseError::*; use serde_json::json; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 8ec1a4648b..3c88e07484 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -259,7 +259,6 @@ impl EndpointsCache { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index e153e9f61f..60678b034d 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -585,7 +585,6 @@ impl Cache for ProjectInfoCacheImpl { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::*; use crate::scram::ServerSecret; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index bfab5f34f9..f6250bcd17 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -416,7 +416,6 @@ async fn upload_parquet( } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::net::Ipv4Addr; use std::num::NonZeroUsize; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 0d1382679c..d7e39ebaf4 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -227,7 +227,6 @@ impl From for AccountIdInt { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::sync::OnceLock; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 454fe81357..b83b03bc4f 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1032,7 +1032,6 @@ impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { } #[cfg(test)] -#[allow(clippy::unwrap_used)] mod tests { use std::marker::PhantomData; use std::sync::{Arc, Mutex, MutexGuard}; diff 
--git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index b0603da379..ecd9882f53 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -400,7 +400,6 @@ impl NetworkEndianIpv6 { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncReadExt; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 6f8b972348..97f8d7c6af 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -262,7 +262,6 @@ impl CopyBuffer { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncWriteExt; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 2268e60d25..9a6864c33e 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -1,5 +1,5 @@ //! A group of high-level tests for connection establishing logic and auth. -#![allow(clippy::unimplemented, clippy::unwrap_used)] +#![allow(clippy::unimplemented)] mod mitm; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index b3853d48e4..4f27c6faef 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -83,7 +83,7 @@ impl From for utils::leaky_bucket::LeakyBucketConfig { } #[cfg(test)] -#[allow(clippy::float_cmp, clippy::unwrap_used)] +#[allow(clippy::float_cmp)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 04e136b6d5..3000cc4c2a 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -63,7 +63,6 @@ impl LimitAlgorithm for Aimd { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 71e2a92da6..21eaa6739b 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -259,7 +259,6 @@ impl BucketRateLimiter { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::hash::BuildHasherDefault; use std::time::Duration; diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 4922ece615..7f2f3a761c 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -51,7 +51,6 @@ impl<'a> ServerMessage<&'a str> { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 7b0b861ce9..e071417dab 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -185,7 +185,6 @@ impl fmt::Debug for OwnedServerFirstMessage { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 24f991d4d9..4f764c6087 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -57,7 +57,6 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::threadpool::ThreadPool; use super::{Exchange, ServerSecret}; diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index eb21b26ab4..8c6a08d432 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -72,7 +72,6 @@ impl ServerSecret { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 409056a6a9..87176ff7d6 100644 --- 
a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -189,7 +189,6 @@ impl ClientDataRemote { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::sync::atomic::AtomicBool; diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 7235fb6079..1afc10359f 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -256,7 +256,6 @@ fn pg_array_parse_inner( } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 3282c0ebde..1d9b35f41d 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -367,7 +367,6 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use ed25519_dalek::SigningKey; use typed_json::json; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 972bf58d91..7fb39553f9 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1209,7 +1209,6 @@ impl Discard<'_> { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 7fc48105c5..8648a94869 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -177,7 +177,6 @@ pub(crate) async fn serve_websocket( } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::pin::pin; diff --git a/proxy/src/url.rs b/proxy/src/url.rs index d73a84057a..270cd7c24d 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -50,7 +50,6 @@ impl std::fmt::Display for ApiUrl { } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 2b27dc5c76..115b958c54 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -497,7 +497,6 @@ async fn upload_backup_events( } #[cfg(test)] -#[expect(clippy::unwrap_used)] mod tests { use std::fs; use std::io::BufReader; From 79083de61c0255a960558ba2f728ac30cff0c066 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Apr 2025 18:14:24 -0500 Subject: [PATCH 24/55] Remove forward compatibility hacks related to compute_ctl auth (#11621) These various hacks were needed for the forward compatibility tests. Enough time has passed since the merge that these are no longer needed. 
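After this cleanup, launching `compute_ctl` is uniform: callers always pass the merged config via `--config`, and the old `--help` probing plus `--spec-path` fallback disappear. The sketch below shows such an invocation; the paths, connection string, and compute id are placeholders, and the flag set is trimmed to the ones visible in this series.

```rust
use std::process::{Child, Command};

// Minimal sketch of launching compute_ctl post-cleanup. Paths, the connection
// string and the compute id are placeholders; the point is that the config is
// always passed with --config (the --spec-path alias no longer exists).
fn spawn_compute_ctl() -> std::io::Result<Child> {
    Command::new("/usr/local/bin/compute_ctl")
        .args(["--pgdata", "/var/db/postgres/compute"])
        .args(["--connstr", "postgresql://cloud_admin@localhost:55433/postgres"])
        .args(["--pgbin", "/usr/local/bin/postgres"])
        .args(["--compute-id", "compute-example"])
        .arg("--config")
        .arg("/var/db/postgres/configs/config.json")
        .spawn()
}

fn main() -> std::io::Result<()> {
    let mut child = spawn_compute_ctl()?;
    let status = child.wait()?;
    println!("compute_ctl exited with {status}");
    Ok(())
}
```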
Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 4 +-- control_plane/src/endpoint.rs | 29 ++----------------- .../compute_wrapper/shell/compute.sh | 12 +------- 3 files changed, 4 insertions(+), 41 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 16fd51d79a..e337ee7b15 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -116,9 +116,7 @@ struct Cli { #[arg(long)] pub set_disk_quota_for_fs: Option, - // TODO(tristan957): remove alias after compatibility tests are no longer - // an issue - #[arg(short = 'c', long, alias = "spec-path")] + #[arg(short = 'c', long)] pub config: Option, #[arg(short = 'i', long, group = "compute-id")] diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b569b0fb8e..4071b620d6 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -766,10 +766,6 @@ impl Endpoint { } }; - // TODO(tristan957): Remove the write to spec.json after compatibility - // tests work themselves out - let spec_path = self.endpoint_path().join("spec.json"); - std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?; let config_path = self.endpoint_path().join("config.json"); std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?; @@ -779,16 +775,6 @@ impl Endpoint { .append(true) .open(self.endpoint_path().join("compute.log"))?; - // TODO(tristan957): Remove when compatibility tests are no longer an - // issue - let old_compute_ctl = { - let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - let help_output = cmd.arg("--help").output()?; - let help_output = String::from_utf8_lossy(&help_output.stdout); - - !help_output.contains("--config") - }; - // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{}'", conn_str); @@ -807,19 +793,8 @@ impl Endpoint { ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) - // TODO(tristan957): Change this to --config when compatibility tests - // are no longer an issue - .args([ - "--spec-path", - self.endpoint_path() - .join(if old_compute_ctl { - "spec.json" - } else { - "config.json" - }) - .to_str() - .unwrap(), - ]) + .arg("--config") + .arg(self.endpoint_path().join("config.json").as_os_str()) .args([ "--pgbin", self.env diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 9409e9d055..723b2f8afb 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -81,19 +81,9 @@ sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE} cat ${CONFIG_FILE} -# TODO(tristan957): Remove these workarounds for backwards compatibility after -# the next compute release. That includes these next few lines and the -# --spec-path in the compute_ctl invocation. 
-if compute_ctl --help | grep --quiet -- '--config'; then - SPEC_PATH="$CONFIG_FILE" -else - jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json - SPEC_PATH=/tmp/spec.json -fi - echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ --compute-id "compute-$RANDOM" \ - --spec-path "$SPEC_PATH" + --config "$CONFIG_FILE" From 9794f386f48144f4475c8f950dbf736c3664624c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Apr 2025 18:23:37 -0500 Subject: [PATCH 25/55] Make Postgres 17 the default version (#11619) This is mostly a documentation update, but a few updates with regard to neon_local, pageserver, and tests. 17 is our default for users in production, so dropping references to 16 makes sense. Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- .github/workflows/pg-clients.yml | 2 +- README.md | 2 +- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/local_env.rs | 2 +- pageserver/src/lib.rs | 2 +- pageserver/src/tenant/metadata.rs | 5 +++-- test_runner/README.md | 2 +- test_runner/performance/README.md | 2 +- test_runner/performance/pageserver/README.md | 2 +- .../pageserver/interactive/test_many_small_tenants.py | 2 +- test_runner/regress/test_compatibility.py | 6 +++--- 11 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 098503769e..cad222f60d 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -30,7 +30,7 @@ permissions: statuses: write # require for posting a status update env: - DEFAULT_PG_VERSION: 16 + DEFAULT_PG_VERSION: 17 PLATFORM: neon-captest-new AWS_DEFAULT_REGION: eu-central-1 diff --git a/README.md b/README.md index 4453904346..c3ba00393e 100644 --- a/README.md +++ b/README.md @@ -270,7 +270,7 @@ By default, this runs both debug and release modes, and all supported postgres v testing locally, it is convenient to run just one set of permutations, like this: ```sh -DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest +DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest ``` ## Flamegraphs diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1ff4295438..af0504b957 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -63,7 +63,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: u32 = 16; +const DEFAULT_PG_VERSION: u32 = 17; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index b7906e5f81..5e3cf95a31 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -23,7 +23,7 @@ use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; -pub const DEFAULT_PG_VERSION: u32 = 16; +pub const DEFAULT_PG_VERSION: u32 = 17; // // This data structures represents neon_local CLI config diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index bda218444d..42454e7356 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -49,7 +49,7 @@ use tracing::{info, info_span}; /// backwards-compatible changes to the metadata format. 
pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 16; +pub const DEFAULT_PG_VERSION: u32 = 17; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index dceae89d1c..bea3128265 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -564,8 +564,9 @@ mod tests { Lsn(0), Lsn(0), Lsn(0), - // Any version will do here, so use the default - crate::DEFAULT_PG_VERSION, + // Updating this version to 17 will cause the test to fail at the + // next assert_eq!(). + 16, ); let expected_bytes = vec![ /* TimelineMetadataHeader */ diff --git a/test_runner/README.md b/test_runner/README.md index f342ef8aaa..c93352aaa9 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -72,7 +72,7 @@ Inside that dir, a `bin/postgres` binary should be present. `COMPATIBILITY_POSTGRES_DISTRIB_DIR`: The directory where the prevoius version of postgres distribution can be found. `DEFAULT_PG_VERSION`: The version of Postgres to use, This is used to construct full path to the postgres binaries. -Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` +Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=17` `TEST_OUTPUT`: Set the directory where test state and test output files should go. `RUST_LOG`: logging configuration to pass into Neon CLI diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 85096d3770..3b25a60e9b 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -7,7 +7,7 @@ easier to see if you have compile errors without scrolling up. You may also need to run `./scripts/pysync`. Then run the tests -`DEFAULT_PG_VERSION=16 NEON_BIN=./target/release poetry run pytest test_runner/performance` +`DEFAULT_PG_VERSION=17 NEON_BIN=./target/release poetry run pytest test_runner/performance` Some handy pytest flags for local development: - `-x` tells pytest to stop on first error diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md index 56ffad9963..b8bc8923cd 100644 --- a/test_runner/performance/pageserver/README.md +++ b/test_runner/performance/pageserver/README.md @@ -11,6 +11,6 @@ It supports mounting snapshots using overlayfs, which improves iteration time. Here's a full command line. 
``` -RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release \ +RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=17 BUILD_TYPE=release \ ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py ```` diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 4931295beb..0e67a6b709 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -16,7 +16,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking """ Usage: -DEFAULT_PG_VERSION=16 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ +DEFAULT_PG_VERSION=17 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py """ diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e23b1e0bca..7bd25cb109 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -48,7 +48,7 @@ if TYPE_CHECKING: # # How to run `test_backward_compatibility` locally: # -# export DEFAULT_PG_VERSION=16 +# export DEFAULT_PG_VERSION=17 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} @@ -70,7 +70,7 @@ if TYPE_CHECKING: # # How to run `test_forward_compatibility` locally: # -# export DEFAULT_PG_VERSION=16 +# export DEFAULT_PG_VERSION=17 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} @@ -96,7 +96,7 @@ if TYPE_CHECKING: # # How to run `test_version_mismatch` locally: # -# export DEFAULT_PG_VERSION=16 +# export DEFAULT_PG_VERSION=17 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} From b7548de81408fff00d3f89004477c66f40a7cce1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Apr 2025 08:07:45 +0300 Subject: [PATCH 26/55] Disable autovacuum and increase limit for WS approximation (#11583) ## Problem Test lfc working set approximation becomes flaky after recent changes in prefetch. May be it is caused by updating HLL in `lfc_write`, may be by some other reasons. ## Summary of changes 1. Disable autovacuum in this test (as possible source of extra page accesses). 2. Increase upper boundary for WS approximation from 12 to 20. 
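For reference, the bound being relaxed here applies to the value returned by the `neon` extension's `approximate_working_set_size()` function. A minimal sketch of inspecting it against a running endpoint, where `$ENDPOINT_CONNSTR` is a placeholder connection string:

```sh
# Query the LFC working-set estimate; after a narrow index scan the test now
# accepts up to 20 pages for this value instead of 12.
psql "$ENDPOINT_CONNSTR" \
    -c "create extension if not exists neon" \
    -c "select approximate_working_set_size(true)"
```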
--------- Co-authored-by: Konstantin Knizhnik --- .../test_lfc_working_set_approximation.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index e422622167..a28bc3d047 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -22,7 +22,12 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( "main", - config_lines=["neon.max_file_cache_size='128MB'", "neon.file_cache_size_limit='64MB'"], + config_lines=[ + "autovacuum=off", + "bgwriter_lru_maxpages=0", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], ) cur = endpoint.connect().cursor() @@ -72,7 +77,7 @@ WITH (fillfactor='100'); # verify working set size after some index access of a few select pages only blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") - assert blocks < 12 + assert blocks < 20 @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") @@ -83,6 +88,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): branch_name="main", config_lines=[ "autovacuum = off", + "bgwriter_lru_maxpages=0", "shared_buffers=1MB", "neon.max_file_cache_size=256MB", "neon.file_cache_size_limit=245MB", @@ -92,9 +98,9 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): cur = conn.cursor() cur.execute("create extension neon") cur.execute( - "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" ) - cur.execute("insert into t (pk) values (generate_series(1,1000000))") + cur.execute("insert into t (pk) values (generate_series(1,100000))") time.sleep(2) before_10k = time.monotonic() cur.execute("select sum(count) from t where pk between 10000 and 20000") @@ -115,5 +121,5 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): size = cur.fetchall()[0][0] // 8192 log.info(f"Table size {size} blocks") - assert estimation_1k >= 20 and estimation_1k <= 40 - assert estimation_10k >= 200 and estimation_10k <= 440 + assert estimation_1k >= 900 and estimation_1k <= 2000 + assert estimation_10k >= 9000 and estimation_10k <= 20000 From 5819938c93f2eb504b0e7593314d6d1f168c6196 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 17 Apr 2025 09:54:23 +0100 Subject: [PATCH 27/55] CI(pg-clients): fix workflow permissions (#11623) ## Problem `pg-clients` can't start: ``` The workflow is not valid. .github/workflows/pg-clients.yml (Line: 44, Col: 3): Error calling workflow 'neondatabase/neon/.github/workflows/build-build-tools-image.yml@aa19f10e7e958fbe0e0641f2e8c5952ce3be44b3'. The nested job 'check-image' is requesting 'packages: read', but is only allowed 'packages: none'. .github/workflows/pg-clients.yml (Line: 44, Col: 3): Error calling workflow 'neondatabase/neon/.github/workflows/build-build-tools-image.yml@aa19f10e7e958fbe0e0641f2e8c5952ce3be44b3'. The nested job 'build-image' is requesting 'packages: write', but is only allowed 'packages: none'. 
``` ## Summary of changes - Grant required `packages: write` permissions to the workflow --- .github/workflows/pg-clients.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index cad222f60d..fa4fd73b12 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -42,6 +42,8 @@ jobs: github-event-name: ${{ github.event_name }} build-build-tools-image: + permissions: + packages: write needs: [ check-permissions ] uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit From 07c2411f6be9c776a4a1dec9e1cb1e2029d06a52 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 17 Apr 2025 11:03:21 +0100 Subject: [PATCH 28/55] tests: remove mentions of ALLOW_*_COMPATIBILITY_BREAKAGE (#11618) ## Problem There are mentions of `ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` and `ALLOW_FORWARD_COMPATIBILITY_BREAKAGE`, but in reality, this mechanism doesn't work, so let's remove it to avoid confusion. The idea behind it was to allow some breaking changes by adding a special label to a PR that would `xfail` the test. However, in practice, this means we would need to carry this label through all subsequent PRs until the release (and artifact regeneration). This approach isn't really viable, as it increases the risk of missing a compatibility break in another PR. ## Summary of changes - Remove mentions and handling of `ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` / `ALLOW_FORWARD_COMPATIBILITY_BREAKAGE` --- .../actions/run-python-test-set/action.yml | 2 - test_runner/regress/test_compatibility.py | 137 +++++++----------- 2 files changed, 51 insertions(+), 88 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index fa6f882161..7139d37be9 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -113,8 +113,6 @@ runs: TEST_OUTPUT: /tmp/test_output BUILD_TYPE: ${{ inputs.build_type }} COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} - ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') - ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} SANITIZERS: ${{ inputs.sanitizers }} diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 7bd25cb109..784afbba82 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -36,10 +36,8 @@ if TYPE_CHECKING: # - `test_create_snapshot` a script wrapped in a test that creates a data snapshot. # - `test_backward_compatibility` checks that the current version of Neon can start/read/interract with a data snapshot created by the previous version. # The path to the snapshot is configured by COMPATIBILITY_SNAPSHOT_DIR environment variable. -# If the breakage is intentional, the test can be xfaild with setting ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE=true. # - `test_forward_compatibility` checks that a snapshot created by the current version can be started/read/interracted by the previous version of Neon. # Paths to Neon and Postgres are configured by COMPATIBILITY_NEON_BIN and COMPATIBILITY_POSTGRES_DISTRIB_DIR environment variables. 
-# If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true. # # The file contains a couple of helper functions: # - check_neon_works performs the test itself, feel free to add more checks there. @@ -208,36 +206,19 @@ def test_backward_compatibility( """ Test that the new binaries can read old data """ - breaking_changes_allowed = ( - os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") + env.pageserver.allowed_errors.append(ingest_lag_log_line) + env.start() + + check_neon_works( + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) - try: - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") - env.pageserver.allowed_errors.append(ingest_lag_log_line) - env.start() - - check_neon_works( - env, - test_output_dir=test_output_dir, - sql_dump_path=compatibility_snapshot_dir / "dump.sql", - repo_dir=env.repo_dir, - ) - - env.pageserver.assert_log_contains(ingest_lag_log_line) - - except Exception: - if breaking_changes_allowed: - pytest.xfail( - "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE env var" - ) - else: - raise - - assert not breaking_changes_allowed, ( - "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" - ) + env.pageserver.assert_log_contains(ingest_lag_log_line) @check_ondisk_data_compatibility_if_enabled @@ -254,72 +235,56 @@ def test_forward_compatibility( """ Test that the old binaries can read new data """ - breaking_changes_allowed = ( - os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" - ) neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api neon_env_builder.test_may_use_compatibility_snapshot_binaries = True - try: - neon_env_builder.num_safekeepers = 3 + neon_env_builder.num_safekeepers = 3 - # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). - # But always use the current version's neon_local binary. - # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. - assert neon_env_builder.compatibility_neon_binpath is not None, ( - "the environment variable COMPATIBILITY_NEON_BIN is required" - ) - assert neon_env_builder.compatibility_pg_distrib_dir is not None, ( - "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required" - ) - neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath - neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir + # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). + # But always use the current version's neon_local binary. + # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. 
+ assert neon_env_builder.compatibility_neon_binpath is not None, ( + "the environment variable COMPATIBILITY_NEON_BIN is required" + ) + assert neon_env_builder.compatibility_pg_distrib_dir is not None, ( + "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required" + ) + neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath + neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir - env = neon_env_builder.from_repo_dir( - compatibility_snapshot_dir / "repo", - ) - # there may be an arbitrary number of unrelated tests run between create_snapshot and here - env.pageserver.allowed_errors.append(ingest_lag_log_line) + env = neon_env_builder.from_repo_dir( + compatibility_snapshot_dir / "repo", + ) + # there may be an arbitrary number of unrelated tests run between create_snapshot and here + env.pageserver.allowed_errors.append(ingest_lag_log_line) - # not using env.pageserver.version because it was initialized before - prev_pageserver_version_str = env.get_binary_version("pageserver") - prev_pageserver_version_match = re.search( - "Neon page server git(?:-env)?:(.*) failpoints: (.*), features: (.*)", - prev_pageserver_version_str, - ) - if prev_pageserver_version_match is not None: - prev_pageserver_version = prev_pageserver_version_match.group(1) - else: - raise AssertionError( - "cannot find git hash in the version string: " + prev_pageserver_version_str - ) - - # does not include logs from previous runs - assert not env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") - - env.start() - - # ensure the specified pageserver is running - assert env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") - - check_neon_works( - env, - test_output_dir=test_output_dir, - sql_dump_path=compatibility_snapshot_dir / "dump.sql", - repo_dir=env.repo_dir, + # not using env.pageserver.version because it was initialized before + prev_pageserver_version_str = env.get_binary_version("pageserver") + prev_pageserver_version_match = re.search( + "Neon page server git(?:-env)?:(.*) failpoints: (.*), features: (.*)", + prev_pageserver_version_str, + ) + if prev_pageserver_version_match is not None: + prev_pageserver_version = prev_pageserver_version_match.group(1) + else: + raise AssertionError( + "cannot find git hash in the version string: " + prev_pageserver_version_str ) - except Exception: - if breaking_changes_allowed: - pytest.xfail( - "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE env var" - ) - else: - raise + # does not include logs from previous runs + assert not env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") - assert not breaking_changes_allowed, ( - "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + env.start() + + # ensure the specified pageserver is running + assert env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") + + check_neon_works( + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) From 0a2797358484fe3fb712dd20bd81342b2f882c20 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Apr 2025 14:29:16 +0100 Subject: [PATCH 29/55] pageserver: rename `Tenant` to `TenantShard` (#11589) ## Problem `Tenant` isn't really a whole tenant: it's just one shard of a tenant. 
## Summary of changes - Automated rename of Tenant to TenantShard - Followup commit to change references in comments --- pageserver/src/config.rs | 10 +- pageserver/src/consumption_metrics.rs | 4 +- pageserver/src/consumption_metrics/metrics.rs | 12 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/metrics.rs | 2 +- pageserver/src/page_service.rs | 2 +- pageserver/src/tenant.rs | 112 +++++++++--------- pageserver/src/tenant/mgr.rs | 65 +++++----- .../src/tenant/remote_timeline_client.rs | 8 +- .../tenant/remote_timeline_client/download.rs | 2 +- .../src/tenant/secondary/heatmap_uploader.rs | 8 +- pageserver/src/tenant/secondary/scheduler.rs | 2 +- pageserver/src/tenant/size.rs | 4 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 4 +- pageserver/src/tenant/tasks.rs | 12 +- pageserver/src/tenant/timeline.rs | 24 ++-- pageserver/src/tenant/timeline/delete.rs | 16 +-- .../src/tenant/timeline/detach_ancestor.rs | 18 +-- .../src/tenant/timeline/eviction_task.rs | 16 +-- pageserver/src/tenant/timeline/offload.rs | 6 +- pageserver/src/tenant/timeline/uninit.rs | 20 ++-- 22 files changed, 184 insertions(+), 171 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 26ae6af70e..c12ac32b7e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -118,13 +118,13 @@ pub struct PageServerConf { /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system. pub concurrent_tenant_warmup: ConfigurableSemaphore, - /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. + /// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, - /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. + /// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`. /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`. /// See the comment in `eviction_task` for details. /// - /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs + /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore, // How often to collect metrics and send them to the metrics endpoint. @@ -588,10 +588,10 @@ impl ConfigurableSemaphore { /// Initializse using a non-zero amount of permits. /// /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a - /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will + /// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any semaphore using future will /// behave like [`futures::future::pending`], just waiting until new permits are added. 
/// - /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs + /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs pub fn new(initial_permits: NonZeroUsize) -> Self { ConfigurableSemaphore { initial_permits, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 0231190e69..3ca82528cf 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::mgr::TenantManager; use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{LogicalSizeCalculationCause, Tenant}; +use crate::tenant::{LogicalSizeCalculationCause, TenantShard}; mod disk_cache; mod metrics; @@ -428,7 +428,7 @@ async fn calculate_synthetic_size_worker( } } -async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) { +async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) { const CAUSE: LogicalSizeCalculationCause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 71910011ea..a4bfe74e30 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -175,9 +175,9 @@ impl MetricsKey { .absolute_values() } - /// [`Tenant::remote_size`] + /// [`TenantShard::remote_size`] /// - /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size + /// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory { MetricsKey { tenant_id, @@ -199,9 +199,9 @@ impl MetricsKey { .absolute_values() } - /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. + /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. /// - /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size + /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory { MetricsKey { @@ -254,7 +254,7 @@ pub(super) async fn collect_all_metrics( async fn collect(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec where - S: futures::stream::Stream)>, + S: futures::stream::Stream)>, { let mut current_metrics: Vec = Vec::new(); @@ -308,7 +308,7 @@ impl TenantSnapshot { /// /// `resident_size` is calculated of the timelines we had access to for other metrics, so we /// cannot just list timelines here. 
- fn collect(t: &Arc, resident_size: u64) -> Self { + fn collect(t: &Arc, resident_size: u64) -> Self { TenantSnapshot { resident_size, remote_size: t.remote_size(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bbc4bfae1b..8b6500b020 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1873,7 +1873,7 @@ async fn update_tenant_config_handler( &ShardParameters::default(), ); - crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; @@ -1917,7 +1917,7 @@ async fn patch_tenant_config_handler( &ShardParameters::default(), ); - crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2a779b0daa..ce229bbbec 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1086,7 +1086,7 @@ pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy = Lazy::new(|| { .expect("Failed to register metric") }); -/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things +/// Metrics related to the lifecycle of a [`crate::tenant::TenantShard`] object: things /// like how long it took to load. /// /// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 560ac75f4a..d1a210a786 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -76,7 +76,7 @@ use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::{GetTimelineError, PageReconstructError, Timeline}; use crate::{basebackup, timed_after_cancellation}; -/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which +/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::TenantShard`] which /// is not yet in state [`TenantState::Active`]. /// /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0ba70f45b2..997fc24052 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -158,7 +158,7 @@ pub struct TenantSharedResources { pub l0_flush_global_state: L0FlushGlobalState, } -/// A [`Tenant`] is really an _attached_ tenant. The configuration +/// A [`TenantShard`] is really an _attached_ tenant. The configuration /// for an attached tenant is a subset of the [`LocationConf`], represented /// in this struct. #[derive(Clone)] @@ -245,7 +245,7 @@ pub(crate) enum SpawnMode { /// /// Tenant consists of multiple timelines. Keep them in a hash table. /// -pub struct Tenant { +pub struct TenantShard { // Global pageserver config parameters pub conf: &'static PageServerConf, @@ -267,7 +267,7 @@ pub struct Tenant { shard_identity: ShardIdentity, /// The remote storage generation, used to protect S3 objects from split-brain. - /// Does not change over the lifetime of the [`Tenant`] object. + /// Does not change over the lifetime of the [`TenantShard`] object. 
/// /// This duplicates the generation stored in LocationConf, but that structure is mutable: /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime. @@ -309,7 +309,7 @@ pub struct Tenant { // Access to global deletion queue for when this tenant wants to schedule a deletion deletion_queue_client: DeletionQueueClient, - /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. + /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, @@ -337,12 +337,12 @@ pub struct Tenant { // Timelines' cancellation token. pub(crate) cancel: CancellationToken, - // Users of the Tenant such as the page service must take this Gate to avoid - // trying to use a Tenant which is shutting down. + // Users of the TenantShard such as the page service must take this Gate to avoid + // trying to use a TenantShard which is shutting down. pub(crate) gate: Gate, /// Throttle applied at the top of [`Timeline::get`]. - /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. + /// All [`TenantShard::timelines`] of a given [`TenantShard`] instance share the same [`throttle::Throttle`] instance. pub(crate) pagestream_throttle: Arc, pub(crate) pagestream_throttle_metrics: Arc, @@ -362,7 +362,7 @@ pub struct Tenant { l0_flush_global_state: L0FlushGlobalState, } -impl std::fmt::Debug for Tenant { +impl std::fmt::Debug for TenantShard { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{} ({})", self.tenant_shard_id, self.current_state()) } @@ -841,7 +841,7 @@ impl Debug for SetStoppingError { } } -/// Arguments to [`Tenant::create_timeline`]. +/// Arguments to [`TenantShard::create_timeline`]. /// /// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`] /// is `None`, the result of the timeline create call is not deterministic. @@ -876,7 +876,7 @@ pub(crate) struct CreateTimelineParamsImportPgdata { pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey, } -/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`] in [`Tenant::start_creating_timeline`]. +/// What is used to determine idempotency of a [`TenantShard::create_timeline`] call in [`TenantShard::start_creating_timeline`] in [`TenantShard::start_creating_timeline`]. /// /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`]. /// @@ -914,7 +914,7 @@ pub(crate) struct CreatingTimelineIdempotencyImportPgdata { idempotency_key: import_pgdata::index_part_format::IdempotencyKey, } -/// What is returned by [`Tenant::start_creating_timeline`]. +/// What is returned by [`TenantShard::start_creating_timeline`]. #[must_use] enum StartCreatingTimelineResult { CreateGuard(TimelineCreateGuard), @@ -943,13 +943,13 @@ struct TimelineInitAndSyncNeedsSpawnImportPgdata { guard: TimelineCreateGuard, } -/// What is returned by [`Tenant::create_timeline`]. +/// What is returned by [`TenantShard::create_timeline`]. enum CreateTimelineResult { Created(Arc), Idempotent(Arc), - /// IMPORTANT: This [`Arc`] object is not in [`Tenant::timelines`] when + /// IMPORTANT: This [`Arc`] object is not in [`TenantShard::timelines`] when /// we return this result, nor will this concrete object ever be added there. 
- /// Cf method comment on [`Tenant::create_timeline_import_pgdata`]. + /// Cf method comment on [`TenantShard::create_timeline_import_pgdata`]. ImportSpawned(Arc), } @@ -1082,7 +1082,7 @@ pub(crate) enum LoadConfigError { NotFound(Utf8PathBuf), } -impl Tenant { +impl TenantShard { /// Yet another helper for timeline initialization. /// /// - Initializes the Timeline struct and inserts it into the tenant's hash map @@ -1303,7 +1303,7 @@ impl Tenant { init_order: Option, mode: SpawnMode, ctx: &RequestContext, - ) -> Result, GlobalShutDown> { + ) -> Result, GlobalShutDown> { let wal_redo_manager = WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?; @@ -1317,7 +1317,7 @@ impl Tenant { let attach_mode = attached_conf.location.attach_mode; let generation = attached_conf.location.generation; - let tenant = Arc::new(Tenant::new( + let tenant = Arc::new(TenantShard::new( TenantState::Attaching, conf, attached_conf, @@ -1334,7 +1334,7 @@ impl Tenant { let attach_gate_guard = tenant .gate .enter() - .expect("We just created the Tenant: nothing else can have shut it down yet"); + .expect("We just created the TenantShard: nothing else can have shut it down yet"); // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); @@ -1362,7 +1362,7 @@ impl Tenant { } } - fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) { + fn make_broken_or_stopping(t: &TenantShard, err: anyhow::Error) { t.state.send_modify(|state| match state { // TODO: the old code alluded to DeleteTenantFlow sometimes setting // TenantState::Stopping before we get here, but this may be outdated. @@ -1627,7 +1627,7 @@ impl Tenant { /// No background tasks are started as part of this routine. /// async fn attach( - self: &Arc, + self: &Arc, preload: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -1957,7 +1957,7 @@ impl Tenant { } async fn load_timelines_metadata( - self: &Arc, + self: &Arc, timeline_ids: HashSet, remote_storage: &GenericRemoteStorage, heatmap: Option<(HeatMapTenant, std::time::Instant)>, @@ -2028,7 +2028,7 @@ impl Tenant { } fn load_timeline_metadata( - self: &Arc, + self: &Arc, timeline_id: TimelineId, remote_storage: GenericRemoteStorage, previous_heatmap: Option, @@ -2429,14 +2429,14 @@ impl Tenant { /// This is used by tests & import-from-basebackup. /// /// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in - /// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`. + /// a state that will fail [`TenantShard::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`. /// /// The caller is responsible for getting the timeline into a state that will be accepted - /// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`]. + /// by [`TenantShard::load_remote_timeline`] / [`TenantShard::attach`]. /// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline - /// to the [`Tenant::timelines`]. + /// to the [`TenantShard::timelines`]. /// - /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys. + /// Tests should use `TenantShard::create_test_timeline` to set up the minimum required metadata keys. pub(crate) async fn create_empty_timeline( self: &Arc, new_timeline_id: TimelineId, @@ -2584,7 +2584,7 @@ impl Tenant { /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. 
#[allow(clippy::too_many_arguments)] pub(crate) async fn create_timeline( - self: &Arc, + self: &Arc, params: CreateTimelineParams, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, @@ -2751,13 +2751,13 @@ impl Tenant { Ok(activated_timeline) } - /// The returned [`Arc`] is NOT in the [`Tenant::timelines`] map until the import + /// The returned [`Arc`] is NOT in the [`TenantShard::timelines`] map until the import /// completes in the background. A DIFFERENT [`Arc`] will be inserted into the - /// [`Tenant::timelines`] map when the import completes. + /// [`TenantShard::timelines`] map when the import completes. /// We only return an [`Arc`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`] /// for the response. async fn create_timeline_import_pgdata( - self: &Arc, + self: &Arc, params: CreateTimelineParamsImportPgdata, activate: ActivateTimelineArgs, ctx: &RequestContext, @@ -2854,7 +2854,7 @@ impl Tenant { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] async fn create_timeline_import_pgdata_task( - self: Arc, + self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, activate: ActivateTimelineArgs, @@ -2882,7 +2882,7 @@ impl Tenant { } async fn create_timeline_import_pgdata_task_impl( - self: Arc, + self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, activate: ActivateTimelineArgs, @@ -2899,10 +2899,10 @@ impl Tenant { // Reload timeline from remote. // This proves that the remote state is attachable, and it reuses the code. // - // TODO: think about whether this is safe to do with concurrent Tenant::shutdown. + // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown. // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. - // But our activate() call might launch new background tasks after Tenant::shutdown - // already went past shutting down the Tenant::timelines, which this timeline here is no part of. + // But our activate() call might launch new background tasks after TenantShard::shutdown + // already went past shutting down the TenantShard::timelines, which this timeline here is no part of. // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting // down while bootstrapping/branching + activating), but, the race condition is much more likely // to manifest because of the long runtime of this import task. @@ -2917,7 +2917,7 @@ impl Tenant { // }; let timeline_id = timeline.timeline_id; - // load from object storage like Tenant::attach does + // load from object storage like TenantShard::attach does let resources = self.build_timeline_resources(timeline_id); let index_part = resources .remote_client @@ -3938,7 +3938,7 @@ enum ActivateTimelineArgs { No, } -impl Tenant { +impl TenantShard { pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig { self.tenant_conf.load().tenant_conf.clone() } @@ -4096,7 +4096,7 @@ impl Tenant { update: F, ) -> anyhow::Result { // Use read-copy-update in order to avoid overwriting the location config - // state if this races with [`Tenant::set_new_location_config`]. Note that + // state if this races with [`TenantShard::set_new_location_config`]. Note that // this race is not possible if both request types come from the storage // controller (as they should!) 
because an exclusive op lock is required // on the storage controller side. @@ -4219,7 +4219,7 @@ impl Tenant { Ok((timeline, timeline_ctx)) } - /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object + /// [`TenantShard::shutdown`] must be called before dropping the returned [`TenantShard`] object /// to ensure proper cleanup of background tasks and metrics. // // Allow too_many_arguments because a constructor's argument list naturally grows with the @@ -4235,7 +4235,7 @@ impl Tenant { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, - ) -> Tenant { + ) -> TenantShard { debug_assert!( !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() ); @@ -4295,7 +4295,7 @@ impl Tenant { } }); - Tenant { + TenantShard { tenant_shard_id, shard_identity, generation: attached_conf.location.generation, @@ -4330,7 +4330,7 @@ impl Tenant { cancel: CancellationToken::default(), gate: Gate::default(), pagestream_throttle: Arc::new(throttle::Throttle::new( - Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), + TenantShard::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), )), pagestream_throttle_metrics: Arc::new( crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id), @@ -4466,11 +4466,11 @@ impl Tenant { // Perform GC for each timeline. // - // Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the + // Note that we don't hold the `TenantShard::gc_cs` lock here because we don't want to delay the // branch creation task, which requires the GC lock. A GC iteration can run concurrently // with branch creation. // - // See comments in [`Tenant::branch_timeline`] for more information about why branch + // See comments in [`TenantShard::branch_timeline`] for more information about why branch // creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { if cancel.is_cancelled() { @@ -4500,7 +4500,7 @@ impl Tenant { /// Refreshes the Timeline::gc_info for all timelines, returning the /// vector of timelines which have [`Timeline::get_last_record_lsn`] past - /// [`Tenant::get_gc_horizon`]. + /// [`TenantShard::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. pub(crate) async fn refresh_gc_info( @@ -5499,7 +5499,7 @@ impl Tenant { } } - // The flushes we did above were just writes, but the Tenant might have had + // The flushes we did above were just writes, but the TenantShard might have had // pending deletions as well from recent compaction/gc: we want to flush those // as well. This requires flushing the global delete queue. This is cheap // because it's typically a no-op. @@ -5517,7 +5517,7 @@ impl Tenant { /// How much local storage would this tenant like to have? It can cope with /// less than this (via eviction and on-demand downloads), but this function enables - /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O + /// the TenantShard to advertise how much storage it would prefer to have to provide fast I/O /// by keeping important things on local disk. /// /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less @@ -5540,11 +5540,11 @@ impl Tenant { /// manifest in `Self::remote_tenant_manifest`. 
/// /// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after - /// changing any `Tenant` state that's included in the manifest, consider making the manifest + /// changing any `TenantShard` state that's included in the manifest, consider making the manifest /// the authoritative source of data with an API that automatically uploads on changes. Revisit /// this when the manifest is more widely used and we have a better idea of the data model. pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> { - // Multiple tasks may call this function concurrently after mutating the Tenant runtime + // Multiple tasks may call this function concurrently after mutating the TenantShard runtime // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but // simple coalescing mechanism. @@ -5812,7 +5812,7 @@ pub(crate) mod harness { info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) } - pub(crate) async fn load(&self) -> (Arc, RequestContext) { + pub(crate) async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) .with_scope_unit_test(); ( @@ -5827,10 +5827,10 @@ pub(crate) mod harness { pub(crate) async fn do_try_load( &self, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); - let tenant = Arc::new(Tenant::new( + let tenant = Arc::new(TenantShard::new( TenantState::Attaching, self.conf, AttachedTenantConf::try_from(LocationConf::attached_single( @@ -6046,7 +6046,7 @@ mod tests { #[cfg(feature = "testing")] #[allow(clippy::too_many_arguments)] async fn randomize_timeline( - tenant: &Arc, + tenant: &Arc, new_timeline_id: TimelineId, pg_version: u32, spec: TestTimelineSpecification, @@ -6936,7 +6936,7 @@ mod tests { } async fn bulk_insert_compact_gc( - tenant: &Tenant, + tenant: &TenantShard, timeline: &Arc, ctx: &RequestContext, lsn: Lsn, @@ -6948,7 +6948,7 @@ mod tests { } async fn bulk_insert_maybe_compact_gc( - tenant: &Tenant, + tenant: &TenantShard, timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, @@ -7858,7 +7858,7 @@ mod tests { let (tline, _ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again + // Leave the timeline ID in [`TenantShard::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown(super::timeline::ShutdownMode::Hard) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index ac81b8e3d7..2ae7e1e875 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -52,7 +52,9 @@ use crate::tenant::config::{ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState}; +use crate::tenant::{ + AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState, +}; use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; @@ -67,7 +69,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; /// 
having a properly acquired generation (Secondary doesn't need a generation) #[derive(Clone)] pub(crate) enum TenantSlot { - Attached(Arc), + Attached(Arc), Secondary(Arc), /// In this state, other administrative operations acting on the TenantId should /// block, or return a retry indicator equivalent to HTTP 503. @@ -86,7 +88,7 @@ impl std::fmt::Debug for TenantSlot { impl TenantSlot { /// Return the `Tenant` in this slot if attached, else None - fn get_attached(&self) -> Option<&Arc> { + fn get_attached(&self) -> Option<&Arc> { match self { Self::Attached(t) => Some(t), Self::Secondary(_) => None, @@ -164,7 +166,7 @@ impl TenantStartupMode { /// Result type for looking up a TenantId to a specific shard pub(crate) enum ShardResolveResult { NotFound, - Found(Arc), + Found(Arc), // Wait for this barrrier, then query again InProgress(utils::completion::Barrier), } @@ -173,7 +175,7 @@ impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, /// None is returned. - pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc> { + pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { @@ -410,7 +412,7 @@ fn load_tenant_config( return None; } - Some(Tenant::load_tenant_config(conf, &tenant_shard_id)) + Some(TenantShard::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, @@ -606,7 +608,8 @@ pub async fn init_tenant_mgr( // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. config_write_futs.push(async move { - let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + let r = + TenantShard::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; (tenant_shard_id, location_conf, r) }); } @@ -694,7 +697,7 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> Result, GlobalShutDown> { +) -> Result, GlobalShutDown> { // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode // to avoid impacting prod runtime performance. @@ -706,7 +709,7 @@ fn tenant_spawn( .unwrap() ); - Tenant::spawn( + TenantShard::spawn( conf, tenant_shard_id, resources, @@ -883,12 +886,12 @@ impl TenantManager { /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently /// undergoing a state change (i.e. slot is InProgress). /// - /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or - /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it. + /// The return TenantShard is not guaranteed to be active: check its status after obtaing it, or + /// use [`TenantShard::wait_to_become_active`] before using it if you will do I/O on it. 
pub(crate) fn get_attached_tenant_shard( &self, tenant_shard_id: TenantShardId, - ) -> Result, GetTenantError> { + ) -> Result, GetTenantError> { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; @@ -937,12 +940,12 @@ impl TenantManager { flush: Option, mut spawn_mode: SpawnMode, ctx: &RequestContext, - ) -> Result>, UpsertLocationError> { + ) -> Result>, UpsertLocationError> { debug_assert_current_span_has_tenant_id(); info!("configuring tenant location to state {new_location_config:?}"); enum FastPathModified { - Attached(Arc), + Attached(Arc), Secondary(Arc), } @@ -999,9 +1002,13 @@ impl TenantManager { // phase of writing config and/or waiting for flush, before returning. match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .fatal_err("write tenant shard config"); + TenantShard::persist_tenant_config( + self.conf, + &tenant_shard_id, + &new_location_config, + ) + .await + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -1030,9 +1037,13 @@ impl TenantManager { return Ok(Some(tenant)); } Some(FastPathModified::Secondary(_secondary_tenant)) => { - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .fatal_err("write tenant shard config"); + TenantShard::persist_tenant_config( + self.conf, + &tenant_shard_id, + &new_location_config, + ) + .await + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1122,7 +1133,7 @@ impl TenantManager { // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + TenantShard::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await .fatal_err("write tenant shard config"); @@ -1262,7 +1273,7 @@ impl TenantManager { let tenant_path = self.conf.tenant_path(&tenant_shard_id); let timelines_path = self.conf.timelines_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)?; if drop_cache { tracing::info!("Dropping local file cache"); @@ -1297,7 +1308,7 @@ impl TenantManager { Ok(()) } - pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec> { + pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec> { let locked = self.tenants.read().unwrap(); match &*locked { TenantsMap::Initializing => Vec::new(), @@ -1446,7 +1457,7 @@ impl TenantManager { #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] pub(crate) async fn shard_split( &self, - tenant: Arc, + tenant: Arc, new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, @@ -1476,7 +1487,7 @@ impl TenantManager { pub(crate) async fn do_shard_split( &self, - tenant: Arc, + tenant: Arc, new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, @@ -1703,7 +1714,7 @@ impl TenantManager { /// For each resident layer in the parent shard, we will hard link it into all of the child shards. 
async fn shard_split_hardlink( &self, - parent_shard: &Tenant, + parent_shard: &TenantShard, child_shards: Vec, ) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); @@ -1988,7 +1999,7 @@ impl TenantManager { } let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id) + let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id) .map_err(|e| Error::DetachReparent(e.into()))?; let shard_identity = config.shard; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 10a13ef1a2..ea29f51956 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -133,7 +133,7 @@ //! - Initiate upload queue with that [`IndexPart`]. //! - Reschedule all lost operations by comparing the local filesystem state //! and remote state as per [`IndexPart`]. This is done in -//! [`Tenant::timeline_init_and_sync`]. +//! [`TenantShard::timeline_init_and_sync`]. //! //! Note that if we crash during file deletion between the index update //! that removes the file from the list of files, and deleting the remote file, @@ -171,7 +171,7 @@ //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is //! not created and the uploads are skipped. //! -//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync +//! [`TenantShard::timeline_init_and_sync`]: super::TenantShard::timeline_init_and_sync //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map pub(crate) mod download; @@ -2743,7 +2743,7 @@ mod tests { use crate::tenant::config::AttachmentMode; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::layer::local_layer_path; - use crate::tenant::{Tenant, Timeline}; + use crate::tenant::{TenantShard, Timeline}; pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() @@ -2796,7 +2796,7 @@ mod tests { struct TestSetup { harness: TenantHarness, - tenant: Arc, + tenant: Arc, timeline: Arc, tenant_ctx: RequestContext, } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 8b399996d5..70f77ef9e8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -452,7 +452,7 @@ async fn do_download_index_part( /// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back /// to listing objects. /// -/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]` +/// * `my_generation`: the value of `[crate::tenant::TenantShard::generation]` /// * `what`: for logging, what object are we downloading /// * `prefix`: when listing objects, use this prefix (i.e. 
the part of the object path before the generation) /// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 3375714a66..46bc0ef235 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -21,7 +21,7 @@ use super::scheduler::{ use super::{CommandRequest, SecondaryTenantError, UploadCommand}; use crate::TEMP_FILE_SUFFIX; use crate::metrics::SECONDARY_MODE; -use crate::tenant::Tenant; +use crate::tenant::TenantShard; use crate::tenant::config::AttachmentMode; use crate::tenant::mgr::{GetTenantError, TenantManager}; use crate::tenant::remote_timeline_client::remote_heatmap_path; @@ -74,7 +74,7 @@ impl RunningJob for WriteInProgress { } struct UploadPending { - tenant: Arc, + tenant: Arc, last_upload: Option, target_time: Option, period: Option, @@ -106,7 +106,7 @@ impl scheduler::Completion for WriteComplete { struct UploaderTenantState { // This Weak only exists to enable culling idle instances of this type // when the Tenant has been deallocated. - tenant: Weak, + tenant: Weak, /// Digest of the serialized heatmap that we last successfully uploaded last_upload_state: Option, @@ -357,7 +357,7 @@ struct LastUploadState { /// of the object we would have uploaded. async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, - tenant: &Arc, + tenant: &Arc, last_upload: Option, ) -> Result { debug_assert_current_span_has_tenant_id(); diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index f948f9114f..62ca527bbc 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -360,7 +360,7 @@ where /// Periodic execution phase: inspect all attached tenants and schedule any work they require. /// - /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`] + /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::TenantShard`] or [`crate::tenant::secondary::SecondaryTenant`] /// /// This function resets the pending list: it is assumed that the caller may change their mind about /// which tenants need work between calls to schedule_iteration. 
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index c7ac50ca6a..bf5d9bc87a 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -12,7 +12,7 @@ use tracing::*; use utils::id::TimelineId; use utils::lsn::Lsn; -use super::{GcError, LogicalSizeCalculationCause, Tenant}; +use super::{GcError, LogicalSizeCalculationCause, TenantShard}; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::{MaybeOffloaded, Timeline}; @@ -156,7 +156,7 @@ pub struct TimelineInputs { /// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` pub(super) async fn gather_inputs( - tenant: &Tenant, + tenant: &TenantShard, limit: &Arc, max_retention_period: Option, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4417b8aa51..0654342a25 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1620,7 +1620,7 @@ pub(crate) mod test { use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; - use crate::tenant::{Tenant, Timeline}; + use crate::tenant::{TenantShard, Timeline}; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -2209,7 +2209,7 @@ pub(crate) mod test { } pub(crate) async fn produce_delta_layer( - tenant: &Tenant, + tenant: &TenantShard, tline: &Arc, mut deltas: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c2de20b5b3..8ee4cdee66 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1228,7 +1228,7 @@ mod test { use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; - use crate::tenant::{Tenant, Timeline}; + use crate::tenant::{TenantShard, Timeline}; #[tokio::test] async fn image_layer_rewrite() { @@ -1410,7 +1410,7 @@ mod test { } async fn produce_image_layer( - tenant: &Tenant, + tenant: &TenantShard, tline: &Arc, mut images: Vec<(Key, Bytes)>, lsn: Lsn, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 54588e788c..1112a5330b 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::compaction::CompactionOutcome; -use crate::tenant::{Tenant, TenantState}; +use crate::tenant::{TenantShard, TenantState}; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -117,7 +117,7 @@ pub(crate) async fn acquire_concurrency_permit( } /// Start per tenant background loops: compaction, GC, and ingest housekeeping. 
-pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) { +pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) { let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( @@ -198,7 +198,7 @@ pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) } /// Compaction task's main loop. -async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { +async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { const BASE_BACKOFF_SECS: f64 = 1.0; const MAX_BACKOFF_SECS: f64 = 300.0; const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10); @@ -348,7 +348,7 @@ pub(crate) fn log_compaction_error( } /// GC task's main loop. -async fn gc_loop(tenant: Arc, cancel: CancellationToken) { +async fn gc_loop(tenant: Arc, cancel: CancellationToken) { const MAX_BACKOFF_SECS: f64 = 300.0; let mut error_run = 0; // consecutive errors @@ -432,7 +432,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } /// Tenant housekeeping's main loop. -async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { +async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { let mut last_throttle_flag_reset_at = Instant::now(); loop { if wait_for_active_tenant(&tenant, &cancel).await.is_break() { @@ -483,7 +483,7 @@ async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken /// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down. async fn wait_for_active_tenant( - tenant: &Arc, + tenant: &Arc, cancel: &CancellationToken, ) -> ControlFlow<()> { if tenant.current_state() == TenantState::Active { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bc54c85119..5b126d516b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -412,7 +412,7 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, - /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. + /// Cloned from [`super::TenantShard::pagestream_throttle`] on construction. pub(crate) pagestream_throttle: Arc, /// Size estimator for aux file v2 @@ -2065,7 +2065,7 @@ impl Timeline { pub(crate) fn activate( self: &Arc, - parent: Arc, + parent: Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, @@ -3325,7 +3325,7 @@ impl Timeline { // (1) and (4) // TODO: this is basically a no-op now, should we remove it? self.remote_client.schedule_barrier()?; - // Tenant::create_timeline will wait for these uploads to happen before returning, or + // TenantShard::create_timeline will wait for these uploads to happen before returning, or // on retry. // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) @@ -5754,7 +5754,7 @@ impl Timeline { /// from our ancestor to be branches of this timeline. pub(crate) async fn prepare_to_detach_from_ancestor( self: &Arc, - tenant: &crate::tenant::Tenant, + tenant: &crate::tenant::TenantShard, options: detach_ancestor::Options, behavior: DetachBehavior, ctx: &RequestContext, @@ -5773,7 +5773,7 @@ impl Timeline { /// resetting the tenant. 
pub(crate) async fn detach_from_ancestor_and_reparent( self: &Arc, - tenant: &crate::tenant::Tenant, + tenant: &crate::tenant::TenantShard, prepared: detach_ancestor::PreparedTimelineDetach, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, @@ -5797,7 +5797,7 @@ impl Timeline { /// The tenant must've been reset if ancestry was modified previously (in tenant manager). pub(crate) async fn complete_detaching_timeline_ancestor( self: &Arc, - tenant: &crate::tenant::Tenant, + tenant: &crate::tenant::TenantShard, attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result<(), detach_ancestor::Error> { @@ -6859,14 +6859,14 @@ impl Timeline { /// Persistently blocks gc for `Manual` reason. /// /// Returns true if no such block existed before, false otherwise. - pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result { + pub(crate) async fn block_gc(&self, tenant: &super::TenantShard) -> anyhow::Result { use crate::tenant::remote_timeline_client::index::GcBlockingReason; assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); tenant.gc_block.insert(self, GcBlockingReason::Manual).await } /// Persistently unblocks gc for `Manual` reason. - pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> { + pub(crate) async fn unblock_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<()> { use crate::tenant::remote_timeline_client::index::GcBlockingReason; assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); tenant.gc_block.remove(self, GcBlockingReason::Manual).await @@ -6884,8 +6884,8 @@ impl Timeline { /// Force create an image layer and place it into the layer map. /// - /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] - /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are + /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`] + /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are /// placed into the layer map in one run AND be validated. #[cfg(test)] pub(super) async fn force_create_image_layer( @@ -6941,8 +6941,8 @@ impl Timeline { /// Force create a delta layer and place it into the layer map. /// - /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] - /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are + /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`] + /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are /// placed into the layer map in one run AND be validated. #[cfg(test)] pub(super) async fn force_create_delta_layer( diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 64fcf1fe0d..1d4dd05e34 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -18,8 +18,8 @@ use crate::tenant::remote_timeline_client::{ PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }; use crate::tenant::{ - CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError, - Timeline, TimelineOrOffloaded, + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, TenantManifestError, + TenantShard, Timeline, TimelineOrOffloaded, }; use crate::virtual_file::MaybeFatalIo; @@ -113,7 +113,7 @@ pub(super) async fn delete_local_timeline_directory( /// It is important that this gets called when DeletionGuard is being held. 
/// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( - tenant: &Tenant, + tenant: &TenantShard, timeline: &TimelineOrOffloaded, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { @@ -192,7 +192,7 @@ impl DeleteTimelineFlow { // error out if some of the shutdown tasks have already been completed! #[instrument(skip_all)] pub async fn run( - tenant: &Arc, + tenant: &Arc, timeline_id: TimelineId, ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -288,7 +288,7 @@ impl DeleteTimelineFlow { /// Shortcut to create Timeline in stopping state and spawn deletion task. #[instrument(skip_all, fields(%timeline_id))] pub(crate) async fn resume_deletion( - tenant: Arc, + tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: RemoteTimelineClient, @@ -338,7 +338,7 @@ impl DeleteTimelineFlow { fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, - tenant: Arc, + tenant: Arc, timeline: TimelineOrOffloaded, remote_client: Arc, ) { @@ -381,7 +381,7 @@ impl DeleteTimelineFlow { async fn background( mut guard: DeletionGuard, conf: &PageServerConf, - tenant: &Tenant, + tenant: &TenantShard, timeline: &TimelineOrOffloaded, remote_client: Arc, ) -> Result<(), DeleteTimelineError> { @@ -435,7 +435,7 @@ pub(super) enum TimelineDeleteGuardKind { } pub(super) fn make_timeline_delete_guard( - tenant: &Tenant, + tenant: &TenantShard, timeline_id: TimelineId, guard_kind: TimelineDeleteGuardKind, ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index a841cc55f0..8e95c3a8ff 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -23,7 +23,7 @@ use super::layer_manager::LayerManager; use super::{FlushLayerError, Timeline}; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; -use crate::tenant::Tenant; +use crate::tenant::TenantShard; use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::storage_layer::{ @@ -265,7 +265,7 @@ async fn generate_tombstone_image_layer( /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, - tenant: &Tenant, + tenant: &TenantShard, behavior: DetachBehavior, options: Options, ctx: &RequestContext, @@ -590,7 +590,7 @@ pub(super) async fn prepare( async fn start_new_attempt( detached: &Timeline, - tenant: &Tenant, + tenant: &TenantShard, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, ) -> Result { @@ -611,7 +611,7 @@ async fn start_new_attempt( async fn continue_with_blocked_gc( detached: &Timeline, - tenant: &Tenant, + tenant: &TenantShard, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, ) -> Result { @@ -622,7 +622,7 @@ async fn continue_with_blocked_gc( fn obtain_exclusive_attempt( detached: &Timeline, - tenant: &Tenant, + tenant: &TenantShard, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, ) -> Result { @@ -655,7 +655,7 @@ fn obtain_exclusive_attempt( fn reparented_direct_children( detached: &Arc, - tenant: &Tenant, + tenant: &TenantShard, ) -> Result, Error> { let mut all_direct_children = tenant .timelines @@ -950,7 +950,7 @@ impl DetachingAndReparenting { /// See 
[`Timeline::detach_from_ancestor_and_reparent`]. pub(super) async fn detach_and_reparent( detached: &Arc, - tenant: &Tenant, + tenant: &TenantShard, prepared: PreparedTimelineDetach, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, @@ -1184,7 +1184,7 @@ pub(super) async fn detach_and_reparent( pub(super) async fn complete( detached: &Arc, - tenant: &Tenant, + tenant: &TenantShard, mut attempt: Attempt, _ctx: &RequestContext, ) -> Result<(), Error> { @@ -1258,7 +1258,7 @@ where } fn check_no_archived_children_of_ancestor( - tenant: &Tenant, + tenant: &TenantShard, detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 397e8e8978..b1b0d32c9b 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -33,7 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::storage_layer::LayerVisibilityHint; use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; use crate::tenant::timeline::EvictionError; -use crate::tenant::{LogicalSizeCalculationCause, Tenant}; +use crate::tenant::{LogicalSizeCalculationCause, TenantShard}; #[derive(Default)] pub struct EvictionTaskTimelineState { @@ -48,7 +48,7 @@ pub struct EvictionTaskTenantState { impl Timeline { pub(super) fn launch_eviction_task( self: &Arc, - parent: Arc, + parent: Arc, background_tasks_can_start: Option<&completion::Barrier>, ) { let self_clone = Arc::clone(self); @@ -75,7 +75,7 @@ impl Timeline { } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - async fn eviction_task(self: Arc, tenant: Arc) { + async fn eviction_task(self: Arc, tenant: Arc) { // acquire the gate guard only once within a useful span let Ok(guard) = self.gate.enter() else { return; @@ -118,7 +118,7 @@ impl Timeline { #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))] async fn eviction_iteration( self: &Arc, - tenant: &Tenant, + tenant: &TenantShard, policy: &EvictionPolicy, cancel: &CancellationToken, gate: &GateGuard, @@ -175,7 +175,7 @@ impl Timeline { async fn eviction_iteration_threshold( self: &Arc, - tenant: &Tenant, + tenant: &TenantShard, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, @@ -309,7 +309,7 @@ impl Timeline { /// disk usage based eviction task. 
async fn imitiate_only( self: &Arc, - tenant: &Tenant, + tenant: &TenantShard, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, @@ -363,7 +363,7 @@ impl Timeline { #[instrument(skip_all)] async fn imitate_layer_accesses( &self, - tenant: &Tenant, + tenant: &TenantShard, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, @@ -499,7 +499,7 @@ impl Timeline { #[instrument(skip_all)] async fn imitate_synthetic_size_calculation_worker( &self, - tenant: &Tenant, + tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext, ) { diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index f46f1676c9..5920315917 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -8,7 +8,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard}; use crate::tenant::{ - DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, + DeleteTimelineError, OffloadedTimeline, TenantManifestError, TenantShard, TimelineOrOffloaded, }; #[derive(thiserror::Error, Debug)] @@ -33,7 +33,7 @@ impl From for OffloadError { } pub(crate) async fn offload_timeline( - tenant: &Tenant, + tenant: &TenantShard, timeline: &Arc, ) -> Result<(), OffloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -123,7 +123,7 @@ pub(crate) async fn offload_timeline( /// /// Returns the strong count of the timeline `Arc` fn remove_timeline_from_tenant( - tenant: &Tenant, + tenant: &TenantShard, timeline: &Timeline, _: &DeletionGuard, // using it as a witness ) -> usize { diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index f66c0ffa0f..beebf35462 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -15,17 +15,19 @@ use super::Timeline; use crate::context::RequestContext; use crate::import_datadir; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}; +use crate::tenant::{ + CreateTimelineError, CreateTimelineIdempotency, TenantShard, TimelineOrOffloaded, +}; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or /// its local files are removed. If we crash while this class exists, then the timeline's local -/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage. +/// state is cleaned up during [`TenantShard::clean_up_timelines`], because the timeline's content isn't in remote storage. /// /// The caller is responsible for proper timeline data filling before the final init. 
#[must_use] pub struct UninitializedTimeline<'t> { - pub(crate) owning_tenant: &'t Tenant, + pub(crate) owning_tenant: &'t TenantShard, timeline_id: TimelineId, raw_timeline: Option<(Arc, TimelineCreateGuard)>, /// Whether we spawned the inner Timeline's tasks such that we must later shut it down @@ -35,7 +37,7 @@ pub struct UninitializedTimeline<'t> { impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( - owning_tenant: &'t Tenant, + owning_tenant: &'t TenantShard, timeline_id: TimelineId, raw_timeline: Option<(Arc, TimelineCreateGuard)>, ) -> Self { @@ -156,7 +158,7 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( mut self, - tenant: Arc, + tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, @@ -227,17 +229,17 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") } } - // Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other + // Having cleaned up, we can release this TimelineId in `[TenantShard::timelines_creating]` to allow other // timeline creation attempts under this TimelineId to proceed drop(create_guard); } /// A guard for timeline creations in process: as long as this object exists, the timeline ID -/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline. +/// is kept in `[TenantShard::timelines_creating]` to exclude concurrent attempts to create the same timeline. #[must_use] pub(crate) struct TimelineCreateGuard { pub(crate) _tenant_gate_guard: GateGuard, - pub(crate) owning_tenant: Arc, + pub(crate) owning_tenant: Arc, pub(crate) timeline_id: TimelineId, pub(crate) timeline_path: Utf8PathBuf, pub(crate) idempotency: CreateTimelineIdempotency, @@ -263,7 +265,7 @@ pub(crate) enum TimelineExclusionError { impl TimelineCreateGuard { pub(crate) fn new( - owning_tenant: &Arc, + owning_tenant: &Arc, timeline_id: TimelineId, timeline_path: Utf8PathBuf, idempotency: CreateTimelineIdempotency, From d1728a6bcd872ab9d79a95a5f3f5fa72243ed898 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 17 Apr 2025 09:08:42 -0500 Subject: [PATCH 30/55] Remove old compatibility hack for remote extensions (#11620) Control plane has long since been updated to send the right value. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e337ee7b15..aacef91d56 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -57,24 +57,13 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; -// Compatibility hack: if the control plane specified any remote-ext-config -// use the default value for extension storage proxy gateway. 
-// Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_config(arg: &str) -> Result { - if arg.starts_with("http") { - Ok(arg.trim_end_matches('/').to_string()) - } else { - Ok("http://pg-ext-s3-gateway".to_string()) - } -} - #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] + #[arg(short = 'r', long)] pub remote_ext_config: Option, /// The port to bind the external listening HTTP server to. Clients running From 2c56c46d48ddaef67b1673c34f2c0d74e071a530 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 17 Apr 2025 14:38:19 +0000 Subject: [PATCH 31/55] compute: Set max log level for local proxy sql_over_http mod to WARN (#11629) neondatabase/cloud#27738 --- compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index f63aa88da2..ec24d73242 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 8b3c681228..b40bdecebc 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn From d4c059a884dc83a7dc670432e3b3c443a32dbba0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Apr 2025 16:03:23 +0100 Subject: [PATCH 32/55] tests: use endpoint http wrapper to get auth (#11628) ## Problem `test_compute_startup_simple` and `test_compute_ondemand_slru_startup` are failing. This test implicitly asserts that the metrics.json endpoint succeeds and returns all expected metrics, but doesn't make it easy to see what went wrong if it doesn't (e.g. in this failure https://neon-github-public-dev.s3.amazonaws.com/reports/main/14513210240/index.html#suites/13d8e764c394daadbad415a08454c04e/b0f92a86b2ed309f/) In this case, it was failing because of a missing auth token, because it was using `requests` directly instead of using the endpoint http client type. 
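For illustration, the gist of the fix is to go through the endpoint's HTTP client wrapper (which attaches the auth token and raises on error status) rather than calling `requests` by hand; the exact change is in the diff below. A rough sketch of what the wrapper buys us (hypothetical helper, not the fixture's actual code):

```python
import requests


def fetch_metrics_json(base_url: str, auth_token: str | None = None) -> dict:
    # Mirror what the endpoint http client does for us: send the Authorization
    # header when a token is configured, and fail loudly on non-2xx responses
    # instead of leaving the failure to surface later as a confusing KeyError.
    headers = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}
    resp = requests.get(f"{base_url}/metrics.json", headers=headers)
    resp.raise_for_status()
    return resp.json()
```
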
## Summary of changes - Use endpoint http wrapper to get raise_for_status & auth token --- test_runner/performance/test_compute_startup.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test_runner/performance/test_compute_startup.py b/test_runner/performance/test_compute_startup.py index abedb4be27..5c36982c93 100644 --- a/test_runner/performance/test_compute_startup.py +++ b/test_runner/performance/test_compute_startup.py @@ -3,7 +3,6 @@ from __future__ import annotations from typing import TYPE_CHECKING import pytest -import requests from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker if TYPE_CHECKING: @@ -68,9 +67,7 @@ def test_compute_startup_simple( endpoint.safe_psql("select 1;") # Get metrics - metrics = requests.get( - f"http://localhost:{endpoint.external_http_port}/metrics.json" - ).json() + metrics = endpoint.http_client().metrics_json() durations = { "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", @@ -155,9 +152,7 @@ def test_compute_ondemand_slru_startup( assert sum == 1000000 # Get metrics - metrics = requests.get( - f"http://localhost:{endpoint.external_http_port}/metrics.json" - ).json() + metrics = endpoint.http_client().metrics_json() durations = { "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", From 2b041964b3648ea9b1453474180535ceb5ebae19 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 17 Apr 2025 17:53:10 +0200 Subject: [PATCH 33/55] cover direct IO + concurrent IO in unit, regression & perf tests (#11585) This mirrors the production config. Thread that discusses the merits of this: - https://neondb.slack.com/archives/C033RQ5SPDH/p1744742010740569 # Refs - context https://neondb.slack.com/archives/C04BLQ4LW7K/p1744724844844589?thread_ts=1744705831.014169&cid=C04BLQ4LW7K - prep for https://github.com/neondatabase/neon/pull/11558 which adds new io mode `direct-rw` # Impact on CI turnaround time Spot-checking impact on CI timings - Baseline: [some recent main commit](https://github.com/neondatabase/neon/actions/runs/14471549758/job/40587837475) - Comparison: [this commit](https://github.com/neondatabase/neon/actions/runs/14471945087/job/40589613274) in this PR here Impact on CI turnaround time - Regression tests: - x64: very minor, sometimes better; likely in the noise - arm64: substantial 30min => 40min - Benchmarks (x86 only I think): very minor; noise seems higher than regress tests --------- Signed-off-by: Alex Chi Z Co-authored-by: Alex Chi Z. 
<4198311+skyzh@users.noreply.github.com> Co-authored-by: Peter Bendel Co-authored-by: Alex Chi Z --- .github/workflows/_build-and-test-locally.yml | 12 +++++--- .github/workflows/build_and_test.yml | 2 ++ Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 30 +++++++++++++++++-- pageserver/src/virtual_file.rs | 3 +- test_runner/regress/test_compaction.py | 4 ++- 7 files changed, 45 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 8b1314f95b..318e69d8a7 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -272,10 +272,13 @@ jobs: # run pageserver tests with different settings for get_vectored_concurrent_io in sequential sidecar-task; do for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ - ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + for io_mode in buffered direct direct-rw ; do + NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done done @@ -392,6 +395,7 @@ jobs: BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task + PAGESERVER_VIRTUAL_FILE_IO_MODE: direct USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 80c4511b36..e875cb327f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -323,6 +323,8 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task + PAGESERVER_VIRTUAL_FILE_IO_MODE: direct SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/Cargo.lock b/Cargo.lock index 7ab9378853..870401e7f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4352,6 +4352,7 @@ dependencies = [ "humantime-serde", "itertools 0.10.5", "nix 0.27.1", + "once_cell", "postgres_backend", "postgres_ffi", "rand 0.8.5", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 688e9de6e7..25f29b8ecd 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -35,6 +35,7 @@ nix = {workspace = true, optional = true} reqwest.workspace = true rand.workspace = true tracing-utils.workspace = true +once_cell.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ea5456e04b..e367db614f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1817,8 +1817,34 @@ pub mod virtual_file { } impl IoMode { - pub const fn preferred() -> Self { - Self::Buffered + pub fn preferred() -> Self { + // The default 
behavior when running Rust unit tests without any further + // flags is to use the newest behavior if available on the platform (Direct). + // The CI uses the following environment variable to unit tests for all + // different modes. + // NB: the Python regression & perf tests have their own defaults management + // that writes pageserver.toml; they do not use this variable. + if cfg!(test) { + use once_cell::sync::Lazy; + static CACHED: Lazy = Lazy::new(|| { + utils::env::var_serde_json_string( + "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE", + ) + .unwrap_or({ + #[cfg(target_os = "linux")] + { + IoMode::Direct + } + #[cfg(not(target_os = "linux"))] + { + IoMode::Buffered + } + }) + }); + *CACHED + } else { + IoMode::Buffered + } } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index cd3d897423..45cd0f469b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1366,7 +1366,8 @@ pub(crate) type IoBuffer = AlignedBuffer = AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>; -static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); +static IO_MODE: once_cell::sync::Lazy = + once_cell::sync::Lazy::new(|| AtomicU8::new(IoMode::preferred() as u8)); pub(crate) fn set_io_mode(mode: IoMode) { IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 001ddcdcb0..53edf9f79e 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -72,6 +72,7 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = { "wal_receiver_protocol", [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], ) +@pytest.mark.timeout(900) def test_pageserver_compaction_smoke( neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol, @@ -190,6 +191,7 @@ def test_pageserver_compaction_preempt( @skip_in_debug_build("only run with release build") +@pytest.mark.timeout(600) def test_pageserver_gc_compaction_preempt( neon_env_builder: NeonEnvBuilder, ): @@ -227,7 +229,7 @@ def test_pageserver_gc_compaction_preempt( @skip_in_debug_build("only run with release build") -@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM +@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], From ad0c5fdae7f32821e28a890f0dc67b5c2f6fb609 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 17 Apr 2025 12:12:24 -0400 Subject: [PATCH 34/55] fix(test): allow stale generation warnings in storcon (#11624) ## Problem https://github.com/neondatabase/neon/pull/11531 did not fully fix the problem because the warning is part of the storcon instead of pageserver. ## Summary of changes Allow stale generation error in storcon. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_tenants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index c00f8f4ca5..d08692500f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -390,7 +390,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order. 
env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - env.pageserver.allowed_errors.append(".*due to stale generation.+") + env.storage_controller.allowed_errors.append(".*due to stale generation.*") # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, From 748539b222a24c64c33d116c9d5ce2c8ad5c958d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 17 Apr 2025 15:51:28 -0400 Subject: [PATCH 35/55] fix(pageserver): lower L0 compaction threshold (#11617) ## Problem We saw OOMs due to L0 compaction happening simultaneously for all shards of the same tenant right after the shard split. ## Summary of changes Lower the threshold so that we compact fewer files. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e734b07c38..b434696624 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -682,10 +682,10 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true; // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's - // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could - // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So - // with this config, we can get a maximum peak compaction usage of 9 GB. - pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; + // 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of + // DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak + // compaction usage of 15360MB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10; // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid // read amp. pub const DEFAULT_COMPACTION_L0_FIRST: bool = true; @@ -702,8 +702,11 @@ pub mod tenant_conf_defaults { // Relevant: https://github.com/neondatabase/neon/issues/3394 pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image - // layer creation will end immediately. Set to 0 to disable. + // Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure + // without looking at the exact number of L0 layers. + // It was expected to have the following behavior: + // > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image + // > layer creation will end immediately. Set to 0 to disable. pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; From 6c2e5c044ce12296494231f342ffb10ff90cc2bb Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 17 Apr 2025 21:59:35 +0200 Subject: [PATCH 36/55] random operations test (#10986) ## Problem We need to test the stability of Neon. ## Summary of changes The test runs random operations on a Neon project. 
It performs via the Public API calls the following operations: `create a branch`, `delete a branch`, `add a read-only endpoint`, `delete a read-only endpoint`, `restore a branch to a random position in the past`. All the branches and endpoints are loaded with `pgbench`. --------- Co-authored-by: Peter Bendel Co-authored-by: Alexander Bayandin --- .github/workflows/random-ops-test.yml | 93 +++++ test_runner/fixtures/neon_api.py | 147 ++++++- test_runner/fixtures/neon_fixtures.py | 5 +- test_runner/random_ops/README.md | 93 +++++ test_runner/random_ops/test_random_ops.py | 463 ++++++++++++++++++++++ 5 files changed, 795 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/random-ops-test.yml create mode 100644 test_runner/random_ops/README.md create mode 100644 test_runner/random_ops/test_random_ops.py diff --git a/.github/workflows/random-ops-test.yml b/.github/workflows/random-ops-test.yml new file mode 100644 index 0000000000..7c19537744 --- /dev/null +++ b/.github/workflows/random-ops-test.yml @@ -0,0 +1,93 @@ +name: Random Operations Test + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 */2 * * *' # runs every 2 hours + workflow_dispatch: + inputs: + random_seed: + type: number + description: 'The random seed' + required: false + default: 0 + num_operations: + type: number + description: "The number of operations to test" + default: 250 + +defaults: + run: + shell: bash -euxo pipefail {0} + +permissions: {} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + run-random-rests: + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + runs-on: small + permissions: + id-token: write + statuses: write + + strategy: + fail-fast: false + matrix: + pg-version: [16, 17] + + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: random_ops + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ matrix.pg-version }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + RANDOM_SEED: ${{ inputs.random_seed }} + NUM_OPERATIONS: ${{ inputs.num_operations }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 
df80f0683c..0cf5945458 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -22,19 +22,62 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]: } +# Some API calls not yet implemented. +# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305 class NeonAPI: def __init__(self, neon_api_key: str, neon_api_base_url: str): self.__neon_api_key = neon_api_key self.__neon_api_base_url = neon_api_base_url.strip("/") + self.retry_if_possible = False + self.attempts = 10 + self.sleep_before_retry = 1 + self.retries524 = 0 + self.retries4xx = 0 def __request(self, method: str | bytes, endpoint: str, **kwargs: Any) -> requests.Response: - if "headers" not in kwargs: - kwargs["headers"] = {} + kwargs["headers"] = kwargs.get("headers", {}) kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" - resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) - log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text) - resp.raise_for_status() + for attempt in range(self.attempts): + retry = False + resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + if resp.status_code >= 400: + log.error( + "%s %s returned a %d: %s", + method, + endpoint, + resp.status_code, + resp.text if resp.status_code != 524 else "CloudFlare error page", + ) + else: + log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text) + if not self.retry_if_possible: + resp.raise_for_status() + break + elif resp.status_code >= 400: + if resp.status_code == 422: + if resp.json()["message"] == "branch not ready yet": + retry = True + self.retries4xx += 1 + elif resp.status_code == 423 and resp.json()["message"] in { + "endpoint is in some transitive state, could not suspend", + "project already has running conflicting operations, scheduling of new ones is prohibited", + }: + retry = True + self.retries4xx += 1 + elif resp.status_code == 524: + log.info("The request was timed out, trying to get operations") + retry = True + self.retries524 += 1 + if retry: + log.info("Retrying, attempt %s/%s", attempt + 1, self.attempts) + time.sleep(self.sleep_before_retry) + continue + else: + resp.raise_for_status() + break + else: + raise RuntimeError("Max retry count is reached") return resp @@ -101,6 +144,96 @@ class NeonAPI: return cast("dict[str, Any]", resp.json()) + def create_branch( + self, + project_id: str, + branch_name: str | None = None, + parent_id: str | None = None, + parent_lsn: str | None = None, + parent_timestamp: str | None = None, + protected: bool | None = None, + archived: bool | None = None, + init_source: str | None = None, + add_endpoint=True, + ) -> dict[str, Any]: + data: dict[str, Any] = {} + if add_endpoint: + data["endpoints"] = [{"type": "read_write"}] + data["branch"] = {} + if parent_id: + data["branch"]["parent_id"] = parent_id + if branch_name: + data["branch"]["name"] = branch_name + if parent_lsn is not None: + data["branch"]["parent_lsn"] = parent_lsn + if parent_timestamp is not None: + data["branch"]["parent_timestamp"] = parent_timestamp + if protected is not None: + data["branch"]["protected"] = protected + if init_source is not None: + data["branch"]["init_source"] = init_source + if archived is not None: + data["branch"]["archived"] = archived + if not data["branch"]: + data.pop("branch") + resp = self.__request( + "POST", + f"/projects/{project_id}/branches", + headers={ + "Accept": 
"application/json", + "Content-Type": "application/json", + }, + json=data, + ) + return cast("dict[str, Any]", resp.json()) + + def get_branch_details(self, project_id: str, branch_id: str) -> dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/branches/{branch_id}", + headers={ + "Accept": "application/json", + }, + ) + return cast("dict[str, Any]", resp.json()) + + def delete_branch(self, project_id: str, branch_id: str) -> dict[str, Any]: + resp = self.__request( + "DELETE", + f"/projects/{project_id}/branches/{branch_id}", + headers={ + "Accept": "application/json", + }, + ) + return cast("dict[str, Any]", resp.json()) + + def restore_branch( + self, + project_id: str, + branch_id: str, + source_branch_id: str, + source_lsn: str | None, + source_timestamp: str | None, + preserve_under_name: str | None, + ): + data = {"source_branch_id": source_branch_id} + if source_lsn: + data["source_lsn"] = source_lsn + if source_timestamp: + data["source_timestamp"] = source_timestamp + if preserve_under_name: + data["preserve_under_name"] = preserve_under_name + log.info("Data: %s", data) + resp = self.__request( + "POST", + f"/projects/{project_id}/branches/{branch_id}/restore", + headers={ + "Accept": "application/json", + }, + json=data, + ) + return cast("dict[str, Any]", resp.json()) + def start_endpoint( self, project_id: str, @@ -176,6 +309,10 @@ class NeonAPI: return cast("dict[str, Any]", resp.json()) + def delete_endpoint(self, project_id: str, endpoint_id: str) -> dict[str, Any]: + resp = self.__request("DELETE", f"/projects/{project_id}/endpoints/{endpoint_id}") + return cast("dict[str,Any]", resp.json()) + def get_connection_uri( self, project_id: str, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e70ddc8e66..db2b68d082 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3185,6 +3185,7 @@ class PgBin: command: list[str], env: Env | None = None, cwd: str | Path | None = None, + stderr_pipe: Any | None = None, ) -> subprocess.Popen[Any]: """ Run one of the postgres binaries, not waiting for it to finish @@ -3202,7 +3203,9 @@ class PgBin: log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) self._log_env(env) - return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True) + return subprocess.Popen( + command, env=env, cwd=cwd, stdout=subprocess.PIPE, stderr=stderr_pipe, text=True + ) def run( self, diff --git a/test_runner/random_ops/README.md b/test_runner/random_ops/README.md new file mode 100644 index 0000000000..45163422a8 --- /dev/null +++ b/test_runner/random_ops/README.md @@ -0,0 +1,93 @@ +# Random Operations Test for Neon Stability + +## Problem Statement + +Neon needs robust testing of Neon's stability to ensure reliability for users. The random operations test addresses this by continuously exercising the API with unpredictable sequences of operations, helping to identify edge cases and potential issues that might not be caught by deterministic tests. + +### Key Components + +#### 1. 
Class Structure + +The test implements three main classes to model the Neon architecture: + +- **NeonProject**: Represents a Neon project and manages the lifecycle of branches and endpoints +- **NeonBranch**: Represents a branch within a project, with methods for creating child branches, endpoints, and performing point-in-time restores +- **NeonEndpoint**: Represents an endpoint (connection point) for a branch, with methods for managing benchmarks + +#### 2. Operations Tested + +The test randomly performs the following operations with weighted probabilities: + +- **Creating branches** +- **Deleting branches** +- **Adding read-only endpoints** +- **Deleting read-only endpoints** +- **Restoring branches to random points in time** + +#### 3. Load Generation + +Each branch and endpoint is loaded with `pgbench` to simulate real database workloads during testing. This ensures that the operations are performed against branches with actual data and ongoing transactions. + +#### 4. Error Handling + +The test includes robust error handling for various scenarios: +- Branch limit exceeded +- Connection timeouts +- Control plane timeouts (HTTP 524 errors) +- Benchmark failures + +#### 5. CI Integration + +The test is integrated into the CI pipeline via a GitHub workflow that runs daily, ensuring continuous validation of API stability. + +## How It Works + +1. The test creates a Neon project using the Public API +2. It initializes the main branch with pgbench data +3. It performs random operations according to the weighted probabilities +4. During each operation, it checks that all running benchmarks are still operational +5. The test cleans up by deleting the project at the end + +## Configuration + +The test can be configured with: +- `RANDOM_SEED`: Set a specific random seed for reproducible test runs +- `NEON_API_KEY`: API key for authentication +- `NEON_API_BASE_URL`: Base URL for the API (defaults to staging environment) +- `NUM_OPERATIONS`: The number of operations to be performed + +## Running the Test + +The test is designed to run in the CI environment but can also be executed locally: + +```bash +NEON_API_KEY=your_api_key ./scripts/pytest test_runner/random_ops/test_random_ops.py -m remote_cluster +``` + +To run with a specific random seed for reproducibility: + +```bash +RANDOM_SEED=12345 NEON_API_KEY=your_api_key ./scripts/pytest test_runner/random_ops/test_random_ops.py -m remote_cluster +``` + +To run with the custom number of operations: + +```bash +NUM_OPERATIONS=500 NEON_API_KEY=your_api_key ./scripts/pytest test_runner/random_ops/test_random_ops.py -m remote_cluster +``` + +## Benefits + +This test provides several key benefits: +1. **Comprehensive API testing**: Exercises multiple API endpoints in combination +2. **Edge case discovery**: Random sequences may uncover issues not found in deterministic tests +3. **Stability validation**: Continuous execution helps ensure long-term API reliability +4. **Regression prevention**: Detects if new changes break existing API functionality + +## Future Improvements + +Potential enhancements to the test could include: +1. Adding more API operations, e.g. `reset_to_parent`, `snapshot`, etc +2. Implementing more sophisticated load patterns +3. Adding metrics collection to measure API performance +4. 
Extending test duration for longer-term stability validation \ No newline at end of file diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py new file mode 100644 index 0000000000..b3078ecac1 --- /dev/null +++ b/test_runner/random_ops/test_random_ops.py @@ -0,0 +1,463 @@ +""" +Run the random API tests on the cloud instance of Neon +""" + +from __future__ import annotations + +import os +import random +import subprocess +import time +from datetime import UTC, datetime, timedelta +from typing import TYPE_CHECKING, Any + +import pytest +from fixtures.log_helper import log +from requests import HTTPError + +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + from fixtures.pg_version import PgVersion + + +class NeonEndpoint: + """ + Neon Endpoint + Gets the output of the API call of an endpoint creation + """ + + def __init__(self, project: NeonProject, endpoint: dict[str, Any]): + self.project: NeonProject = project + self.id: str = endpoint["id"] + # The branch endpoint belongs to + self.branch: NeonBranch = project.branches[endpoint["branch_id"]] + self.type: str = endpoint["type"] + # add itself to the list of endpoints of the branch + self.branch.endpoints[self.id] = self + self.project.endpoints[self.id] = self + self.host: str = endpoint["host"] + self.benchmark: subprocess.Popen[Any] | None = None + # The connection environment is used when running benchmark + self.connect_env: dict[str, str] | None = None + if self.branch.connect_env: + self.connect_env = self.branch.connect_env.copy() + self.connect_env["PGHOST"] = self.host + + def delete(self): + self.project.delete_endpoint(self.id) + + def start_benchmark(self, clients=10): + return self.project.start_benchmark(self.id, clients=clients) + + def check_benchmark(self): + self.project.check_benchmark(self.id) + + def terminate_benchmark(self): + self.project.terminate_benchmark(self.id) + + +class NeonBranch: + """ + Neon Branch + Gets the output of the API call of the Neon Public API call of a branch creation as a first parameter + is_reset defines if the branch is a reset one i.e. 
created as a result of the reset API Call + """ + + def __init__(self, project, branch: dict[str, Any], is_reset=False): + self.id: str = branch["branch"]["id"] + self.desc = branch + self.project: NeonProject = project + self.neon_api: NeonAPI = project.neon_api + self.project_id: str = branch["branch"]["project_id"] + self.parent: NeonBranch | None = ( + self.project.branches[branch["branch"]["parent_id"]] + if "parent_id" in branch["branch"] + else None + ) + if is_reset: + self.project.reset_branches.add(self.id) + elif self.parent: + self.project.leaf_branches[self.id] = self + if self.parent is not None and self.parent.id in self.project.leaf_branches: + self.project.leaf_branches.pop(self.parent.id) + self.project.branches[self.id] = self + self.children: dict[str, NeonBranch] = {} + if self.parent is not None: + self.parent.children[self.id] = self + self.endpoints: dict[str, NeonEndpoint] = {} + self.connection_parameters: dict[str, str] | None = ( + branch["connection_uris"][0]["connection_parameters"] + if "connection_uris" in branch + else None + ) + self.benchmark: subprocess.Popen[Any] | None = None + self.updated_at: datetime = datetime.fromisoformat(branch["branch"]["updated_at"]) + self.connect_env: dict[str, str] | None = None + if self.connection_parameters: + self.connect_env = { + "PGHOST": self.connection_parameters["host"], + "PGUSER": self.connection_parameters["role"], + "PGDATABASE": self.connection_parameters["database"], + "PGPASSWORD": self.connection_parameters["password"], + "PGSSLMODE": "require", + } + + def __str__(self): + """ + Prints the branch's name with all the predecessors + (r) means the branch is a reset one + """ + return f"{self.id}{'(r)' if self.id in self.project.reset_branches else ''}, parent: {self.parent}" + + def create_child_branch(self) -> NeonBranch | None: + return self.project.create_branch(self.id) + + def create_ro_endpoint(self) -> NeonEndpoint: + return NeonEndpoint( + self.project, + self.neon_api.create_endpoint(self.project_id, self.id, "read_only", {})["endpoint"], + ) + + def delete(self) -> None: + self.project.delete_branch(self.id) + + def start_benchmark(self, clients=10) -> subprocess.Popen[Any]: + return self.project.start_benchmark(self.id, clients=clients) + + def check_benchmark(self) -> None: + self.project.check_benchmark(self.id) + + def terminate_benchmark(self) -> None: + self.project.terminate_benchmark(self.id) + + def restore_random_time(self) -> None: + """ + Does PITR, i.e. 
calls the reset API call on the same branch to the random time in the past + """ + min_time = self.updated_at + timedelta(seconds=1) + max_time = datetime.now(UTC) - timedelta(seconds=1) + target_time = (min_time + (max_time - min_time) * random.random()).replace(microsecond=0) + res = self.restore( + self.id, + source_timestamp=target_time.isoformat().replace("+00:00", "Z"), + preserve_under_name=self.project.gen_restore_name(), + ) + if res is None: + return + self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"]) + parent_id: str = res["branch"]["parent_id"] + # Creates an object for the parent branch + # After the reset operation a new parent branch is created + parent = NeonBranch( + self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True + ) + self.project.branches[parent_id] = parent + self.parent = parent + parent.children[self.id] = self + self.project.wait() + + def restore( + self, + source_branch_id: str, + source_lsn: str | None = None, + source_timestamp: str | None = None, + preserve_under_name: str | None = None, + ) -> dict[str, Any] | None: + endpoints = [ep for ep in self.endpoints.values() if ep.type == "read_only"] + # Terminate all the benchmarks running to prevent errors. Errors in benchmark during pgbench are expected + for ep in endpoints: + ep.terminate_benchmark() + self.terminate_benchmark() + try: + res: dict[str, Any] = self.neon_api.restore_branch( + self.project_id, + self.id, + source_branch_id, + source_lsn, + source_timestamp, + preserve_under_name, + ) + except HTTPError as he: + if ( + he.response.status_code == 422 + and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED" + ): + log.info("Branch limit exceeded, skipping") + return None + else: + raise HTTPError(he) from he + self.project.wait() + self.start_benchmark() + for ep in endpoints: + ep.start_benchmark() + return res + + +class NeonProject: + """ + The project object + Calls the Public API to create a Neon Project + """ + + def __init__(self, neon_api: NeonAPI, pg_bin: PgBin, pg_version: PgVersion): + self.neon_api = neon_api + self.pg_bin = pg_bin + proj = self.neon_api.create_project( + pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}" + ) + self.id: str = proj["project"]["id"] + self.name: str = proj["project"]["name"] + self.connection_uri: str = proj["connection_uris"][0]["connection_uri"] + self.connection_parameters: dict[str, str] = proj["connection_uris"][0][ + "connection_parameters" + ] + self.pg_version: PgVersion = pg_version + # Leaf branches are the branches, which do not have children + self.leaf_branches: dict[str, NeonBranch] = {} + self.branches: dict[str, NeonBranch] = {} + self.reset_branches: set[str] = set() + self.main_branch: NeonBranch = NeonBranch(self, proj) + self.main_branch.connection_parameters = self.connection_parameters + self.endpoints: dict[str, NeonEndpoint] = {} + for endpoint in proj["endpoints"]: + NeonEndpoint(self, endpoint) + self.neon_api.wait_for_operation_to_finish(self.id) + self.benchmarks: dict[str, subprocess.Popen[Any]] = {} + self.restore_num: int = 0 + self.restart_pgbench_on_console_errors: bool = False + + def delete(self): + self.neon_api.delete_project(self.id) + + def create_branch(self, parent_id: str | None = None) -> NeonBranch | None: + self.wait() + try: + branch_def = self.neon_api.create_branch(self.id, parent_id=parent_id) + except HTTPError as he: + if ( + he.response.status_code == 422 + and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED" + ): + 
log.info("Branch limit exceeded, skipping") + return None + else: + raise HTTPError(he) from he + new_branch = NeonBranch(self, branch_def) + self.wait() + return new_branch + + def delete_branch(self, branch_id: str) -> None: + parent = self.branches[branch_id].parent + if not parent or branch_id == self.main_branch.id: + raise RuntimeError("Cannot delete the main branch") + if branch_id not in self.leaf_branches and branch_id not in self.reset_branches: + raise RuntimeError(f"The branch {branch_id}, probably, has ancestors") + if branch_id not in self.branches: + raise RuntimeError(f"The branch with id {branch_id} is not found") + endpoints_to_delete = [ + ep for ep in self.branches[branch_id].endpoints.values() if ep.type == "read_only" + ] + for ep in endpoints_to_delete: + ep.delete() + if branch_id not in self.reset_branches: + self.terminate_benchmark(branch_id) + self.neon_api.delete_branch(self.id, branch_id) + if len(parent.children) == 1 and parent.id != self.main_branch.id: + self.leaf_branches[parent.id] = parent + parent.children.pop(branch_id) + if branch_id in self.leaf_branches: + self.leaf_branches.pop(branch_id) + else: + self.reset_branches.remove(branch_id) + self.branches.pop(branch_id) + self.wait() + if parent.id in self.reset_branches: + parent.delete() + + def delete_endpoint(self, endpoint_id: str) -> None: + self.terminate_benchmark(endpoint_id) + self.neon_api.delete_endpoint(self.id, endpoint_id) + self.endpoints[endpoint_id].branch.endpoints.pop(endpoint_id) + self.endpoints.pop(endpoint_id) + self.wait() + + def start_benchmark(self, target: str, clients: int = 10) -> subprocess.Popen[Any]: + if target in self.benchmarks: + raise RuntimeError(f"Benchmark was already started for {target}") + is_endpoint = target.startswith("ep") + read_only = is_endpoint and self.endpoints[target].type == "read_only" + cmd = ["pgbench", f"-c{clients}", "-T10800", "-Mprepared"] + if read_only: + cmd.extend(["-S", "-n"]) + target_object = self.endpoints[target] if is_endpoint else self.branches[target] + if target_object.connect_env is None: + raise RuntimeError(f"The connection environment is not defined for {target}") + log.info( + "running pgbench on %s, cmd: %s, host: %s", + target, + cmd, + target_object.connect_env["PGHOST"], + ) + pgbench = self.pg_bin.run_nonblocking( + cmd, env=target_object.connect_env, stderr_pipe=subprocess.PIPE + ) + self.benchmarks[target] = pgbench + target_object.benchmark = pgbench + time.sleep(2) + return pgbench + + def check_all_benchmarks(self) -> None: + for target in tuple(self.benchmarks.keys()): + self.check_benchmark(target) + + def check_benchmark(self, target) -> None: + rc = self.benchmarks[target].poll() + if rc is not None: + _, err = self.benchmarks[target].communicate() + log.error("STDERR: %s", err) + # if the benchmark failed due to irresponsible Control plane, + # just restart it + if self.restart_pgbench_on_console_errors and ( + "ERROR: Couldn't connect to compute node" in err + or "ERROR: Console request failed" in err + ): + log.info("Restarting benchmark for %s", target) + self.benchmarks.pop(target) + self.start_benchmark(target) + return + raise RuntimeError(f"The benchmark for {target} ended with code {rc}") + + def terminate_benchmark(self, target): + log.info("Terminating the benchmark %s", target) + target_endpoint = target.startswith("ep") + self.check_benchmark(target) + self.benchmarks[target].terminate() + self.benchmarks.pop(target) + if target_endpoint: + self.endpoints[target].benchmark = None + else: + 
self.branches[target].benchmark = None + + def wait(self): + """ + Wait for all the operations to be finished + """ + return self.neon_api.wait_for_operation_to_finish(self.id) + + def gen_restore_name(self): + self.restore_num += 1 + return f"restore{self.restore_num}" + + +@pytest.fixture() +def setup_class( + pg_version: PgVersion, + pg_bin: PgBin, + neon_api: NeonAPI, +): + neon_api.retry_if_possible = True + project = NeonProject(neon_api, pg_bin, pg_version) + log.info("Created a project with id %s, name %s", project.id, project.name) + yield pg_bin, project + log.info("Retried 524 errors: %s", neon_api.retries524) + log.info("Retried 4xx errors: %s", neon_api.retries4xx) + if neon_api.retries524 > 0: + print(f"::warning::Retried on 524 error {neon_api.retries524} times") + if neon_api.retries4xx > 0: + print(f"::warning::Retried on 4xx error {neon_api.retries4xx} times") + log.info("Removing the project") + project.delete() + + +def do_action(project: NeonProject, action: str) -> None: + """ + Runs the action + """ + log.info("Action: %s", action) + if action == "new_branch": + log.info("Trying to create a new branch") + parent = project.branches[ + random.choice(list(set(project.branches.keys()) - project.reset_branches)) + ] + log.info("Parent: %s", parent) + child = parent.create_child_branch() + if child is None: + return + log.info("Created branch %s", child) + child.start_benchmark() + elif action == "delete_branch": + if project.leaf_branches: + target = random.choice(list(project.leaf_branches.values())) + log.info("Trying to delete branch %s", target) + target.delete() + else: + log.info("Leaf branches not found, skipping") + elif action == "new_ro_endpoint": + ep = random.choice( + [br for br in project.branches.values() if br.id not in project.reset_branches] + ).create_ro_endpoint() + log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id) + ep.start_benchmark() + elif action == "delete_ro_endpoint": + ro_endpoints: list[NeonEndpoint] = [ + endpoint for endpoint in project.endpoints.values() if endpoint.type == "read_only" + ] + if ro_endpoints: + target_ep: NeonEndpoint = random.choice(ro_endpoints) + target_ep.delete() + log.info("endpoint %s deleted", target_ep.id) + else: + log.info("no read_only endpoints present, skipping") + elif action == "restore_random_time": + if project.leaf_branches: + br: NeonBranch = random.choice(list(project.leaf_branches.values())) + log.info("Restore %s", br) + br.restore_random_time() + else: + log.info("No leaf branches found") + else: + raise ValueError(f"The action {action} is unknown") + + +@pytest.mark.timeout(7200) +@pytest.mark.remote_cluster +def test_api_random( + setup_class, + pg_distrib_dir: Path, + test_output_dir: Path, +): + """ + Run the random API tests + """ + if seed_env := os.getenv("RANDOM_SEED"): + seed = int(seed_env) + else: + seed = 0 + if seed == 0: + seed = int(time.time()) + log.info("Using random seed: %s", seed) + random.seed(seed) + pg_bin, project = setup_class + # Here we can assign weights + ACTIONS = ( + ("new_branch", 1.5), + ("new_ro_endpoint", 1.4), + ("delete_ro_endpoint", 0.8), + ("delete_branch", 1.0), + ("restore_random_time", 1.2), + ) + if num_ops_env := os.getenv("NUM_OPERATIONS"): + num_operations = int(num_ops_env) + else: + num_operations = 250 + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=project.main_branch.connect_env) + for _ in range(num_operations): + log.info("Starting action #%s", _ + 1) + do_action( + project, random.choices([a[0] for a in 
ACTIONS], weights=[w[1] for w in ACTIONS])[0] + ) + project.check_all_benchmarks() + assert True From c1e4befd561c594bd64818508128203684b54423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 17 Apr 2025 22:25:30 +0200 Subject: [PATCH 37/55] Additional fixes and improvements to storcon safekeeper timelines (#11477) This delivers some additional fixes and improvements to storcon managed safekeeper timelines: * use `i32::MAX` for the generation number of timeline deletion * start the generation for new timelines at 1 instead of 0: this ensures that the other components actually are generation enabled * fix database operations we use for metrics * use join in list_pending_ops to prevent the classical ORM issue where one does many db queries * use enums in `test_storcon_create_delete_sk_down`. we are adding a second parameter, and having two bool parameters is weird. * extend `test_storcon_create_delete_sk_down` with a test of whole tenant deletion. this hasn't been tested before. * remove some redundant logging contexts * Don't require mutable access to the service lock for scheduling pending ops in memory. In order to pull this off, create reconcilers eagerly. The advantage is that we don't need mutable access to the service lock that way any more. Part of #9011 --------- Co-authored-by: Arseny Sher --- storage_controller/src/persistence.rs | 43 +++++++--- storage_controller/src/service.rs | 6 +- .../src/service/safekeeper_reconciler.rs | 81 ++++++++++--------- .../src/service/safekeeper_service.rs | 30 ++++--- .../regress/test_storage_controller.py | 48 ++++++++--- 5 files changed, 139 insertions(+), 69 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index d25448718f..a413bba3c9 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -126,6 +126,7 @@ pub(crate) enum DatabaseOperation { InsertTimelineReconcile, RemoveTimelineReconcile, ListTimelineReconcile, + ListTimelineReconcileStartup, } #[must_use] @@ -1521,23 +1522,41 @@ impl Persistence { .await } - /// Load pending operations from db. - pub(crate) async fn list_pending_ops( + /// Load pending operations from db, joined together with timeline data. 
+ pub(crate) async fn list_pending_ops_with_timelines( &self, - ) -> DatabaseResult> { + ) -> DatabaseResult)>> { use crate::schema::safekeeper_timeline_pending_ops::dsl; + use crate::schema::timelines; let timeline_from_db = self - .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { - Box::pin(async move { - let from_db: Vec = - dsl::safekeeper_timeline_pending_ops.load(conn).await?; - Ok(from_db) - }) - }) + .with_measured_conn( + DatabaseOperation::ListTimelineReconcileStartup, + move |conn| { + Box::pin(async move { + let from_db: Vec<(TimelinePendingOpPersistence, Option)> = + dsl::safekeeper_timeline_pending_ops + .left_join( + timelines::table.on(timelines::tenant_id + .eq(dsl::tenant_id) + .and(timelines::timeline_id.eq(dsl::timeline_id))), + ) + .select(( + TimelinePendingOpPersistence::as_select(), + Option::::as_select(), + )) + .load(conn) + .await?; + Ok(from_db) + }) + }, + ) .await?; - Ok(timeline_from_db) + Ok(timeline_from_db + .into_iter() + .map(|(op, tl_opt)| (op, tl_opt.map(|tl_opt| tl_opt.into_persistence()))) + .collect()) } /// List pending operations for a given timeline (including tenant-global ones) pub(crate) async fn list_pending_ops_for_timeline( @@ -1580,7 +1599,7 @@ impl Persistence { let tenant_id = &tenant_id; let timeline_id = &timeline_id; - self.with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { + self.with_measured_conn(DatabaseOperation::RemoveTimelineReconcile, move |conn| { let timeline_id_str = timeline_id.map(|tid| tid.to_string()).unwrap_or_default(); Box::pin(async move { diesel::delete(dsl::safekeeper_timeline_pending_ops) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a021313474..860fc4f6ab 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -824,9 +824,13 @@ impl Service { let mut locked = self.inner.write().unwrap(); locked.become_leader(); + for (sk_id, _sk) in locked.safekeepers.clone().iter() { + locked.safekeeper_reconcilers.start_reconciler(*sk_id, self); + } + locked .safekeeper_reconcilers - .schedule_request_vec(self, sk_schedule_requests); + .schedule_request_vec(sk_schedule_requests); } // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 76e3162617..b15772a36c 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -30,31 +30,35 @@ impl SafekeeperReconcilers { reconcilers: HashMap::new(), } } - pub(crate) fn schedule_request_vec( - &mut self, - service: &Arc, - reqs: Vec, - ) { + /// Adds a safekeeper-specific reconciler. + /// Can be called multiple times, but it needs to be called at least once + /// for every new safekeeper added. + pub(crate) fn start_reconciler(&mut self, node_id: NodeId, service: &Arc) { + self.reconcilers.entry(node_id).or_insert_with(|| { + SafekeeperReconciler::spawn(self.cancel.child_token(), service.clone()) + }); + } + /// Stop a safekeeper-specific reconciler. + /// Stops the reconciler, cancelling all ongoing tasks. 
+ pub(crate) fn stop_reconciler(&mut self, node_id: NodeId) { + if let Some(handle) = self.reconcilers.remove(&node_id) { + handle.cancel.cancel(); + } + } + pub(crate) fn schedule_request_vec(&self, reqs: Vec) { tracing::info!( "Scheduling {} pending safekeeper ops loaded from db", reqs.len() ); for req in reqs { - self.schedule_request(service, req); + self.schedule_request(req); } } - pub(crate) fn schedule_request(&mut self, service: &Arc, req: ScheduleRequest) { + pub(crate) fn schedule_request(&self, req: ScheduleRequest) { let node_id = req.safekeeper.get_id(); - let reconciler_handle = self.reconcilers.entry(node_id).or_insert_with(|| { - SafekeeperReconciler::spawn(self.cancel.child_token(), service.clone()) - }); + let reconciler_handle = self.reconcilers.get(&node_id).unwrap(); reconciler_handle.schedule_reconcile(req); } - pub(crate) fn cancel_safekeeper(&mut self, node_id: NodeId) { - if let Some(handle) = self.reconcilers.remove(&node_id) { - handle.cancel.cancel(); - } - } /// Cancel ongoing reconciles for the given timeline /// /// Specifying `None` here only removes reconciles for the tenant-global reconciliation, @@ -78,9 +82,12 @@ pub(crate) async fn load_schedule_requests( service: &Arc, safekeepers: &HashMap, ) -> anyhow::Result> { - let pending_ops = service.persistence.list_pending_ops().await?; - let mut res = Vec::with_capacity(pending_ops.len()); - for op_persist in pending_ops { + let pending_ops_timelines = service + .persistence + .list_pending_ops_with_timelines() + .await?; + let mut res = Vec::with_capacity(pending_ops_timelines.len()); + for (op_persist, timeline_persist) in pending_ops_timelines { let node_id = NodeId(op_persist.sk_id as u64); let Some(sk) = safekeepers.get(&node_id) else { // This shouldn't happen, at least the safekeeper should exist as decomissioned. @@ -102,16 +109,12 @@ pub(crate) async fn load_schedule_requests( SafekeeperTimelineOpKind::Delete => Vec::new(), SafekeeperTimelineOpKind::Exclude => Vec::new(), SafekeeperTimelineOpKind::Pull => { - // TODO this code is super hacky, it doesn't take migrations into account - let Some(timeline_id) = timeline_id else { + if timeline_id.is_none() { + // We only do this extra check (outside of timeline_persist check) to give better error msgs anyhow::bail!( "timeline_id is empty for `pull` schedule request for {tenant_id}" ); }; - let timeline_persist = service - .persistence - .get_timeline(tenant_id, timeline_id) - .await?; let Some(timeline_persist) = timeline_persist else { // This shouldn't happen, the timeline should still exist tracing::warn!( @@ -163,6 +166,7 @@ pub(crate) struct ScheduleRequest { pub(crate) kind: SafekeeperTimelineOpKind, } +/// Handle to per safekeeper reconciler. struct ReconcilerHandle { tx: UnboundedSender<(ScheduleRequest, CancellationToken)>, ongoing_tokens: Arc), CancellationToken>>, @@ -170,7 +174,10 @@ struct ReconcilerHandle { } impl ReconcilerHandle { - /// Obtain a new token slot, cancelling any existing reconciliations for that timeline + /// Obtain a new token slot, cancelling any existing reconciliations for + /// that timeline. It is not useful to have >1 operation per , hence scheduling op cancels current one if it + /// exists. 
fn new_token_slot( &self, tenant_id: TenantId, @@ -305,15 +312,16 @@ impl SafekeeperReconciler { SafekeeperTimelineOpKind::Delete => { let tenant_id = req.tenant_id; if let Some(timeline_id) = req.timeline_id { - let deleted = self.reconcile_inner( - req, - async |client| client.delete_timeline(tenant_id, timeline_id).await, - |_resp| { - tracing::info!(%tenant_id, %timeline_id, "deleted timeline from {req_host}"); - }, - req_cancel, - ) - .await; + let deleted = self + .reconcile_inner( + req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; if deleted { self.delete_timeline_from_db(tenant_id, timeline_id).await; } @@ -344,12 +352,13 @@ impl SafekeeperReconciler { { Ok(list) => { if !list.is_empty() { - tracing::info!(%tenant_id, %timeline_id, "not deleting timeline from db as there is {} open reconciles", list.len()); + // duplicate the timeline_id here because it might be None in the reconcile context + tracing::info!(%timeline_id, "not deleting timeline from db as there is {} open reconciles", list.len()); return; } } Err(e) => { - tracing::warn!(%tenant_id, %timeline_id, "couldn't query pending ops: {e}"); + tracing::warn!(%timeline_id, "couldn't query pending ops: {e}"); return; } } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index a23b9a4a02..8a13c6af23 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -46,6 +46,7 @@ impl Service { .map(SecretString::from); let mut joinset = JoinSet::new(); + // Prepare membership::Configuration from choosen safekeepers. let safekeepers = { let locked = self.inner.read().unwrap(); locked.safekeepers.clone() @@ -205,7 +206,7 @@ impl Service { tenant_id: tenant_id.to_string(), timeline_id: timeline_id.to_string(), start_lsn: start_lsn.into(), - generation: 0, + generation: 1, sk_set: sks_persistence.clone(), new_sk_set: None, cplane_notified_generation: 0, @@ -254,7 +255,7 @@ impl Service { self.persistence.insert_pending_op(pending_op).await?; } if !remaining.is_empty() { - let mut locked = self.inner.write().unwrap(); + let locked = self.inner.read().unwrap(); for remaining_id in remaining { let Some(sk) = locked.safekeepers.get(&remaining_id) else { return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -290,7 +291,7 @@ impl Service { generation: timeline_persist.generation as u32, kind: crate::persistence::SafekeeperTimelineOpKind::Pull, }; - locked.safekeeper_reconcilers.schedule_request(self, req); + locked.safekeeper_reconcilers.schedule_request(req); } } @@ -357,7 +358,7 @@ impl Service { let pending_op = TimelinePendingOpPersistence { tenant_id: tenant_id.to_string(), timeline_id: timeline_id.to_string(), - generation: tl.generation, + generation: i32::MAX, op_kind: SafekeeperTimelineOpKind::Delete, sk_id: *sk_id, }; @@ -365,7 +366,7 @@ impl Service { self.persistence.insert_pending_op(pending_op).await?; } { - let mut locked = self.inner.write().unwrap(); + let locked = self.inner.read().unwrap(); for sk_id in all_sks { let sk_id = NodeId(*sk_id as u64); let Some(sk) = locked.safekeepers.get(&sk_id) else { @@ -383,7 +384,7 @@ impl Service { generation: tl.generation as u32, kind: SafekeeperTimelineOpKind::Delete, }; - locked.safekeeper_reconcilers.schedule_request(self, req); + locked.safekeeper_reconcilers.schedule_request(req); } } Ok(()) @@ -482,7 +483,7 @@ impl 
Service { tenant_id, timeline_id: None, }; - locked.safekeeper_reconcilers.schedule_request(self, req); + locked.safekeeper_reconcilers.schedule_request(req); } Ok(()) } @@ -579,7 +580,7 @@ impl Service { } pub(crate) async fn upsert_safekeeper( - &self, + self: &Arc, record: crate::persistence::SafekeeperUpsert, ) -> Result<(), ApiError> { let node_id = NodeId(record.id as u64); @@ -618,6 +619,9 @@ impl Service { ); } } + locked + .safekeeper_reconcilers + .start_reconciler(node_id, self); locked.safekeepers = Arc::new(safekeepers); metrics::METRICS_REGISTRY .metrics_group @@ -638,7 +642,7 @@ impl Service { } pub(crate) async fn set_safekeeper_scheduling_policy( - &self, + self: &Arc, id: i64, scheduling_policy: SkSchedulingPolicy, ) -> Result<(), DatabaseError> { @@ -656,9 +660,13 @@ impl Service { sk.set_scheduling_policy(scheduling_policy); match scheduling_policy { - SkSchedulingPolicy::Active => (), + SkSchedulingPolicy::Active => { + locked + .safekeeper_reconcilers + .start_reconciler(node_id, self); + } SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { - locked.safekeeper_reconcilers.cancel_safekeeper(node_id); + locked.safekeeper_reconcilers.stop_reconciler(node_id); } } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b2c8415e9a..26f745adb9 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -4114,13 +4114,29 @@ def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvB assert reconciles_after_restart == 0 +class RestartStorcon(Enum): + RESTART = "restart" + ONLINE = "online" + + +class DeletionSubject(Enum): + TIMELINE = "timeline" + TENANT = "tenant" + + @run_only_on_default_postgres("PG version is not interesting here") -@pytest.mark.parametrize("restart_storcon", [True, False]) -def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart_storcon: bool): +@pytest.mark.parametrize("restart_storcon", [RestartStorcon.RESTART, RestartStorcon.ONLINE]) +@pytest.mark.parametrize("deletetion_subject", [DeletionSubject.TENANT, DeletionSubject.TIMELINE]) +def test_storcon_create_delete_sk_down( + neon_env_builder: NeonEnvBuilder, + restart_storcon: RestartStorcon, + deletetion_subject: DeletionSubject, +): """ Test that the storcon can create and delete tenants and timelines with a safekeeper being down. - - restart_storcon: tests whether the pending ops are persisted. + - restart_storcon: tests that the pending ops are persisted. if we don't restart, we test that we don't require it to come from the db. + - deletion_subject: test that both single timeline and whole tenant deletion work. 
""" neon_env_builder.num_safekeepers = 3 @@ -4143,6 +4159,7 @@ def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.create_tenant(tenant_id, timeline_id) + child_timeline_id = env.create_branch("child_of_main", tenant_id) env.safekeepers[1].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}") env.safekeepers[2].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}") @@ -4155,7 +4172,7 @@ def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart ] ) - if restart_storcon: + if restart_storcon == RestartStorcon.RESTART: # Restart the storcon to check that we persist operations env.storage_controller.stop() env.storage_controller.start() @@ -4168,6 +4185,13 @@ def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + with env.endpoints.create( + "child_of_main", tenant_id=tenant_id, config_lines=config_lines + ) as ep: + # endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + env.storage_controller.assert_log_contains("writing pending op for sk id 1") env.safekeepers[0].start() @@ -4176,25 +4200,31 @@ def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart env.safekeepers[0].assert_log_contains( f"pulling timeline {tenant_id}/{timeline_id} from safekeeper" ) + env.safekeepers[0].assert_log_contains( + f"pulling timeline {tenant_id}/{child_timeline_id} from safekeeper" + ) wait_until(logged_contains_on_sk) env.safekeepers[1].stop() - env.storage_controller.pageserver_api().tenant_delete(tenant_id) + if deletetion_subject == DeletionSubject.TENANT: + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + else: + env.storage_controller.pageserver_api().timeline_delete(tenant_id, child_timeline_id) # ensure the safekeeper deleted the timeline def timeline_deleted_on_active_sks(): env.safekeepers[0].assert_log_contains( - f"deleting timeline {tenant_id}/{timeline_id} from disk" + f"deleting timeline {tenant_id}/{child_timeline_id} from disk" ) env.safekeepers[2].assert_log_contains( - f"deleting timeline {tenant_id}/{timeline_id} from disk" + f"deleting timeline {tenant_id}/{child_timeline_id} from disk" ) wait_until(timeline_deleted_on_active_sks) - if restart_storcon: + if restart_storcon == RestartStorcon.RESTART: # Restart the storcon to check that we persist operations env.storage_controller.stop() env.storage_controller.start() @@ -4204,7 +4234,7 @@ def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart # ensure that there is log msgs for the third safekeeper too def timeline_deleted_on_sk(): env.safekeepers[1].assert_log_contains( - f"deleting timeline {tenant_id}/{timeline_id} from disk" + f"deleting timeline {tenant_id}/{child_timeline_id} from disk" ) wait_until(timeline_deleted_on_sk) From 134d01c771ddd03674bdce36b55e79cca18232a6 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Thu, 17 Apr 2025 18:08:16 -0400 Subject: [PATCH 38/55] remove pg_anon.patch (#11636) This PR removes `pg_anon.patch` as the `anon` v1 extension has been removed and the patch is not being used anywhere --- compute/patches/pg_anon.patch | 265 ---------------------------------- 1 file changed, 265 deletions(-) 
delete mode 100644 compute/patches/pg_anon.patch diff --git a/compute/patches/pg_anon.patch b/compute/patches/pg_anon.patch deleted file mode 100644 index e2b4b292e4..0000000000 --- a/compute/patches/pg_anon.patch +++ /dev/null @@ -1,265 +0,0 @@ -commit 00aa659afc9c7336ab81036edec3017168aabf40 -Author: Heikki Linnakangas -Date: Tue Nov 12 16:59:19 2024 +0200 - - Temporarily disable test that depends on timezone - -diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out -index 23ef5fa..9e60deb 100644 ---- a/ext-src/pg_anon-src/tests/expected/generalization.out -+++ b/ext-src/pg_anon-src/tests/expected/generalization.out -@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century'); - ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST") - (1 row) - --SELECT anon.generalize_tstzrange('19041107','millennium'); -- generalize_tstzrange ------------------------------------------------------------------- -- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST") --(1 row) -- -+-- temporarily disabled, see: -+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485 -+--SELECT anon.generalize_tstzrange('19041107','millennium'); - -- generalize_daterange - SELECT anon.generalize_daterange('19041107'); - generalize_daterange -diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql -index b868344..b4fc977 100644 ---- a/ext-src/pg_anon-src/tests/sql/generalization.sql -+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql -@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month'); - SELECT anon.generalize_tstzrange('19041107','year'); - SELECT anon.generalize_tstzrange('19041107','decade'); - SELECT anon.generalize_tstzrange('19041107','century'); --SELECT anon.generalize_tstzrange('19041107','millennium'); -+-- temporarily disabled, see: -+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485 -+--SELECT anon.generalize_tstzrange('19041107','millennium'); - - -- generalize_daterange - SELECT anon.generalize_daterange('19041107'); - -commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f -Author: Alexey Masterov -Date: Fri May 31 06:34:26 2024 +0000 - - These alternative expected files were added to consider the neon features - -diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out -new file mode 100644 -index 0000000..2539cfd ---- /dev/null -+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out -@@ -0,0 +1,101 @@ -+BEGIN; -+CREATE EXTENSION anon CASCADE; -+NOTICE: installing required extension "pgcrypto" -+SELECT anon.init(); -+ init -+------ -+ t -+(1 row) -+ -+CREATE ROLE mallory_the_masked_user; -+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; -+CREATE TABLE t1(i INT); -+ALTER TABLE t1 ADD COLUMN t TEXT; -+SECURITY LABEL FOR anon ON COLUMN t1.t -+IS 'MASKED WITH VALUE NULL'; -+INSERT INTO t1 VALUES (1,'test'); -+-- -+-- We're checking the owner's permissions -+-- -+-- see -+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions -+-- -+SET ROLE mallory_the_masked_user; -+SELECT anon.pseudo_first_name(0) IS NOT NULL; -+ ?column? 
-+---------- -+ t -+(1 row) -+ -+-- SHOULD FAIL -+DO $$ -+BEGIN -+ PERFORM anon.init(); -+ EXCEPTION WHEN insufficient_privilege -+ THEN RAISE NOTICE 'insufficient_privilege'; -+END$$; -+NOTICE: insufficient_privilege -+-- SHOULD FAIL -+DO $$ -+BEGIN -+ PERFORM anon.anonymize_table('t1'); -+ EXCEPTION WHEN insufficient_privilege -+ THEN RAISE NOTICE 'insufficient_privilege'; -+END$$; -+NOTICE: insufficient_privilege -+-- SHOULD FAIL -+SAVEPOINT fail_start_engine; -+SELECT anon.start_dynamic_masking(); -+ERROR: Only supersusers can start the dynamic masking engine. -+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE -+ROLLBACK TO fail_start_engine; -+RESET ROLE; -+SELECT anon.start_dynamic_masking(); -+ start_dynamic_masking -+----------------------- -+ t -+(1 row) -+ -+SET ROLE mallory_the_masked_user; -+SELECT * FROM mask.t1; -+ i | t -+---+--- -+ 1 | -+(1 row) -+ -+-- SHOULD FAIL -+DO $$ -+BEGIN -+ SELECT * FROM public.t1; -+ EXCEPTION WHEN insufficient_privilege -+ THEN RAISE NOTICE 'insufficient_privilege'; -+END$$; -+NOTICE: insufficient_privilege -+-- SHOULD FAIL -+SAVEPOINT fail_stop_engine; -+SELECT anon.stop_dynamic_masking(); -+ERROR: Only supersusers can stop the dynamic masking engine. -+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE -+ROLLBACK TO fail_stop_engine; -+RESET ROLE; -+SELECT anon.stop_dynamic_masking(); -+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. -+ stop_dynamic_masking -+---------------------- -+ t -+(1 row) -+ -+SET ROLE mallory_the_masked_user; -+SELECT COUNT(*)=1 FROM anon.pg_masking_rules; -+ ?column? -+---------- -+ t -+(1 row) -+ -+-- SHOULD FAIL -+SAVEPOINT fail_seclabel_on_role; -+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; -+ERROR: permission denied -+DETAIL: The current user must have the CREATEROLE attribute. -+ROLLBACK TO fail_seclabel_on_role; -+ROLLBACK; -diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out -new file mode 100644 -index 0000000..8b090fe ---- /dev/null -+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out -@@ -0,0 +1,104 @@ -+BEGIN; -+CREATE EXTENSION anon CASCADE; -+NOTICE: installing required extension "pgcrypto" -+SELECT anon.init(); -+ init -+------ -+ t -+(1 row) -+ -+CREATE ROLE oscar_the_owner; -+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner; -+CREATE ROLE mallory_the_masked_user; -+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; -+-- -+-- We're checking the owner's permissions -+-- -+-- see -+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions -+-- -+SET ROLE oscar_the_owner; -+SELECT anon.pseudo_first_name(0) IS NOT NULL; -+ ?column? 
-+---------- -+ t -+(1 row) -+ -+-- SHOULD FAIL -+DO $$ -+BEGIN -+ PERFORM anon.init(); -+ EXCEPTION WHEN insufficient_privilege -+ THEN RAISE NOTICE 'insufficient_privilege'; -+END$$; -+NOTICE: insufficient_privilege -+CREATE TABLE t1(i INT); -+ALTER TABLE t1 ADD COLUMN t TEXT; -+SECURITY LABEL FOR anon ON COLUMN t1.t -+IS 'MASKED WITH VALUE NULL'; -+INSERT INTO t1 VALUES (1,'test'); -+SELECT anon.anonymize_table('t1'); -+ anonymize_table -+----------------- -+ t -+(1 row) -+ -+SELECT * FROM t1; -+ i | t -+---+--- -+ 1 | -+(1 row) -+ -+UPDATE t1 SET t='test' WHERE i=1; -+-- SHOULD FAIL -+SAVEPOINT fail_start_engine; -+SELECT anon.start_dynamic_masking(); -+ start_dynamic_masking -+----------------------- -+ t -+(1 row) -+ -+ROLLBACK TO fail_start_engine; -+RESET ROLE; -+SELECT anon.start_dynamic_masking(); -+ start_dynamic_masking -+----------------------- -+ t -+(1 row) -+ -+SET ROLE oscar_the_owner; -+SELECT * FROM t1; -+ i | t -+---+------ -+ 1 | test -+(1 row) -+ -+--SELECT * FROM mask.t1; -+-- SHOULD FAIL -+SAVEPOINT fail_stop_engine; -+SELECT anon.stop_dynamic_masking(); -+ERROR: permission denied for schema mask -+CONTEXT: SQL statement "DROP VIEW mask.t1;" -+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE -+SQL statement "SELECT anon.mask_drop_view(oid) -+ FROM pg_catalog.pg_class -+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE -+ AND relkind IN ('r','p','f')" -+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM -+ROLLBACK TO fail_stop_engine; -+RESET ROLE; -+SELECT anon.stop_dynamic_masking(); -+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. -+ stop_dynamic_masking -+---------------------- -+ t -+(1 row) -+ -+SET ROLE oscar_the_owner; -+-- SHOULD FAIL -+SAVEPOINT fail_seclabel_on_role; -+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; -+ERROR: permission denied -+DETAIL: The current user must have the CREATEROLE attribute. -+ROLLBACK TO fail_seclabel_on_role; -+ROLLBACK; From ce7795a67d8d25658221511bfd9d6f232e942096 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 18 Apr 2025 00:32:38 +0100 Subject: [PATCH 39/55] compute: use project_id, endpoint_id as tag (#11556) for compute audit logs part of https://github.com/neondatabase/cloud/issues/21955 --- compute_tools/src/compute.rs | 21 ++++++++++++++++++++- compute_tools/src/rsyslog.rs | 4 ++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c7b4bdd240..8834f0d63d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -641,7 +641,26 @@ impl ComputeNode { let log_directory_path = Path::new(&self.params.pgdata).join("log"); let log_directory_path = log_directory_path.to_string_lossy().to_string(); - configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Add project_id,endpoint_id tag to identify the logs. + // + // These ids are passed from cplane, + // for backwards compatibility (old computes that don't have them), + // we set them to None. + // TODO: Clean up this code when all computes have them. 
+ let tag: Option = match ( + pspec.spec.project_id.as_deref(), + pspec.spec.endpoint_id.as_deref(), + ) { + (Some(project_id), Some(endpoint_id)) => { + Some(format!("{project_id}/{endpoint_id}")) + } + (Some(project_id), None) => Some(format!("{project_id}/None")), + (None, Some(endpoint_id)) => Some(format!("None,{endpoint_id}")), + (None, None) => None, + }; + + configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?; // Launch a background task to clean up the audit logs launch_pgaudit_gc(log_directory_path); diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index ba08302df2..7be97046a0 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -50,13 +50,13 @@ fn restart_rsyslog() -> Result<()> { pub fn configure_audit_rsyslog( log_directory: String, - tag: &str, + tag: Option, remote_endpoint: &str, ) -> Result<()> { let config_content: String = format!( include_str!("config_template/compute_audit_rsyslog_template.conf"), log_directory = log_directory, - tag = tag, + tag = tag.unwrap_or("".to_string()), remote_endpoint = remote_endpoint ); From 182bd95a4e265a2844b5741c109cf55d9af9025b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 18 Apr 2025 02:25:38 +0100 Subject: [PATCH 40/55] CI(regress-tests): run tests on `large-metal` (#11634) ## Problem Regression tests are more flaky on virtualised (`qemu-x64-*`) runners See https://neondb.slack.com/archives/C069Z2199DL/p1744891865307769 Ref https://github.com/neondatabase/neon/issues/11627 ## Summary of changes - Switch `regress-tests` to metal-only large runners to mitigate flaky behaviour --- .github/workflows/_build-and-test-locally.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 318e69d8a7..3a88bc844a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -349,7 +349,7 @@ jobs: contents: read statuses: write needs: [ build-neon ] - runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large-metal')) }} container: image: ${{ inputs.build-tools-image }} credentials: From 5073e46df419bb05e12ffcce67c846b3aada6f43 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Fri, 18 Apr 2025 01:28:01 -0400 Subject: [PATCH 41/55] feat(pageserver): use rfc3339 time and print ratio in gc-compact stats (#11638) ## Problem follow-up on https://github.com/neondatabase/neon/pull/11601 ## Summary of changes - serialize the start/end time using rfc3339 time string - compute the size ratio of the compaction --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 21 +++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ff85a33055..47a07f929d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,7 +7,7 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use std::time::{Duration, Instant, SystemTime}; +use std::time::{Duration, Instant}; use super::layer_manager::LayerManager; use super::{ @@ -119,25 +119,32 @@ pub struct GcCompactionMetaStatistics { /// The layer size after compaction. pub after_compaction_layer_size: u64, /// The start time of the meta job. - pub start_time: Option, + pub start_time: Option>, /// The end time of the meta job. - pub end_time: Option, + pub end_time: Option>, /// The duration of the meta job. pub duration_secs: f64, /// The id of the meta job. pub meta_job_id: GcCompactionJobId, /// The LSN below which the layers are compacted, used to compute the statistics. pub below_lsn: Lsn, + /// The retention ratio of the meta job (after_compaction_layer_size / before_compaction_layer_size) + pub retention_ratio: f64, } impl GcCompactionMetaStatistics { fn finalize(&mut self) { - let end_time = SystemTime::now(); + let end_time = chrono::Utc::now(); if let Some(start_time) = self.start_time { - if let Ok(duration) = end_time.duration_since(start_time) { - self.duration_secs = duration.as_secs_f64(); + if end_time > start_time { + let delta = end_time - start_time; + if let Ok(std_dur) = delta.to_std() { + self.duration_secs = std_dur.as_secs_f64(); + } } } + self.retention_ratio = self.after_compaction_layer_size as f64 + / (self.before_compaction_layer_size as f64 + 1.0); self.end_time = Some(end_time); } } @@ -520,7 +527,7 @@ impl GcCompactionQueue { } guard.meta_statistics = Some(GcCompactionMetaStatistics { meta_job_id: id, - start_time: Some(SystemTime::now()), + start_time: Some(chrono::Utc::now()), before_compaction_layer_size: layer_size, below_lsn: expected_l2_lsn, total_sub_compaction_jobs: jobs_len, From a0d844dfed37f4e8f56a316741dc9ac42ba731ae Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 18 Apr 2025 10:27:23 +0400 Subject: [PATCH 42/55] pageserver + safekeeper: pass ssl ca certs to broker client (#11635) ## Problem Pageservers and safakeepers do not pass CA certificates to broker client, so the client do not trust locally issued certificates. - Part of https://github.com/neondatabase/cloud/issues/27492 ## Summary of changes - Change `ssl_ca_certs` type in PS/SK's config to `Pem` which may be converted to both `reqwest` and `tonic` certificates. 
- Pass CA certificates to storage broker client in PS and SK --- Cargo.lock | 2 ++ pageserver/Cargo.toml | 1 + pageserver/src/bin/pageserver.rs | 12 +++++++- pageserver/src/config.rs | 12 +++++--- pageserver/src/controller_upcall_client.rs | 5 ++-- .../timeline/import_pgdata/upcall_api.rs | 4 +-- safekeeper/Cargo.toml | 1 + safekeeper/src/bin/safekeeper.rs | 6 ++-- safekeeper/src/broker.rs | 30 +++++++++++++++---- safekeeper/src/http/routes.rs | 23 +++++++++----- safekeeper/src/lib.rs | 4 +-- safekeeper/src/recovery.rs | 3 +- storage_broker/benches/rps.rs | 21 +++++++++++-- storage_broker/src/lib.rs | 12 +++++--- 14 files changed, 101 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 870401e7f9..af5c271686 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4285,6 +4285,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "pageserver_compaction", + "pem", "pin-project-lite", "postgres-protocol", "postgres-types", @@ -6001,6 +6002,7 @@ dependencies = [ "once_cell", "pageserver_api", "parking_lot 0.12.1", + "pem", "postgres-protocol", "postgres_backend", "postgres_ffi", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 5c5bab0642..fee78aa94d 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -78,6 +78,7 @@ metrics.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that pageserver_compaction.workspace = true +pem.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 250d4180f5..6cfaec955b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -416,8 +416,18 @@ fn start_pageserver( // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME .block_on(async { + let tls_config = storage_broker::ClientTlsConfig::new().ca_certificates( + conf.ssl_ca_certs + .iter() + .map(pem::encode) + .map(storage_broker::Certificate::from_pem), + ); // Note: we do not attempt connecting here (but validate endpoints sanity). - storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval) + storage_broker::connect( + conf.broker_endpoint.clone(), + conf.broker_keepalive_interval, + tls_config, + ) }) .with_context(|| { format!( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c12ac32b7e..d4bfed95a1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -17,9 +17,10 @@ use once_cell::sync::OnceCell; use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; +use pem::Pem; use postgres_backend::AuthType; use remote_storage::{RemotePath, RemoteStorageConfig}; -use reqwest::{Certificate, Url}; +use reqwest::Url; use storage_broker::Uri; use utils::id::{NodeId, TimelineId}; use utils::logging::{LogFormat, SecretString}; @@ -67,8 +68,8 @@ pub struct PageServerConf { /// Period to reload certificate and private key from files. /// Default: 60s. pub ssl_cert_reload_period: Duration, - /// Trusted root CA certificates to use in https APIs. - pub ssl_ca_certs: Vec, + /// Trusted root CA certificates to use in https APIs in PEM format. + pub ssl_ca_certs: Vec, /// Current availability zone. Used for traffic metrics. 
pub availability_zone: Option, @@ -497,7 +498,10 @@ impl PageServerConf { ssl_ca_certs: match ssl_ca_file { Some(ssl_ca_file) => { let buf = std::fs::read(ssl_ca_file)?; - Certificate::from_pem_bundle(&buf)? + pem::parse_many(&buf)? + .into_iter() + .filter(|pem| pem.tag() == "CERTIFICATE") + .collect() } None => Vec::new(), }, diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index fd5fbfcba9..ed52823c20 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -8,6 +8,7 @@ use pageserver_api::upcall_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateRequestTenant, ValidateResponse, }; +use reqwest::Certificate; use serde::Serialize; use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; @@ -76,8 +77,8 @@ impl StorageControllerUpcallClient { client = client.default_headers(headers); } - for ssl_ca_cert in &conf.ssl_ca_certs { - client = client.add_root_certificate(ssl_ca_cert.clone()); + for cert in &conf.ssl_ca_certs { + client = client.add_root_certificate(Certificate::from_der(cert.contents())?); } Ok(Some(Self { diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index 352bbbc4d4..99081a65e0 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -1,6 +1,6 @@ //! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; -use reqwest::Method; +use reqwest::{Certificate, Method}; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::error; @@ -34,7 +34,7 @@ impl Client { }; let mut http_client = reqwest::Client::builder(); for cert in &conf.ssl_ca_certs { - http_client = http_client.add_root_certificate(cert.clone()); + http_client = http_client.add_root_certificate(Certificate::from_der(cert.contents())?); } let http_client = http_client.build()?; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index a0ba69aa34..0a8cc415be 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -55,6 +55,7 @@ tokio-util = { workspace = true } tracing.workspace = true url.workspace = true metrics.workspace = true +pem.workspace = true postgres_backend.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index b8c122ea72..5fc742cda7 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -16,7 +16,6 @@ use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; -use reqwest::Certificate; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, @@ -373,7 +372,10 @@ async fn main() -> anyhow::Result<()> { Some(ssl_ca_file) => { tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); let buf = tokio::fs::read(ssl_ca_file).await?; - Certificate::from_pem_bundle(&buf)? + pem::parse_many(&buf)? 
+ .into_iter() + .filter(|pem| pem.tag() == "CERTIFICATE") + .collect() } None => Vec::new(), }; diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index de6e275124..3b15cf8d70 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -24,6 +24,15 @@ use crate::{GlobalTimelines, SafeKeeperConf}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; +fn make_tls_config(conf: &SafeKeeperConf) -> storage_broker::ClientTlsConfig { + storage_broker::ClientTlsConfig::new().ca_certificates( + conf.ssl_ca_certs + .iter() + .map(pem::encode) + .map(storage_broker::Certificate::from_pem), + ) +} + /// Push once in a while data about all active timelines to the broker. async fn push_loop( conf: Arc, @@ -37,8 +46,11 @@ async fn push_loop( let active_timelines_set = global_timelines.get_global_broker_active_set(); - let mut client = - storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + let mut client = storage_broker::connect( + conf.broker_endpoint.clone(), + conf.broker_keepalive_interval, + make_tls_config(&conf), + )?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); let outbound = async_stream::stream! { @@ -81,8 +93,11 @@ async fn pull_loop( global_timelines: Arc, stats: Arc, ) -> Result<()> { - let mut client = - storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + let mut client = storage_broker::connect( + conf.broker_endpoint.clone(), + conf.broker_keepalive_interval, + make_tls_config(&conf), + )?; // TODO: subscribe only to local timelines instead of all let request = SubscribeSafekeeperInfoRequest { @@ -134,8 +149,11 @@ async fn discover_loop( global_timelines: Arc, stats: Arc, ) -> Result<()> { - let mut client = - storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + let mut client = storage_broker::connect( + conf.broker_endpoint.clone(), + conf.broker_keepalive_interval, + make_tls_config(&conf), + )?; let request = SubscribeByFilterRequest { types: vec![TypeSubscription { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 312456e5b2..2b2d721db2 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -14,6 +14,7 @@ use http_utils::json::{json_request, json_response}; use http_utils::request::{ensure_no_body, parse_query_param, parse_request_param}; use http_utils::{RequestExt, RouterBuilder}; use hyper::{Body, Request, Response, StatusCode}; +use pem::Pem; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{ AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TenantDeleteResult, @@ -230,14 +231,20 @@ async fn timeline_pull_handler(mut request: Request) -> Result, _>>() + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!("failed to parse CA certs: {e}")) + })?; + + let resp = + pull_timeline::handle_request(data, conf.sk_auth_token.clone(), ca_certs, global_timelines) + .await + .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, resp) } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 3ca51ba40a..9f7580a313 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -6,8 +6,8 @@ use std::time::Duration; use camino::Utf8PathBuf; use once_cell::sync::Lazy; +use pem::Pem; use remote_storage::RemoteStorageConfig; -use reqwest::Certificate; use storage_broker::Uri; use tokio::runtime::Runtime; use utils::auth::SwappableJwtAuth; @@ -120,7 +120,7 @@ pub struct SafeKeeperConf 
{ pub ssl_key_file: Utf8PathBuf, pub ssl_cert_file: Utf8PathBuf, pub ssl_cert_reload_period: Duration, - pub ssl_ca_certs: Vec, + pub ssl_ca_certs: Vec, pub use_https_safekeeper_api: bool, } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 25b40f5d2e..577a2f694e 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -8,6 +8,7 @@ use std::time::SystemTime; use anyhow::{Context, bail}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; +use reqwest::Certificate; use safekeeper_api::Term; use safekeeper_api::membership::INVALID_GENERATION; use safekeeper_api::models::{PeerInfo, TimelineStatus}; @@ -241,7 +242,7 @@ async fn recover( let mut client = reqwest::Client::builder(); for cert in &conf.ssl_ca_certs { - client = client.add_root_certificate(cert.clone()); + client = client.add_root_certificate(Certificate::from_der(cert.contents())?); } let client = client .build() diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 0fef6a58e0..9953ccfa91 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -87,7 +87,12 @@ fn tli_from_u64(i: u64) -> Vec { async fn subscribe(client: Option, counter: Arc, i: u64) { let mut client = match client { Some(c) => c, - None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), + None => storage_broker::connect( + DEFAULT_ENDPOINT, + Duration::from_secs(5), + storage_broker::ClientTlsConfig::new(), + ) + .unwrap(), }; let ttid = ProtoTenantTimelineId { @@ -119,7 +124,12 @@ async fn subscribe(client: Option, counter: Arc, async fn publish(client: Option, n_keys: u64) { let mut client = match client { Some(c) => c, - None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), + None => storage_broker::connect( + DEFAULT_ENDPOINT, + Duration::from_secs(5), + storage_broker::ClientTlsConfig::new(), + ) + .unwrap(), }; let mut counter: u64 = 0; @@ -164,7 +174,12 @@ async fn main() -> Result<(), Box> { } let h = tokio::spawn(progress_reporter(counters.clone())); - let c = storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(); + let c = storage_broker::connect( + DEFAULT_ENDPOINT, + Duration::from_secs(5), + storage_broker::ClientTlsConfig::new(), + ) + .unwrap(); for i in 0..args.num_subs { let c = Some(c.clone()); diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 7b36f5e948..149656a191 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -4,7 +4,7 @@ use proto::TenantTimelineId as ProtoTenantTimelineId; use proto::broker_service_client::BrokerServiceClient; use tonic::Status; use tonic::codegen::StdError; -use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use tonic::transport::{Channel, Endpoint}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; // Code generated by protobuf. @@ -20,6 +20,7 @@ pub mod metrics; // Re-exports to avoid direct tonic dependency in user crates. pub use hyper::Uri; +pub use tonic::transport::{Certificate, ClientTlsConfig}; pub use tonic::{Code, Request, Streaming}; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; @@ -38,7 +39,11 @@ pub type BrokerClientChannel = BrokerServiceClient; // // NB: this function is not async, but still must be run on a tokio runtime thread // because that's a requirement of tonic_endpoint.connect_lazy()'s Channel::new call. 
-pub fn connect(endpoint: U, keepalive_interval: Duration) -> anyhow::Result +pub fn connect( + endpoint: U, + keepalive_interval: Duration, + tls_config: ClientTlsConfig, +) -> anyhow::Result where U: std::convert::TryInto, U::Error: std::error::Error + Send + Sync + 'static, @@ -54,8 +59,7 @@ where rustls::crypto::ring::default_provider() .install_default() .ok(); - let tls = ClientTlsConfig::new(); - tonic_endpoint = tonic_endpoint.tls_config(tls)?; + tonic_endpoint = tonic_endpoint.tls_config(tls_config)?; } tonic_endpoint = tonic_endpoint .http2_keep_alive_interval(keepalive_interval) From f006879fb7a135774875808b0bbab5f62f368937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 18 Apr 2025 18:39:18 +0200 Subject: [PATCH 43/55] fix(ci): make regex to find rc branches less strict (#11646) ## Problem https://github.com/neondatabase/neon/actions/runs/14537161022/job/40787763965 failed to find the correct RC PR run, preventing artifact re-use. This broke in https://github.com/neondatabase/neon/pull/11547. There's a hotfix release containing this in https://github.com/neondatabase/neon/pull/11645. ## Summary of changes Make the regex for finding the RC PR run less strict, it was needlessly precise. --- .github/workflows/_meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index 1ad37f13ed..6157762c96 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -165,5 +165,5 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CURRENT_SHA: ${{ github.sha }} run: | - RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))') + RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release.*$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))') echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT From 3158442a595f55e6b08c09ce53dfc76ce9143b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 18 Apr 2025 19:49:34 +0200 Subject: [PATCH 44/55] fix(ci): set token for fast-forward failure comments and allow merging with state unstable (#11647) ## Problem https://github.com/neondatabase/neon/actions/runs/14538136318/job/40790985693?pr=11645 failed, even though the relevant parts of the CI had passed and auto-merge determined the PR is ready to merge. After that, commenting failed. 
## Summary of changes - set GH_TOKEN for commenting after fast-forward failure - allow merging with mergeable_state unstable --- .github/workflows/fast-forward.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fast-forward.yml b/.github/workflows/fast-forward.yml index a292522b88..f80596a7a6 100644 --- a/.github/workflows/fast-forward.yml +++ b/.github/workflows/fast-forward.yml @@ -27,15 +27,17 @@ jobs: - name: Fast forwarding uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979 # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus - if: ${{ github.event.pull_request.mergeable_state == 'clean' }} + if: ${{ contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }} with: merge: true comment: on-error github_token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Comment if mergeable_state is not clean - if: ${{ github.event.pull_request.mergeable_state != 'clean' }} + if: ${{ !contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | gh pr comment ${{ github.event.pull_request.number }} \ --repo "${GITHUB_REPOSITORY}" \ - --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`." + --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\` or \`unstable\`." From 4d0c1e8b783df7c849cf5ac55dce39da10d51b93 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 19 Apr 2025 11:38:03 +0300 Subject: [PATCH 45/55] refactor: Extract some code in pagebench getpage command to function (#11563) This makes it easier to add a different client implementation alongside the current one. I started working on a new gRPC-based protocol to replace the libpq protocol, which will introduce a new function like `client_libpq`, but for the new protocol. It's a little more readable with less indentation anyway. 
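As a rough illustration of the `SharedState` pattern this introduces (a toy, self-contained version with assumed names — not the pagebench code itself): all workers share one `Arc`'d struct holding the start barrier and live counters, and the extracted client function simply takes that `Arc` as an argument.

```rust
// Toy version of the shared-state + start-barrier pattern (assumed names,
// not the actual pagebench types).
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};

struct Shared {
    start_work_barrier: tokio::sync::Barrier,
    completed_requests: AtomicU64,
}

async fn client(shared: Arc<Shared>) {
    shared.start_work_barrier.wait().await; // wait until everyone is ready
    shared.completed_requests.fetch_add(1, Ordering::Relaxed);
}

#[tokio::main]
async fn main() {
    let num_clients: usize = 4;
    // +1 so main() also participates in the barrier, like main_impl() does.
    let shared = Arc::new(Shared {
        start_work_barrier: tokio::sync::Barrier::new(num_clients + 1),
        completed_requests: AtomicU64::new(0),
    });
    let handles: Vec<_> = (0..num_clients)
        .map(|_| tokio::spawn(client(shared.clone())))
        .collect();
    shared.start_work_barrier.wait().await; // release all clients at once
    for h in handles {
        h.await.unwrap();
    }
    assert_eq!(
        shared.completed_requests.load(Ordering::Relaxed),
        num_clients as u64
    );
}
```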
--- .../pagebench/src/cmd/getpage_latest_lsn.rs | 200 ++++++++++-------- 1 file changed, 109 insertions(+), 91 deletions(-) diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 6fd1c00eca..771a7cbe5b 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -68,6 +68,13 @@ pub(crate) struct Args { targets: Option>, } +/// State shared by all clients +#[derive(Debug)] +struct SharedState { + start_work_barrier: tokio::sync::Barrier, + live_stats: LiveStats, +} + #[derive(Debug, Default)] struct LiveStats { completed_requests: AtomicU64, @@ -240,24 +247,26 @@ async fn main_impl( all_ranges }; - let live_stats = Arc::new(LiveStats::default()); - let num_live_stats_dump = 1; let num_work_sender_tasks = args.num_clients.get() * timelines.len(); let num_main_impl = 1; - let start_work_barrier = Arc::new(tokio::sync::Barrier::new( - num_live_stats_dump + num_work_sender_tasks + num_main_impl, - )); + let shared_state = Arc::new(SharedState { + start_work_barrier: tokio::sync::Barrier::new( + num_live_stats_dump + num_work_sender_tasks + num_main_impl, + ), + live_stats: LiveStats::default(), + }); + let cancel = CancellationToken::new(); + let ss = shared_state.clone(); tokio::spawn({ - let stats = Arc::clone(&live_stats); - let start_work_barrier = Arc::clone(&start_work_barrier); async move { - start_work_barrier.wait().await; + ss.start_work_barrier.wait().await; loop { let start = std::time::Instant::now(); tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let stats = &ss.live_stats; let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); let missed = stats.missed.swap(0, Ordering::Relaxed); let elapsed = start.elapsed(); @@ -270,14 +279,12 @@ async fn main_impl( } }); - let cancel = CancellationToken::new(); - let rps_period = args .per_client_rate .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64))); let make_worker: &dyn Fn(WorkerId) -> Pin>> = &|worker_id| { - let live_stats = live_stats.clone(); - let start_work_barrier = start_work_barrier.clone(); + let ss = shared_state.clone(); + let cancel = cancel.clone(); let ranges: Vec = all_ranges .iter() .filter(|r| r.timeline == worker_id.timeline) @@ -287,85 +294,8 @@ async fn main_impl( rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())) .unwrap(); - let cancel = cancel.clone(); Box::pin(async move { - let client = - pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) - .await - .unwrap(); - let mut client = client - .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id) - .await - .unwrap(); - - start_work_barrier.wait().await; - let client_start = Instant::now(); - let mut ticks_processed = 0; - let mut inflight = VecDeque::new(); - while !cancel.is_cancelled() { - // Detect if a request took longer than the RPS rate - if let Some(period) = &rps_period { - let periods_passed_until_now = - usize::try_from(client_start.elapsed().as_micros() / period.as_micros()) - .unwrap(); - - if periods_passed_until_now > ticks_processed { - live_stats.missed((periods_passed_until_now - ticks_processed) as u64); - } - ticks_processed = periods_passed_until_now; - } - - while inflight.len() < args.queue_depth.get() { - let start = Instant::now(); - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let 
key = Key::from_i128(key); - assert!(key.is_rel_block_key()); - let (rel_tag, block_no) = key - .to_rel_block() - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn - }, - not_modified_since: r.timeline_lsn, - }, - rel: rel_tag, - blkno: block_no, - } - }; - client.getpage_send(req).await.unwrap(); - inflight.push_back(start); - } - - let start = inflight.pop_front().unwrap(); - client.getpage_recv().await.unwrap(); - let end = Instant::now(); - live_stats.request_done(); - ticks_processed += 1; - STATS.with(|stats| { - stats - .borrow() - .lock() - .unwrap() - .observe(end.duration_since(start)) - .unwrap(); - }); - - if let Some(period) = &rps_period { - let next_at = client_start - + Duration::from_micros( - (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(), - ); - tokio::time::sleep_until(next_at.into()).await; - } - } + client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await }) }; @@ -387,7 +317,7 @@ async fn main_impl( }; info!("waiting for everything to become ready"); - start_work_barrier.wait().await; + shared_state.start_work_barrier.wait().await; info!("work started"); if let Some(runtime) = args.runtime { tokio::time::sleep(runtime.into()).await; @@ -416,3 +346,91 @@ async fn main_impl( anyhow::Ok(()) } + +async fn client_libpq( + args: &Args, + worker_id: WorkerId, + shared_state: Arc, + cancel: CancellationToken, + rps_period: Option, + ranges: Vec, + weights: rand::distributions::weighted::WeightedIndex, +) { + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id) + .await + .unwrap(); + + shared_state.start_work_barrier.wait().await; + let client_start = Instant::now(); + let mut ticks_processed = 0; + let mut inflight = VecDeque::new(); + while !cancel.is_cancelled() { + // Detect if a request took longer than the RPS rate + if let Some(period) = &rps_period { + let periods_passed_until_now = + usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap(); + + if periods_passed_until_now > ticks_processed { + shared_state + .live_stats + .missed((periods_passed_until_now - ticks_processed) as u64); + } + ticks_processed = periods_passed_until_now; + } + + while inflight.len() < args.queue_depth.get() { + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, + }, + rel: rel_tag, + blkno: block_no, + } + }; + client.getpage_send(req).await.unwrap(); + inflight.push_back(start); + } + + let start = inflight.pop_front().unwrap(); + client.getpage_recv().await.unwrap(); + let end = Instant::now(); + shared_state.live_stats.request_done(); + ticks_processed += 1; + STATS.with(|stats| { + stats + .borrow() + .lock() + .unwrap() + .observe(end.duration_since(start)) + .unwrap(); + }); + + if let 
Some(period) = &rps_period { + let next_at = client_start + + Duration::from_micros( + (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(), + ); + tokio::time::sleep_until(next_at.into()).await; + } + } +} From cbf442292b44decf7ab7fff77658d81c51b2c93f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 21 Apr 2025 18:45:16 +0100 Subject: [PATCH 46/55] pageserver: handle empty get vectored queries (#11652) ## Problem If all batched requests are excluded from the query by `Timeine::get_rel_page_at_lsn_batched` (e.g. because they are past the end of the relation), the read path would panic since it doesn't expect empty queries. This is a change in behaviour that was introduced with the scattered query implementation. ## Summary of Changes Handle empty queries explicitly. --- pageserver/src/tenant/timeline.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5b126d516b..cfeab77598 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1285,6 +1285,10 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { + if query.is_empty() { + return Ok(BTreeMap::default()); + } + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new( query.total_keyspace(), From 5df4a747e69befa92c6df7d6eb289ae7ab375e83 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 21 Apr 2025 12:49:17 -0500 Subject: [PATCH 47/55] Update pgbouncer in compute images to 1.24.1 (#11651) Fixes CVE-2025-2291. Link: https://www.postgresql.org/about/news/pgbouncer-1241-released-fixes-cve-2025-2291-3059/ Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 83cbacf034..d8db627521 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1677,7 +1677,7 @@ RUN set -e \ && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) -ENV PGBOUNCER_TAG=pgbouncer_1_22_1 +ENV PGBOUNCER_TAG=pgbouncer_1_24_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ From cd2e1fbc7cb6758e30844343fd93db92fe3b610c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 22 Apr 2025 10:41:28 +0100 Subject: [PATCH 48/55] CI(benchmarks): upload perf results for passed tests (#11649) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We run benchmarks in batches (five parallel jobs on different runners). If any test in a batch fails, we won’t upload any results for that batch, even for the tests that passed. ## Summary of changes - Move the results upload to a separate step in the run-python-test-set action, and execute this step even if tests fail. 
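The key mechanism is the step-level `if:` condition: `!cancelled()` lets the upload step run even when an earlier step failed (steps are otherwise skipped after a failure), while still skipping it if the job was cancelled. A minimal sketch of the pattern (placeholder test command, not the real action file):

```yaml
runs:
  using: composite
  steps:
    - name: Run tests
      shell: bash -euxo pipefail {0}
      run: |
        ./run_tests.sh   # placeholder; may fail without cancelling the job

    - name: Upload performance report
      # Runs after failures too, but not if the job was cancelled.
      if: ${{ !cancelled() && inputs.save_perf_report == 'true' }}
      shell: bash -euxo pipefail {0}
      run: |
        export REPORT_FROM="${PERF_REPORT_DIR}"
        scripts/generate_and_push_perf_report.sh
```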
--- .github/actions/run-python-test-set/action.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 7139d37be9..1c65244ef4 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -133,6 +133,7 @@ runs: fi PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" + echo "PERF_REPORT_DIR=${PERF_REPORT_DIR}" >> ${GITHUB_ENV} rm -rf $PERF_REPORT_DIR TEST_SELECTION="test_runner/${{ inputs.test_selection }}" @@ -209,11 +210,12 @@ runs: --verbose \ -rA $TEST_SELECTION $EXTRA_PARAMS - if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO="$PLATFORM" - scripts/generate_and_push_perf_report.sh - fi + - name: Upload performance report + if: ${{ !cancelled() && inputs.save_perf_report == 'true' }} + shell: bash -euxo pipefail {0} + run: | + export REPORT_FROM="${PERF_REPORT_DIR}" + scripts/generate_and_push_perf_report.sh - name: Upload compatibility snapshot # Note, that we use `github.base_ref` which is a target branch for a PR From fd916abf2537404913277ecb541dda971689c32b Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 22 Apr 2025 13:43:45 +0200 Subject: [PATCH 49/55] Remove NOTICE messages, which can make the pg_repack regression test fail. (#11659) ## Problem The pg_repack test can be flaky due to unpredictable `NOTICE` messages about waiting for some processes. E.g., ``` INFO: repacking table "public.issue3_2" +NOTICE: Waiting for 1 transactions to finish. First PID: 427 ``` ## Summary of changes The `client_min_messages` set to `warning` for the regression tests. 
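For context, `client_min_messages` controls which message levels the server sends back to the client: raising it to `warning` hides the timing-dependent `NOTICE` lines, while pg_repack's `INFO` progress lines (which PostgreSQL always sends to the client regardless of this setting) stay in the expected output. A quick illustration in a plain psql session (not part of the patch):

```sql
-- NOTICE-level messages reach the client by default:
DO $$ BEGIN RAISE NOTICE 'Waiting for 1 transactions to finish. First PID: 427'; END $$;
-- NOTICE:  Waiting for 1 transactions to finish. First PID: 427

-- With the threshold raised, the same NOTICE is no longer sent,
-- while WARNING, ERROR and INFO still are:
SET client_min_messages = warning;
DO $$ BEGIN RAISE NOTICE 'Waiting for 1 transactions to finish. First PID: 427'; END $$;
-- (no output)
```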
--- compute/patches/pg_repack.patch | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/compute/patches/pg_repack.patch b/compute/patches/pg_repack.patch index f6b0aa1e13..10ed1054ff 100644 --- a/compute/patches/pg_repack.patch +++ b/compute/patches/pg_repack.patch @@ -11,6 +11,14 @@ index bf6edcb..89b4c7f 100644 USE_PGXS = 1 # use pgxs if not in contrib directory PGXS := $(shell $(PG_CONFIG) --pgxs) +diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out +index 9f2e171..f6e4f8d 100644 +--- a/regress/expected/init-extension.out ++++ b/regress/expected/init-extension.out +@@ -1,3 +1,2 @@ + SET client_min_messages = warning; + CREATE EXTENSION pg_repack; +-RESET client_min_messages; diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out index 8d0a94e..63b68bf 100644 --- a/regress/expected/nosuper.out @@ -42,6 +50,14 @@ index 8d0a94e..63b68bf 100644 INFO: repacking table "public.tbl_cluster" ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block DETAIL: query was: RESET lock_timeout +diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql +index 9f2e171..f6e4f8d 100644 +--- a/regress/sql/init-extension.sql ++++ b/regress/sql/init-extension.sql +@@ -1,3 +1,2 @@ + SET client_min_messages = warning; + CREATE EXTENSION pg_repack; +-RESET client_min_messages; diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql index 072f0fa..dbe60f8 100644 --- a/regress/sql/nosuper.sql From 6173c0f44c4b86273ac6fb7f49bd6645ce25fcc1 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 22 Apr 2025 17:19:03 +0400 Subject: [PATCH 50/55] safekeeper: add enable_tls_wal_service_api (#11520) ## Problem Safekeeper doesn't use TLS in wal service - Closes: https://github.com/neondatabase/cloud/issues/27302 ## Summary of changes - Add `enable_tls_wal_service_api` option to safekeeper's cmd arguments - Propagate `tls_server_config` to `wal_service` if the option is enabled - Create `BACKGROUND_RUNTIME` for small background tasks and offload SSL certificate reloader to it. 
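Assuming the usual clap flag naming, enabling the new option looks roughly like this (illustrative invocation only; other required options omitted):

```bash
# TLS for the WAL service API reuses the same key/cert as the HTTPS API;
# clients still negotiate TLS during the handshake rather than being forced into it.
safekeeper \
  --ssl-key-file=server.key \
  --ssl-cert-file=server.crt \
  --enable-tls-wal-service-api \
  ...   # listen addresses, broker endpoint, etc.
```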
No integration tests for now because support from compute side is required: https://github.com/neondatabase/cloud/issues/25823 --- safekeeper/src/bin/safekeeper.rs | 54 +++++++++++++++++-- safekeeper/src/http/mod.rs | 16 +----- safekeeper/src/lib.rs | 11 ++++ safekeeper/src/wal_service.rs | 8 ++- .../tests/walproposer_sim/safekeeper.rs | 1 + 5 files changed, 69 insertions(+), 21 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 5fc742cda7..000235f2f5 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -14,6 +14,7 @@ use clap::{ArgAction, Parser}; use futures::future::BoxFuture; use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; +use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ @@ -23,8 +24,8 @@ use safekeeper::defaults::{ DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; use safekeeper::{ - BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, - control_file, http, wal_backup, wal_service, + BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, + WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service, }; use sd_notify::NotifyState; use storage_broker::{DEFAULT_ENDPOINT, Uri}; @@ -215,16 +216,21 @@ struct Args { ssl_cert_file: Utf8PathBuf, /// Period to reload certificate and private key from files. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_SSL_CERT_RELOAD_PERIOD)] - pub ssl_cert_reload_period: Duration, + ssl_cert_reload_period: Duration, /// Trusted root CA certificates to use in https APIs. #[arg(long)] - pub ssl_ca_file: Option, + ssl_ca_file: Option, /// Flag to use https for requests to peer's safekeeper API. #[arg(long)] - pub use_https_safekeeper_api: bool, + use_https_safekeeper_api: bool, /// Path to the JWT auth token used to authenticate with other safekeepers. #[arg(long)] auth_token_path: Option, + /// Enable TLS in WAL service API. + /// Does not force TLS: the client negotiates TLS usage during the handshake. + /// Uses key and certificate from ssl_key_file/ssl_cert_file. + #[arg(long)] + enable_tls_wal_service_api: bool, } // Like PathBufValueParser, but allows empty string. @@ -418,6 +424,7 @@ async fn main() -> anyhow::Result<()> { ssl_cert_reload_period: args.ssl_cert_reload_period, ssl_ca_certs, use_https_safekeeper_api: args.use_https_safekeeper_api, + enable_tls_wal_service_api: args.enable_tls_wal_service_api, }); // initialize sentry if SENTRY_DSN is provided @@ -517,6 +524,36 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { info!("running in current thread runtime"); } + let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_wal_service_api { + let ssl_key_file = conf.ssl_key_file.clone(); + let ssl_cert_file = conf.ssl_cert_file.clone(); + let ssl_cert_reload_period = conf.ssl_cert_reload_period; + + // Create resolver in BACKGROUND_RUNTIME, so the background certificate reloading + // task is run in this runtime. 
+ let cert_resolver = current_thread_rt + .as_ref() + .unwrap_or_else(|| BACKGROUND_RUNTIME.handle()) + .spawn(async move { + ReloadingCertificateResolver::new( + "main", + &ssl_key_file, + &ssl_cert_file, + ssl_cert_reload_period, + ) + .await + }) + .await??; + + let config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_cert_resolver(cert_resolver); + + Some(Arc::new(config)) + } else { + None + }; + let wal_service_handle = current_thread_rt .as_ref() .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) @@ -524,6 +561,9 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { conf.clone(), pg_listener, Scope::SafekeeperData, + conf.enable_tls_wal_service_api + .then(|| tls_server_config.clone()) + .flatten(), global_timelines.clone(), )) // wrap with task name for error reporting @@ -552,6 +592,9 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { conf.clone(), pg_listener_tenant_only, Scope::Tenant, + conf.enable_tls_wal_service_api + .then(|| tls_server_config.clone()) + .flatten(), global_timelines.clone(), )) // wrap with task name for error reporting @@ -577,6 +620,7 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .spawn(http::task_main_https( conf.clone(), https_listener, + tls_server_config.expect("tls_server_config is set earlier if https is enabled"), global_timelines.clone(), )) .map(|res| ("HTTPS service main".to_owned(), res)); diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 6e7c5d971d..0003310763 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,7 +1,6 @@ pub mod routes; use std::sync::Arc; -use http_utils::tls_certs::ReloadingCertificateResolver; pub use routes::make_router; pub use safekeeper_api::models; use tokio_util::sync::CancellationToken; @@ -28,21 +27,10 @@ pub async fn task_main_http( pub async fn task_main_https( conf: Arc, https_listener: std::net::TcpListener, + tls_config: Arc, global_timelines: Arc, ) -> anyhow::Result<()> { - let cert_resolver = ReloadingCertificateResolver::new( - "main", - &conf.ssl_key_file, - &conf.ssl_cert_file, - conf.ssl_cert_reload_period, - ) - .await?; - - let server_config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_cert_resolver(cert_resolver); - - let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_config); let router = make_router(conf, global_timelines) .build() diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 9f7580a313..ef2608e5d6 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -122,6 +122,7 @@ pub struct SafeKeeperConf { pub ssl_cert_reload_period: Duration, pub ssl_ca_certs: Vec, pub use_https_safekeeper_api: bool, + pub enable_tls_wal_service_api: bool, } impl SafeKeeperConf { @@ -172,6 +173,7 @@ impl SafeKeeperConf { ssl_cert_reload_period: Duration::from_secs(60), ssl_ca_certs: Vec::new(), use_https_safekeeper_api: false, + enable_tls_wal_service_api: false, } } } @@ -209,3 +211,12 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .build() .expect("Failed to create WAL backup runtime") }); + +pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("background worker") + .worker_threads(1) // there is only one task now (ssl certificate reloading), having more threads doesn't make sense + .enable_all() + .build() + .expect("Failed to create background runtime") +}); diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs 
index 045fa88cb0..6e007265b2 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -29,6 +29,7 @@ pub async fn task_main( conf: Arc, pg_listener: std::net::TcpListener, allowed_auth_scope: Scope, + tls_config: Option>, global_timelines: Arc, ) -> anyhow::Result<()> { // Tokio's from_std won't do this for us, per its comment. @@ -43,9 +44,10 @@ pub async fn task_main( let conf = conf.clone(); let conn_id = issue_connection_id(&mut connection_count); let global_timelines = global_timelines.clone(); + let tls_config = tls_config.clone(); tokio::spawn( async move { - if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope, global_timelines).await { + if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope, tls_config, global_timelines).await { error!("connection handler exited: {}", err); } } @@ -61,6 +63,7 @@ async fn handle_socket( conf: Arc, conn_id: ConnectionId, allowed_auth_scope: Scope, + tls_config: Option>, global_timelines: Arc, ) -> Result<(), QueryError> { socket.set_nodelay(true)?; @@ -110,7 +113,8 @@ async fn handle_socket( auth_pair, global_timelines, ); - let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; + let pgbackend = + PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index b3f088d31c..5fb29683f2 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -185,6 +185,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { ssl_cert_reload_period: Duration::ZERO, ssl_ca_certs: Vec::new(), use_https_safekeeper_api: false, + enable_tls_wal_service_api: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; From ad3519ebcbfaec661a4691f6a8726999038d6277 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 22 Apr 2025 10:28:22 -0400 Subject: [PATCH 51/55] fix(pageserver): report synthetic size = 1 if all tls offloaded (#11648) ## Problem A quick workaround for https://github.com/neondatabase/neon/issues/11631 ## Summary of changes Report synthetic size == 1 if all timelines are offloaded. Signed-off-by: Alex Chi Z --- pageserver/src/consumption_metrics/metrics.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index a4bfe74e30..08ab69f349 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -263,7 +263,9 @@ where while let Some((tenant_id, tenant)) = tenants.next().await { let mut tenant_resident_size = 0; - for timeline in tenant.list_timelines() { + let timelines = tenant.list_timelines(); + let timelines_len = timelines.len(); + for timeline in timelines { let timeline_id = timeline.timeline_id; match TimelineSnapshot::collect(&timeline, ctx) { @@ -289,6 +291,11 @@ where tenant_resident_size += timeline.resident_physical_size(); } + if timelines_len == 0 { + // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded. 
+ tenant_resident_size = 1; + } + let snap = TenantSnapshot::collect(&tenant, tenant_resident_size); snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); } From 132b6154bbd6095a82e3d7a8cd8fd845e438ad0d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 22 Apr 2025 21:07:45 +0300 Subject: [PATCH 52/55] Unlogged build debug compare local v2 (#11554) ## Problem Init fork is used in DEBUG_COMPARE_LOCAL to determine unlogged relation or unlogged build. But it is created only after the relation is initialized and so can be swapped out, producing `Page is evicted with zero LSN` error. ## Summary of changes Create init fork together with main fork for unlogged relations in DEBUG_COMPARE_LOCAL mode. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 9fe085c558..3bf0bedf99 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -803,7 +803,13 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: +#ifdef DEBUG_COMPARE_LOCAL + mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo); + if (forkNum == MAIN_FORKNUM) + mdcreate(reln, INIT_FORKNUM, true); +#else mdcreate(reln, forkNum, isRedo); +#endif return; default: @@ -1973,6 +1979,10 @@ neon_start_unlogged_build(SMgrRelation reln) case RELPERSISTENCE_UNLOGGED: unlogged_build_rel = reln; unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; +#ifdef DEBUG_COMPARE_LOCAL + if (!IsParallelWorker()) + mdcreate(reln, INIT_FORKNUM, true); +#endif return; default: @@ -1995,12 +2005,14 @@ neon_start_unlogged_build(SMgrRelation reln) * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ -#ifndef DEBUG_COMPARE_LOCAL if (!IsParallelWorker()) + { +#ifndef DEBUG_COMPARE_LOCAL mdcreate(reln, MAIN_FORKNUM, false); #else - mdcreate(reln, INIT_FORKNUM, false); + mdcreate(reln, INIT_FORKNUM, true); #endif + } } /* @@ -2099,12 +2111,12 @@ neon_end_unlogged_build(SMgrRelation reln) #ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ mdunlink(rinfob, forknum, true); -#else - mdunlink(rinfob, INIT_FORKNUM, true); #endif } +#ifdef DEBUG_COMPARE_LOCAL + mdunlink(rinfob, INIT_FORKNUM, true); +#endif } - unlogged_build_rel = NULL; unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } From 7b949daf13d0c7f767081db92ddc087b1ab39631 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Tue, 22 Apr 2025 21:13:16 +0300 Subject: [PATCH 53/55] fix(test): allow reconcile errors in test_storage_controller_heartbeats (#11665) ## Problem test_storage_controller_heartbeats is flaky because of unallowed reconciler errors (#11625) ## Summary of changes Allow reconcile errors as in other tests in test_storage_controller.py. 
--- .../regress/test_storage_controller.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 26f745adb9..44b30e289d 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1612,16 +1612,18 @@ def test_storage_controller_heartbeats( env = neon_env_builder.init_configs() env.start() - # Default log allow list permits connection errors, but this test will use error responses on - # the utilization endpoint. - env.storage_controller.allowed_errors.append( - ".*Call to node.*management API.*failed.*failpoint.*" - ) - # The server starts listening to the socket before sending re-attach request, - # but it starts serving HTTP only when re-attach is completed. - # If re-attach is slow (last scenario), storcon's heartbeat requests will time out. - env.storage_controller.allowed_errors.append( - ".*Call to node.*management API.*failed.* Timeout.*" + env.storage_controller.allowed_errors.extend( + [ + # Default log allow list permits connection errors, but this test will use error responses on + # the utilization endpoint. + ".*Call to node.*management API.*failed.*failpoint.*", + # The server starts listening to the socket before sending re-attach request, + # but it starts serving HTTP only when re-attach is completed. + # If re-attach is slow (last scenario), storcon's heartbeat requests will time out. + ".*Call to node.*management API.*failed.* Timeout.*", + # We will intentionally cause reconcile errors + ".*Reconcile error.*", + ] ) # Initially we have two online pageservers From 149cbd1e0a9ce721653d583099e13e63d1ee0fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 22 Apr 2025 23:27:01 +0200 Subject: [PATCH 54/55] Support single and two safekeeper scenarios (#11483) In tests and when one safekeeper is down in small regions, we need to contend with one or two safekeepers. Before, we gave an error in `safekeepers_for_new_timeline`. Now we just silently allow the timeline to be created on one or two safekeepers. 
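Condensed, the new acceptance rule for the initial creation requests looks like this (sketch of the logic, not the literal code; the `testing` cfg is shown as a plain boolean):

```rust
// How many of the selected safekeepers must acknowledge timeline creation.
fn quorum_size(target_sk_count: usize, testing_build: bool) -> Result<usize, &'static str> {
    match target_sk_count {
        0 => Err("timeline configured without any safekeepers"),
        // Test builds accept 1 or 2 safekeepers and require all of them;
        // production builds still insist on at least 3.
        1 | 2 if testing_build => Ok(target_sk_count),
        1 | 2 => Err("couldn't find at least 3 safekeepers to put timeline to"),
        n => Ok(n / 2 + 1), // simple majority for 3 or more
    }
}
// quorum_size(3, false) == Ok(2), quorum_size(5, false) == Ok(3), quorum_size(1, true) == Ok(1)
```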
Part of #9011 --- .../src/service/safekeeper_service.rs | 49 ++++++++++++++-- .../regress/test_storage_controller.py | 57 +++++++++++++++++++ 2 files changed, 101 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 8a13c6af23..088b3c4741 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -151,11 +151,39 @@ impl Service { "Got {} non-successful responses from initial creation request of total {total_result_count} responses", remaining.len() ); - if remaining.len() >= 2 { + let target_sk_count = timeline_persistence.sk_set.len(); + let quorum_size = match target_sk_count { + 0 => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "timeline configured without any safekeepers", + ))); + } + 1 | 2 => { + #[cfg(feature = "testing")] + { + // In test settings, it is allowed to have one or two safekeepers + target_sk_count + } + #[cfg(not(feature = "testing"))] + { + // The region is misconfigured: we need at least three safekeepers to be configured + // in order to schedule work to them + tracing::warn!( + "couldn't find at least 3 safekeepers for timeline, found: {:?}", + timeline_persistence.sk_set + ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find at least 3 safekeepers to put timeline to" + ))); + } + } + _ => target_sk_count / 2 + 1, + }; + let success_count = target_sk_count - remaining.len(); + if success_count < quorum_size { // Failure return Err(ApiError::InternalServerError(anyhow::anyhow!( - "not enough successful reconciliations to reach quorum, please retry: {} errored", - remaining.len() + "not enough successful reconciliations to reach quorum size: {success_count} of {quorum_size} of total {target_sk_count}" ))); } @@ -492,8 +520,6 @@ impl Service { pub(crate) async fn safekeepers_for_new_timeline( &self, ) -> Result, ApiError> { - // Number of safekeepers in different AZs we are looking for - let wanted_count = 3; let mut all_safekeepers = { let locked = self.inner.read().unwrap(); locked @@ -532,6 +558,19 @@ impl Service { sk.1.id.0, ) }); + // Number of safekeepers in different AZs we are looking for + let wanted_count = match all_safekeepers.len() { + 0 => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find any active safekeeper for new timeline", + ))); + } + // Have laxer requirements on testig mode as we don't want to + // spin up three safekeepers for every single test + #[cfg(feature = "testing")] + 1 | 2 => all_safekeepers.len(), + _ => 3, + }; let mut sks = Vec::new(); let mut azs = HashSet::new(); for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 44b30e289d..0f291030fe 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -4242,6 +4242,63 @@ def test_storcon_create_delete_sk_down( wait_until(timeline_deleted_on_sk) +@run_only_on_default_postgres("PG version is not interesting here") +@pytest.mark.parametrize("num_safekeepers", [1, 2, 3]) +@pytest.mark.parametrize("deletetion_subject", [DeletionSubject.TENANT, DeletionSubject.TIMELINE]) +def test_storcon_few_sk( + neon_env_builder: NeonEnvBuilder, + num_safekeepers: int, + deletetion_subject: DeletionSubject, +): + """ + Test that the storcon can create and delete tenants and timelines 
with a limited/special number of safekeepers + - num_safekeepers: number of safekeepers. + - deletion_subject: test that both single timeline and whole tenant deletion work. + """ + + neon_env_builder.num_safekeepers = num_safekeepers + safekeeper_list = list(range(1, num_safekeepers + 1)) + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + } + env = neon_env_builder.init_start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.create_tenant(tenant_id, timeline_id) + child_timeline_id = env.create_branch("child_of_main", tenant_id) + + env.safekeepers[0].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}") + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + with env.endpoints.create("main", tenant_id=tenant_id, config_lines=config_lines) as ep: + # endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=safekeeper_list) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + with env.endpoints.create( + "child_of_main", tenant_id=tenant_id, config_lines=config_lines + ) as ep: + # endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=safekeeper_list) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + if deletetion_subject is DeletionSubject.TENANT: + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + else: + env.storage_controller.pageserver_api().timeline_delete(tenant_id, child_timeline_id) + + # ensure that there is log msgs for the third safekeeper too + def timeline_deleted_on_sk(): + env.safekeepers[0].assert_log_contains( + f"deleting timeline {tenant_id}/{child_timeline_id} from disk" + ) + + wait_until(timeline_deleted_on_sk) + + @pytest.mark.parametrize("wrong_az", [True, False]) def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, wrong_az: bool): """ From b00db536bbe2361f97670b2be9120081458adec3 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 22 Apr 2025 17:47:22 -0500 Subject: [PATCH 55/55] Add CPU architecture to the remote extensions object key (#11590) ARM computes are incoming and we need to account for that in remote extensions. Previously, we just blindly assumed that all computes were x86_64. Note that we use the Go architecture naming convention instead of the Rust one directly to do our best and be consistent across the stack. Part-of: https://github.com/neondatabase/cloud/issues/23148 Signed-off-by: Tristan Partin --- libs/compute_api/src/spec.rs | 13 +++++++++++-- test_runner/regress/test_download_extensions.py | 14 +++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5e67ccce00..ad246c48ec 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -242,13 +242,22 @@ impl RemoteExtSpec { match self.extension_data.get(real_ext_name) { Some(_ext_data) => { + // We have decided to use the Go naming convention due to Kubernetes. 
+ + let arch = match std::env::consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + arch => arch, + }; + // Construct the path to the extension archive // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst // // Keep it in sync with path generation in // https://github.com/neondatabase/build-custom-extensions/tree/main - let archive_path_str = - format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"); + let archive_path_str = format!( + "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst" + ); Ok(( real_ext_name.to_string(), RemotePath::from_string(&archive_path_str)?, diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 77babe12cd..a81d55e57b 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import platform import shutil import tarfile from typing import TYPE_CHECKING @@ -58,7 +59,18 @@ def test_remote_extensions( extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" build_tag = os.environ.get("BUILD_TAG", "latest") - archive_route = f"{build_tag}/v{pg_version}/extensions/test_extension.tar.zst" + + # We have decided to use the Go naming convention due to Kubernetes. + arch = platform.machine() + match arch: + case "aarch64": + arch = "arm64" + case "x86_64": + arch = "amd64" + case _: + pass + + archive_route = f"{build_tag}/{arch}/v{pg_version}/extensions/test_extension.tar.zst" tarball = test_output_dir / "test_extension.tar" extension_dir = ( base_dir / "test_runner" / "regress" / "data" / "test_remote_extensions" / "test_extension"