From 423e2396174cc8e0c97661a6ceb58ad290a17982 Mon Sep 17 00:00:00 2001 From: Anna Stepanyan Date: Fri, 31 Jan 2025 07:29:06 +0100 Subject: [PATCH 01/77] [infra/notes] impr: add issue types to issue templates (#10018) refs #0000 --------- Co-authored-by: Fedor Dikarev --- .github/ISSUE_TEMPLATE/bug-template.md | 1 + .github/ISSUE_TEMPLATE/epic-template.md | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md index d33eec3cde..234d9b5a37 100644 --- a/.github/ISSUE_TEMPLATE/bug-template.md +++ b/.github/ISSUE_TEMPLATE/bug-template.md @@ -3,6 +3,7 @@ name: Bug Template about: Used for describing bugs title: '' labels: t/bug +type: Bug assignees: '' --- diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index c442f50fde..868fd084f1 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -4,6 +4,7 @@ about: A set of related tasks contributing towards specific outcome, comprising more than 1 week of work. title: 'Epic: ' labels: t/Epic +type: Epic assignees: '' --- From 738bf835836de94e8aa41b8575c6db78cb882c38 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 31 Jan 2025 09:53:43 +0000 Subject: [PATCH 02/77] chore: replace dashmap with clashmap (#10582) ## Problem Because dashmap 6 switched to hashbrown RawTable API, it required us to use unsafe code in the upgrade: https://github.com/neondatabase/neon/pull/8107 ## Summary of changes Switch to clashmap, a fork maintained by me which removes much of the unsafe and ultimately switches to HashTable instead of RawTable to remove much of the unsafe requirement on us. --- Cargo.lock | 53 +++++++++++++++++++++++++- Cargo.toml | 2 +- proxy/Cargo.toml | 2 +- proxy/src/auth/backend/jwt.rs | 6 +-- proxy/src/cache/endpoints.rs | 14 +++---- proxy/src/cache/project_info.rs | 12 +++--- proxy/src/cancellation.rs | 8 ++-- proxy/src/control_plane/client/mod.rs | 8 ++-- proxy/src/rate_limiter/leaky_bucket.rs | 8 ++-- proxy/src/rate_limiter/limiter.rs | 6 +-- proxy/src/redis/keys.rs | 3 +- proxy/src/redis/kv_ops.rs | 1 - proxy/src/serverless/conn_pool_lib.rs | 12 +++--- proxy/src/usage_metrics.rs | 10 ++--- 14 files changed, 97 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 359f989a76..e9cbebcd02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1213,6 +1213,20 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "clashmap" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d" +dependencies = [ + "crossbeam-utils", + "hashbrown 0.15.2", + "lock_api", + "parking_lot_core 0.9.8", + "polonius-the-crab", + "replace_with", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -2531,6 +2545,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "hashlink" version = "0.9.1" @@ -2581,6 +2601,15 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "higher-kinded-types" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e" +dependencies = [ + "never-say-never", +] + [[package]] name = "hmac" version = "0.12.1" @@ -3544,6 +3573,12 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + [[package]] name = "nix" version = "0.25.1" @@ -4421,6 +4456,16 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polonius-the-crab" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9" +dependencies = [ + "higher-kinded-types", + "never-say-never", +] + [[package]] name = "postgres" version = "0.19.6" @@ -4794,9 +4839,9 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "clashmap", "compute_api", "consumption_metrics", - "dashmap 5.5.0", "ecdsa 0.16.9", "ed25519-dalek", "env_logger 0.10.2", @@ -5215,6 +5260,12 @@ dependencies = [ "utils", ] +[[package]] +name = "replace_with" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690" + [[package]] name = "reqwest" version = "0.12.4" diff --git a/Cargo.toml b/Cargo.toml index 9ccdb45f6d..9d15b78a93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,10 +77,10 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } +clashmap = { version = "1.0", features = ["raw-api"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" -dashmap = { version = "5.5.0", features = ["raw-api"] } diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f362a45035..35574e945c 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -24,9 +24,9 @@ bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive", "env"] } +clashmap.workspace = true compute_api.workspace = true consumption_metrics.workspace = true -dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index df716f8455..e05a693cee 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; -use dashmap::DashMap; +use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; use reqwest::{redirect, Client}; use reqwest_retry::policies::ExponentialBackoff; @@ -64,7 +64,7 @@ pub(crate) struct AuthRule { pub struct JwkCache { client: reqwest_middleware::ClientWithMiddleware, - map: DashMap<(EndpointId, RoleName), Arc>, + map: ClashMap<(EndpointId, RoleName), Arc>, } pub(crate) struct JwkCacheEntry { @@ -469,7 +469,7 @@ impl Default for JwkCache { JwkCache { client, - map: DashMap::default(), + map: ClashMap::default(), } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 0136446d6d..b5c42cd23d 100644 --- 
a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -3,7 +3,7 @@ use std::future::pending; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; -use dashmap::DashSet; +use clashmap::ClashSet; use redis::streams::{StreamReadOptions, StreamReadReply}; use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; @@ -55,9 +55,9 @@ impl TryFrom<&Value> for ControlPlaneEvent { pub struct EndpointsCache { config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, + endpoints: ClashSet, + branches: ClashSet, + projects: ClashSet, ready: AtomicBool, limiter: Arc>, } @@ -69,9 +69,9 @@ impl EndpointsCache { config.limiter_info.clone(), ))), config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), + endpoints: ClashSet::new(), + branches: ClashSet::new(), + projects: ClashSet::new(), ready: AtomicBool::new(false), } } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index cab0b8b905..a5e71f1a87 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; use tokio::sync::Mutex; @@ -108,9 +108,9 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: ClashMap, - project2ep: DashMap>, + project2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -176,8 +176,8 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { impl ProjectInfoCacheImpl { pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { - cache: DashMap::new(), - project2ep: DashMap::new(), + cache: ClashMap::new(), + project2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -302,7 +302,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get() { + for endpoint in endpoints { self.cache.remove(endpoint); removed += 1; } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 34f708a36b..9a0b954341 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,3 +1,4 @@ +use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; @@ -8,7 +9,7 @@ use pq_proto::CancelKeyData; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; @@ -17,14 +18,11 @@ use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; -use crate::metrics::CancelChannelSizeGuard; -use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind}; +use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; use crate::tls::postgres_rustls::MakeRustlsConnect; -use 
std::convert::Infallible; -use tokio::sync::oneshot; type IpSubnetKey = IpNet; diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index d559d96bbc..b879f3a59f 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -6,7 +6,7 @@ use std::hash::Hash; use std::sync::Arc; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; @@ -148,7 +148,7 @@ impl ApiCaches { /// Various caches for [`control_plane`](super). pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: ClashMap>, config: RateLimiterConfig, timeout: Duration, epoch: std::time::Duration, @@ -180,7 +180,7 @@ impl ApiLocks { ) -> prometheus::Result { Ok(Self { name, - node_locks: DashMap::with_shard_amount(shards), + node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, @@ -238,7 +238,7 @@ impl ApiLocks { let mut lock = shard.write(); let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock - .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .extract_if(|(_, semaphore)| Arc::strong_count(semaphore) == 1) .count(); drop(lock); self.metrics.semaphores_unregistered.inc_by(count as u64); diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bff800f0a2..9645eaf725 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -2,7 +2,7 @@ use std::hash::Hash; use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use tokio::time::Instant; use tracing::info; @@ -14,7 +14,7 @@ use crate::intern::EndpointIdInt; pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { - map: DashMap, + map: ClashMap, config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -27,7 +27,7 @@ impl LeakyBucketRateLimiter { pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { - map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards), config: config.into(), access_count: AtomicUsize::new(0), } @@ -58,7 +58,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get().bucket_is_empty(now)); + .retain(|(_, value)| !value.bucket_is_empty(now)); } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index ec080f270b..ef6c39f230 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; use anyhow::bail; -use dashmap::DashMap; +use clashmap::ClashMap; use itertools::Itertools; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; @@ -62,7 +62,7 @@ impl GlobalRateLimiter { pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { - map: DashMap, Hasher>, + map: ClashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, @@ -202,7 +202,7 @@ impl BucketRateLimiter { info!(buckets = ?info, "endpoint rate limiter"); Self { info, - map: DashMap::with_hasher_and_shard_amount(hasher, 64), + map: ClashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC 
on the first request rand: Mutex::new(rand), } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index dddc7e2054..dcb9a59f87 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,7 +1,8 @@ +use std::io::ErrorKind; + use anyhow::Ok; use pq_proto::{id_to_cancel_key, CancelKeyData}; use serde::{Deserialize, Serialize}; -use std::io::ErrorKind; pub mod keyspace { pub const CANCEL_PREFIX: &str = "cancel"; diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index dcc6aac51b..3689bf7ae2 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,7 +1,6 @@ use redis::{AsyncCommands, ToRedisArgs}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; - use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub struct RedisKVClient { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 44eac77e8f..a300198de4 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; @@ -351,11 +351,11 @@ where // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - pub(crate) global_pool: DashMap>>, + pub(crate) global_pool: ClashMap>>, /// Number of endpoint-connection pools /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// [`ClashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. pub(crate) global_pool_size: AtomicUsize, @@ -396,7 +396,7 @@ where pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), + global_pool: ClashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), @@ -442,10 +442,10 @@ where .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; - shard.retain(|endpoint, x| { + shard.retain(|(endpoint, x)| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. 
- if let Some(pool) = Arc::get_mut(x.get_mut()) { + if let Some(pool) = Arc::get_mut(x) { let endpoints = pool.get_mut(); clients_removed = endpoints.clear_closed(); diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index e1cc7e87b4..d369e3742f 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -10,9 +10,9 @@ use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; +use clashmap::mapref::entry::Entry; +use clashmap::ClashMap; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::mapref::entry::Entry; -use dashmap::DashMap; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -137,7 +137,7 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { - endpoints: DashMap, FastHasher>, + endpoints: ClashMap, FastHasher>, } impl Metrics { @@ -213,7 +213,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, ) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); @@ -271,7 +271,7 @@ fn create_event_chunks<'a>( #[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, storage: Option<&GenericRemoteStorage>, From 6041a935918f34f06ec7f0fb93cc6fc97c1dc1dd Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 31 Jan 2025 10:54:31 +0100 Subject: [PATCH 03/77] Update tokio base crates (#10556) Update `tokio` base crates and their deps. Pin `tokio` to at least 1.41 which stabilized task ID APIs. To dedup `mio` dep the `notify` crate is updated. It's used in `compute_tools`. 
https://github.com/neondatabase/neon/blob/9f81828429ad6475b4fbb1a814240213b74bec63/compute_tools/src/pg_helpers.rs#L258-L367 --- Cargo.lock | 84 +++++++++++++++++++++++++++++++----------------------- Cargo.toml | 4 +-- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9cbebcd02..cada9604ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -966,7 +966,7 @@ version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -994,9 +994,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "block-buffer" @@ -1563,7 +1563,7 @@ version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "crossterm_winapi", "libc", "parking_lot 0.12.1", @@ -1794,7 +1794,7 @@ version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "byteorder", "chrono", "diesel_derives", @@ -3088,11 +3088,11 @@ dependencies = [ [[package]] name = "inotify" -version = "0.9.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.8.0", "inotify-sys", "libc", ] @@ -3269,9 +3269,9 @@ dependencies = [ [[package]] name = "kqueue" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98" +checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" dependencies = [ "kqueue-sys", "libc", @@ -3279,9 +3279,9 @@ dependencies = [ [[package]] name = "kqueue-sys" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" dependencies = [ "bitflags 1.3.2", "libc", @@ -3308,9 +3308,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libloading" @@ -3557,14 +3557,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.11" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", "log", "wasi 
0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -3610,7 +3610,7 @@ version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cfg-if", "libc", "memoffset 0.9.0", @@ -3628,12 +3628,11 @@ dependencies = [ [[package]] name = "notify" -version = "6.1.1" +version = "8.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" dependencies = [ - "bitflags 2.4.1", - "crossbeam-channel", + "bitflags 2.8.0", "filetime", "fsevent-sys", "inotify", @@ -3641,10 +3640,17 @@ dependencies = [ "libc", "log", "mio", + "notify-types", "walkdir", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] +[[package]] +name = "notify-types" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" + [[package]] name = "ntapi" version = "0.4.1" @@ -4705,7 +4711,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "flate2", "hex", @@ -4720,7 +4726,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "hex", ] @@ -5545,7 +5551,7 @@ version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys 0.4.14", @@ -6819,21 +6825,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.1" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -6864,9 +6869,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", @@ -7151,7 +7156,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", @@ -7654,9 +7659,9 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.3" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +checksum = 
"29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", @@ -7908,6 +7913,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.0" diff --git a/Cargo.toml b/Cargo.toml index 9d15b78a93..267a91d773 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -123,7 +123,7 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "6.0.0" +notify = "8.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" @@ -177,7 +177,7 @@ test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } -tokio = { version = "1.17", features = ["macros"] } +tokio = { version = "1.41", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" From 765ba43438f91bf75773e9de358dbf58f34c5d87 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 31 Jan 2025 13:33:24 +0300 Subject: [PATCH 04/77] Allow pageserver unreachable errors in test_scrubber_tenant_snapshot (#10585) ## Problem test_scrubber_tenant_snapshot restarts pageservers, but log validation fails tests on any non white listed storcon warnings, making the test flaky. ## Summary of changes Allow warns like 2025-01-29T12:37:42.622179Z WARN reconciler{seq=1 tenant_id=2011077aea9b4e8a60e8e8a19407634c shard_id=0004}: Call to node 2 (localhost:15352) management API failed, will retry (attempt 1): receive body: error sending request for url (http://localhost:15352/v1/tenant/2011077aea9b4e8a60e8e8a19407634c-0004/location_config): client error (Connect) ref https://github.com/neondatabase/neon/issues/10462 --- test_runner/regress/test_storage_scrubber.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 1304d302b7..7e92cc01cd 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -32,6 +32,12 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 env = neon_env_builder.init_start() + # We restart pageserver(s), which will cause storage storage controller + # requests to fail and warn. 
+ env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( + ".*Reconcile error.*error sending request for url.*" + ) tenant_id = env.initial_tenant timeline_id = env.initial_timeline branch = "main" From f09cfd11cb3029f9395c0ad0bccd61d2a848a6db Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 10:54:14 +0000 Subject: [PATCH 05/77] pageserver: exclude archived timelines from freeze+flush on shutdown (#10594) ## Problem If offloading races with normal shutdown, we get a "failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited". This is harmless but points to it being quite strange to try and freeze and flush such a timeline. flushing on shutdown for an archived timeline isn't useful. Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - During Timeline::shutdown, ignore ShutdownMode::FreezeAndFlush if the timeline is archived --- pageserver/src/tenant/timeline.rs | 71 ++++++++++++++++++------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 827601fa8b..d6a8eaa4d9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1818,7 +1818,7 @@ impl Timeline { self.last_record_lsn.shutdown(); if let ShutdownMode::FreezeAndFlush = mode { - if let Some((open, frozen)) = self + let do_flush = if let Some((open, frozen)) = self .layers .read() .await @@ -1827,43 +1827,54 @@ impl Timeline { .ok() .filter(|(open, frozen)| *open || *frozen > 0) { - tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + if self.remote_client.is_archived() == Some(true) { + // No point flushing on shutdown for an archived timeline: it is not important + // to have it nice and fresh after our restart, and trying to flush here might + // race with trying to offload it (which also stops the flush loop) + false + } else { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + true + } } else { - // this is double-shutdown, ignore it - } + // this is double-shutdown, it'll be a no-op + true + }; // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and // wait for it to drain. - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - self.remote_client.shutdown().await; + if do_flush { + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? 
+ self.remote_client.shutdown().await; + } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); + } } - Err(FlushLayerError::Cancelled) => { - // this is likely the second shutdown, ignore silently. - // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 - debug_assert!(self.cancel.is_cancelled()); - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); - } - } - // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but - // we also do a final check here to ensure that the queue is empty. - if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. + if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } } } From 7d5c70c717c5ad543fdbd6115e422e27e0e86da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 31 Jan 2025 12:23:12 +0100 Subject: [PATCH 06/77] Update AWS SDK crates (#10588) We want to keep the AWS SDK up to date as that way we benefit from new developments and improvements. 
Prior update was in #10056 --- Cargo.lock | 75 ++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cada9604ff..6b63c3c388 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -290,9 +290,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.10" +version = "1.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" +checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" dependencies = [ "aws-credential-types", "aws-runtime", @@ -301,7 +301,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.60.7", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -332,9 +332,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.4" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" +checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -366,7 +366,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -389,7 +389,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -414,7 +414,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -437,15 +437,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.50.0" +version = "1.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" +checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -459,15 +459,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" +checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -481,15 +481,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" +checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", 
"aws-smithy-runtime", "aws-smithy-runtime-api", @@ -504,9 +504,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.6" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" +checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -533,9 +533,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.1" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" dependencies = [ "futures-util", "pin-project-lite", @@ -565,9 +565,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.5" +version = "0.60.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" +checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" dependencies = [ "aws-smithy-types", "bytes", @@ -576,9 +576,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.11" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" +checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -597,18 +597,9 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.7" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" -dependencies = [ - "aws-smithy-types", -] - -[[package]] -name = "aws-smithy-json" -version = "0.61.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" dependencies = [ "aws-smithy-types", ] @@ -625,9 +616,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.4" +version = "1.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" +checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -669,9 +660,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.9" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" +checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" dependencies = [ "base64-simd", "bytes", @@ -704,9 +695,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.3" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" +checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" dependencies = [ "aws-credential-types", "aws-smithy-async", From afbcebe7f761dd555a3433aa34802b601367a82f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 31 Jan 2025 12:31:58 +0100 
Subject: [PATCH 07/77] test_runner: force-compact in `test_sharding_autosplit` (#10605) ## Problem This test may not fully detect data corruption during splits, since we don't force-compact the entire keyspace. ## Summary of changes Force-compact all data in `test_sharding_autosplit`. --- test_runner/performance/test_sharding_autosplit.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 76c3ad01a4..e5a9f17da8 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -247,7 +247,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"{shard_zero_id} timeline: {timeline_info}") # Run compaction for all tenants, restart endpoint so that on subsequent reads we will - # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # definitely hit pageserver for reads. This compaction pass is expected to drop unwanted # layers but not do any rewrites (we're still in the same generation) for tenant_id, tenant_state in tenants.items(): tenant_state.endpoint.stop() @@ -296,6 +296,16 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + # Run a full forced compaction, to detect any data corruption. + for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_compact( + shard_id, + tenant_state.timeline_id, + force_image_layer_creation=True, + force_l0_compaction=True, + ) + # Assert that some rewrites happened # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) From 89cff08354f7c2f2bbb0a92df2eca6de828fa4fe Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Fri, 31 Jan 2025 12:46:33 +0100 Subject: [PATCH 08/77] unify pg-build-nonroot-with-cargo base layer and config retries in curl (#10575) Ref: https://github.com/neondatabase/cloud/issues/23461 ## Problem Just made changes around and see these 2 base layers could be optimised. 
and after review comment from @myrrc setting up timeouts and retries in `alpine/curl` image ## Summary of changes --- compute/compute-node.Dockerfile | 43 +++++++++++++++------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1ef449f0b0..32226c56a5 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -825,11 +825,11 @@ RUN case "${PG_VERSION}" in "v17") \ ######################################################################################### # -# Layer "rust extensions" -# This layer is used to build `pgrx` deps +# Layer "pg build with nonroot user and cargo installed" +# This layer is base and common for layers with `pgrx` # ######################################################################################### -FROM pg-build AS rust-extensions-build +FROM pg-build AS pg-build-nonroot-with-cargo ARG PG_VERSION RUN apt update && \ @@ -847,8 +847,18 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - case "${PG_VERSION}" in \ + rm rustup-init + +######################################################################################### +# +# Layer "rust extensions" +# This layer is used to build `pgrx` deps +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \ esac && \ @@ -867,26 +877,10 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM pg-build AS rust-extensions-build-pgrx12 +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12 ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ - apt clean && rm -rf /var/lib/apt/lists/* && \ - useradd -ms /bin/bash nonroot -b /home - -ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:$PATH" -USER nonroot -WORKDIR /home/nonroot - -RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc - -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - cargo install --locked --version 0.12.9 cargo-pgrx && \ +RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -1283,7 +1277,8 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py -RUN if [ "$TARGETARCH" = "amd64" ]; then\ +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ + if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ 
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ From 503bc72d31ce15620479346dfa1081771a7f0a95 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Jan 2025 11:48:46 +0000 Subject: [PATCH 09/77] CI: add `diesel print-schema` check (#10527) ## Problem We want to check that `diesel print-schema` doesn't generate any changes (`storage_controller/src/schema.rs`) in comparison with the list of migration. ## Summary of changes - Add `diesel_cli` to `build-tools` image - Add `Check diesel schema` step to `build-neon` job, at this stage we have all required binaries, so don't need to compile anything additionally - Check runs only on x86 release builds to be sure we do it at least once per CI run. --- .github/workflows/_build-and-test-locally.yml | 20 +++++++++++++++++++ .github/workflows/_check-codestyle-rust.yml | 3 +++ build-tools.Dockerfile | 3 +++ 3 files changed, 26 insertions(+) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 2daed90386..e9483492c9 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -271,6 +271,26 @@ jobs: path: /tmp/neon aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Check diesel schema + if: inputs.build-type == 'release' && inputs.arch == 'x64' + env: + DATABASE_URL: postgresql://localhost:1235/storage_controller + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + run: | + /tmp/neon/bin/neon_local init + /tmp/neon/bin/neon_local storage_controller start + + diesel print-schema > storage_controller/src/schema.rs + + if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then + echo >&2 "Uncommitted changes in diesel schema" + + git diff . + exit 1 + fi + + /tmp/neon/bin/neon_local storage_controller stop + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: inputs.build-type == 'debug' diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index cbc47c6406..f7518d6500 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -16,6 +16,9 @@ defaults: run: shell: bash -euxo pipefail {0} +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + jobs: check-codestyle-rust: strategy: diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 9c13e480c1..dfcc7d06b4 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -261,6 +261,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -274,6 +275,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ + cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \ + --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git From dce617fe070e8528aba5a5628b138e2fe3eb4c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 31 Jan 2025 13:40:20 +0100 Subject: [PATCH 10/77] Update to rebased rust-postgres (#10584) Update to a rebased version of our rust-postgres patches, rebased on [this](https://github.com/sfackler/rust-postgres/commit/98f5a11bc0a8e451552d8941ffa078c7eb6cd60c) commit this time. With #10280 reapplied, this means that the rust-postgres crates will be deduplicated, as the new crate versions are finally compatible with the requirements of diesel-async. Earlier update: #10561 rust-postgres PR: https://github.com/neondatabase/rust-postgres/pull/39 --- Cargo.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b63c3c388..cdc620e485 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4465,8 +4465,8 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +version = "0.19.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "fallible-iterator", @@ -4479,9 +4479,9 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ - "base64 0.21.1", + "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", @@ -4513,7 +4513,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "chrono", @@ -6871,8 +6871,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.9" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +version = "0.7.10" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "async-trait", "byteorder", From 
10cf5e7a38d45037f3f51b666c519b7e5c6c72a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 31 Jan 2025 14:42:59 +0100 Subject: [PATCH 11/77] Move cargo-deny into a separate workflow on a schedule (#10289) ## Problem There are two (related) problems with the previous handling of `cargo-deny`: - When a new advisory is added to rustsec that affects a dependency, unrelated pull requests will fail. - New advisories rely on pushes or PRs to be surfaced. Problems that already exist on main will only be found if we try to merge new things into main. ## Summary of changes We split out `cargo-deny` into a separate workflow that runs on all PRs that touch `Cargo.lock`, and on a schedule on `main`, `release`, `release-compute` and `release-proxy` to find new advisories. --- .github/actionlint.yml | 1 + .github/file-filters.yaml | 1 + .github/workflows/_check-codestyle-rust.yml | 5 -- .github/workflows/build_and_test.yml | 39 +++++++++++++- .github/workflows/cargo-deny.yml | 57 +++++++++++++++++++++ .github/workflows/pre-merge-checks.yml | 3 +- 6 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/cargo-deny.yml diff --git a/.github/actionlint.yml b/.github/actionlint.yml index ecff0cc70b..2b96ce95da 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -27,3 +27,4 @@ config-variables: - SLACK_ON_CALL_QA_STAGING_STREAM - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_CICD_CHANNEL_ID diff --git a/.github/file-filters.yaml b/.github/file-filters.yaml index 886cd3919a..02ee383d5e 100644 --- a/.github/file-filters.yaml +++ b/.github/file-filters.yaml @@ -1,4 +1,5 @@ rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] +rust_dependencies: ['**/Cargo.lock'] v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index f7518d6500..c4c76914aa 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -87,8 +87,3 @@ jobs: run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e588fc5a0e..1274543429 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -45,6 +45,26 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + files-changed: + needs: [ check-permissions ] + runs-on: [ self-hosted, small ] + timeout-minutes: 3 + outputs: + check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Check for file changes + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + id: files-changed + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: .github/file-filters.yaml + tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] @@ -170,6 +190,14 @@ jobs: archs: '["x64", "arm64"]' secrets: inherit + check-dependencies-rust: + needs: [ files-changed, build-build-tools-image ] 
+ if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }} + uses: ./.github/workflows/cargo-deny.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit + build-and-test-locally: needs: [ tag, build-build-tools-image ] strategy: @@ -1332,6 +1360,8 @@ jobs: - build-and-test-locally - check-codestyle-python - check-codestyle-rust + - check-dependencies-rust + - files-changed - promote-images-dev - test-images - trigger-custom-extensions-build-and-wait @@ -1344,4 +1374,11 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || contains(needs.*.result, 'skipped') + || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true') + || needs.build-and-test-locally.result == 'skipped' + || needs.check-codestyle-python.result == 'skipped' + || needs.check-codestyle-rust.result == 'skipped' + || needs.files-changed.result == 'skipped' + || needs.promote-images-dev.result == 'skipped' + || needs.test-images.result == 'skipped' + || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml new file mode 100644 index 0000000000..433b377c32 --- /dev/null +++ b/.github/workflows/cargo-deny.yml @@ -0,0 +1,57 @@ +name: cargo deny checks + +on: + workflow_call: + inputs: + build-tools-image: + required: false + type: string + schedule: + - cron: '0 0 * * *' + +jobs: + cargo-deny: + strategy: + matrix: + ref: >- + ${{ + fromJSON( + github.event_name == 'schedule' + && '["main","release","release-proxy","release-compute"]' + || format('["{0}"]', github.sha) + ) + }} + + runs-on: [self-hosted, small] + + container: + image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ matrix.ref }} + + - name: Check rust licenses/bans/advisories/sources + env: + CARGO_DENY_TARGET: >- + ${{ github.event_name == 'schedule' && 'advisories' || 'all' }} + run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET + + - name: Post to a Slack channel + if: ${{ github.event_name == 'schedule' && failure() }} + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_CICD_CHANNEL_ID }} + text: | + Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + Pinging @oncall-devprod. 
diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e6dfbaeed8..e92a153db9 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -124,6 +124,7 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') From a93e9f22fc0e35ad2863d1e57db9f3a01326b710 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 17:43:31 +0000 Subject: [PATCH 12/77] pageserver: remove faulty debug assertion in compaction (#10610) ## Problem This assertion is incorrect: it is legal to see another shard's data at this point, after a shard split. Closes: https://github.com/neondatabase/neon/issues/10609 ## Summary of changes - Remove faulty assertion --- pageserver/src/tenant/timeline/compaction.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7242f73a82..7244e946cb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1503,11 +1503,9 @@ impl Timeline { .await .map_err(CompactionError::Other)?; } else { - let shard = self.shard_identity.shard_index(); let owner = self.shard_identity.get_shard_number(&key); - if cfg!(debug_assertions) { - panic!("key {key} does not belong on shard {shard}, owned by {owner}"); - } + + // This happens after a shard split, when we're compacting an L0 created by our parent shard debug!("dropping key {key} during compaction (it belongs on shard {owner})"); } From aedeb1c7c277703af861894455764a8c248df9b8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 17:43:54 +0000 Subject: [PATCH 13/77] pageserver: revise logging of cancelled request results (#10604) ## Problem When a client dropped before a request completed, and a handler returned an ApiError, we would log that at error severity. That was excessive in the case of a request erroring on a shutdown, and could cause test flakes. 
example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/13067651123/index.html#suites/ad9c266207b45eafe19909d1020dd987/6021ce86a0d72ae7/ ``` Cancelled request finished with an error: ShuttingDown ``` ## Summary of changes - Log a different info-level on ShuttingDown and ResourceUnavailable API errors from cancelled requests --- pageserver/src/http/routes.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index eb9cb4da0c..94f7510a4a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3393,7 +3393,17 @@ where let status = response.status(); info!(%status, "Cancelled request finished successfully") } - Err(e) => error!("Cancelled request finished with an error: {e:?}"), + Err(e) => match e { + ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => { + // Don't log this at error severity: they are normal during lifecycle of tenants/process + info!("Cancelled request aborted for shutdown") + } + _ => { + // Log these in a highly visible way, because we have no client to send the response to, but + // would like to know that something went wrong. + error!("Cancelled request finished with an error: {e:?}") + } + }, } } // only logging for cancelled panicked request handlers is the tracing_panic_hook, From 48c87dc458a84fa9132ff22b1ae1fcc3d3094cda Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Jan 2025 18:07:26 +0000 Subject: [PATCH 14/77] CI(pre-merge-checks): fix condition (#10617) ## Problem Merge Queue fails if changes include Rust code. ## Summary of changes - Fix condition for `build-build-tools-image` - Add a couple of no-op `false ||` to make predicates look symmetric --- .github/workflows/pre-merge-checks.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e92a153db9..d39ccecac9 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -59,7 +59,10 @@ jobs: echo "${RUST_CHANGED_FILES}" build-build-tools-image: - if: needs.get-changed-files.outputs.python-changed == 'true' + if: | + false + || needs.get-changed-files.outputs.python-changed == 'true' + || needs.get-changed-files.outputs.rust-changed == 'true' needs: [ get-changed-files ] uses: ./.github/workflows/build-build-tools-image.yml with: @@ -124,7 +127,8 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') - || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') + false + || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') From bc7822d90c82046de709b211faa03d3f720a6931 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 31 Jan 2025 19:41:17 +0100 Subject: [PATCH 15/77] temporarily disable some steps and run more often to expose more pgbench --initialize in benchmarking workflow (#10616) ## Problem we want to disable some steps in benchmarking workflow that do not initialize new projects and instead run the test more frequently Test run 
https://github.com/neondatabase/neon/actions/runs/13077737888 --- .github/workflows/benchmarking.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 49f23e895b..20a8a6e2c9 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,8 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 3 * * *' # run once a day, timezone is utc + # - cron: '0 3 * * *' # run once a day, timezone is utc + - cron: '0 */10 * * *' # Runs every 10 hours at minute 0 workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -550,6 +551,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + if: false permissions: contents: write statuses: write @@ -683,7 +685,8 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write @@ -810,7 +813,8 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write @@ -929,7 +933,8 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write From fcd195c2b63fdfbb5323258a5422469e1e850175 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 31 Jan 2025 13:04:26 -0600 Subject: [PATCH 16/77] Migrate compute_ctl arg parsing to clap derive (#10497) The primary benefit is that all the ad hoc get_matches() calls are no longer necessary. Now all it takes to get at the CLI arguments is referencing a struct member. It's also great the we can replace the ad hoc CLI struct we had with this more formal solution. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 348 +++++++++------------------ 1 file changed, 112 insertions(+), 236 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b98cf706d3..47fc9cb7fe 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,6 +34,7 @@ //! -r http://pg-ext-s3-gateway \ //! 
``` use std::collections::HashMap; +use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; @@ -44,7 +45,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; -use clap::Arg; +use clap::Parser; use compute_tools::disk_quota::set_disk_quota; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; @@ -73,10 +74,75 @@ use utils::failpoint_support; // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; -fn main() -> Result<()> { - let scenario = failpoint_support::init(); +// Compatibility hack: if the control plane specified any remote-ext-config +// use the default value for extension storage proxy gateway. +// Remove this once the control plane is updated to pass the gateway URL +fn parse_remote_ext_config(arg: &str) -> Result { + if arg.starts_with("http") { + Ok(arg.trim_end_matches('/').to_string()) + } else { + Ok("http://pg-ext-s3-gateway".to_string()) + } +} - let (build_tag, clap_args) = init()?; +#[derive(Parser)] +#[command(rename_all = "kebab-case")] +struct Cli { + #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] + pub pgbin: String, + + #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] + pub remote_ext_config: Option, + + #[arg(long, default_value_t = 3080)] + pub http_port: u16, + + #[arg(short = 'D', long, value_name = "DATADIR")] + pub pgdata: String, + + #[arg(short = 'C', long, value_name = "DATABASE_URL")] + pub connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "neon-postgres")] + pub cgroup: String, + + #[cfg(target_os = "linux")] + #[arg( + long, + default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor" + )] + pub filecache_connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "0.0.0.0:10301")] + pub vm_monitor_addr: String, + + #[arg(long, action = clap::ArgAction::SetTrue)] + pub resize_swap_on_bind: bool, + + #[arg(long)] + pub set_disk_quota_for_fs: Option, + + #[arg(short = 's', long = "spec", group = "spec")] + pub spec_json: Option, + + #[arg(short = 'S', long, group = "spec-path")] + pub spec_path: Option, + + #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])] + pub compute_id: Option, + + #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")] + pub control_plane_uri: Option, +} + +fn main() -> Result<()> { + let cli = Cli::parse(); + + let build_tag = init()?; + + let scenario = failpoint_support::init(); // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -85,13 +151,11 @@ fn main() -> Result<()> { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); - let cli_args = process_cli(&clap_args)?; + let cli_spec = try_spec_from_cli(&cli)?; - let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + let compute = wait_spec(build_tag, &cli, cli_spec)?; - let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; - - start_postgres(&clap_args, wait_spec_result)? + start_postgres(&cli, compute)? 
// Startup is finished, exit the startup tracing span }; @@ -108,7 +172,7 @@ fn main() -> Result<()> { deinit_and_exit(wait_pg_result); } -fn init() -> Result<(String, clap::ArgMatches)> { +fn init() -> Result { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -123,66 +187,7 @@ fn init() -> Result<(String, clap::ArgMatches)> { .to_string(); info!("build_tag: {build_tag}"); - Ok((build_tag, cli().get_matches())) -} - -fn process_cli(matches: &clap::ArgMatches) -> Result { - let pgbin_default = "postgres"; - let pgbin = matches - .get_one::("pgbin") - .map(|s| s.as_str()) - .unwrap_or(pgbin_default); - - let ext_remote_storage = matches - .get_one::("remote-ext-config") - // Compatibility hack: if the control plane specified any remote-ext-config - // use the default value for extension storage proxy gateway. - // Remove this once the control plane is updated to pass the gateway URL - .map(|conf| { - if conf.starts_with("http") { - conf.trim_end_matches('/') - } else { - "http://pg-ext-s3-gateway" - } - }); - - let http_port = *matches - .get_one::("http-port") - .expect("http-port is required"); - let pgdata = matches - .get_one::("pgdata") - .expect("PGDATA path is required"); - let connstr = matches - .get_one::("connstr") - .expect("Postgres connection string is required"); - let spec_json = matches.get_one::("spec"); - let spec_path = matches.get_one::("spec-path"); - let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); - let set_disk_quota_for_fs = matches.get_one::("set-disk-quota-for-fs"); - - Ok(ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - http_port, - spec_json, - spec_path, - resize_swap_on_bind, - set_disk_quota_for_fs, - }) -} - -struct ProcessCliResult<'clap> { - connstr: &'clap str, - pgdata: &'clap str, - pgbin: &'clap str, - ext_remote_storage: Option<&'clap str>, - http_port: u16, - spec_json: Option<&'clap String>, - spec_path: Option<&'clap String>, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option<&'clap String>, + Ok(build_tag) } fn startup_context_from_env() -> Option { @@ -235,19 +240,9 @@ fn startup_context_from_env() -> Option { } } -fn try_spec_from_cli( - matches: &clap::ArgMatches, - ProcessCliResult { - spec_json, - spec_path, - .. 
- }: &ProcessCliResult, -) -> Result { - let compute_id = matches.get_one::("compute-id"); - let control_plane_uri = matches.get_one::("control-plane-uri"); - +fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument - if let Some(spec_json) = spec_json { + if let Some(ref spec_json) = cli.spec_json { info!("got spec from cli argument {}", spec_json); return Ok(CliSpecParams { spec: Some(serde_json::from_str(spec_json)?), @@ -256,7 +251,7 @@ fn try_spec_from_cli( } // Second, try to read it from the file if path is provided - if let Some(spec_path) = spec_path { + if let Some(ref spec_path) = cli.spec_path { let file = File::open(Path::new(spec_path))?; return Ok(CliSpecParams { spec: Some(serde_json::from_reader(file)?), @@ -264,17 +259,20 @@ fn try_spec_from_cli( }); } - let Some(compute_id) = compute_id else { + if cli.compute_id.is_none() { panic!( "compute spec should be provided by one of the following ways: \ --spec OR --spec-path OR --control-plane-uri and --compute-id" ); }; - let Some(control_plane_uri) = control_plane_uri else { + if cli.control_plane_uri.is_none() { panic!("must specify both --control-plane-uri and --compute-id or none"); }; - match get_spec_from_control_plane(control_plane_uri, compute_id) { + match get_spec_from_control_plane( + cli.control_plane_uri.as_ref().unwrap(), + cli.compute_id.as_ref().unwrap(), + ) { Ok(spec) => Ok(CliSpecParams { spec, live_config_allowed: true, @@ -298,21 +296,12 @@ struct CliSpecParams { fn wait_spec( build_tag: String, - ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - resize_swap_on_bind, - set_disk_quota_for_fs, - http_port, - .. - }: ProcessCliResult, + cli: &Cli, CliSpecParams { spec, live_config_allowed, }: CliSpecParams, -) -> Result { +) -> Result> { let mut new_state = ComputeState::new(); let spec_set; @@ -324,7 +313,7 @@ fn wait_spec( } else { spec_set = false; } - let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; let conn_conf = postgres::config::Config::from_str(connstr.as_str()) .context("cannot build postgres config from connstr")?; let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) @@ -333,14 +322,14 @@ fn wait_spec( connstr, conn_conf, tokio_conn_conf, - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - pgversion: get_pg_version_string(pgbin), - http_port, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + http_port: cli.http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), - ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), + ext_remote_storage: cli.remote_ext_config.clone(), ext_download_progress: RwLock::new(HashMap::new()), build_tag, }; @@ -357,7 +346,7 @@ fn wait_spec( // Launch http service first, so that we can serve control-plane requests // while configuration is still in progress. let _http_handle = - launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); + launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread"); if !spec_set { // No spec provided, hang waiting for it. 
@@ -389,27 +378,12 @@ fn wait_spec( launch_lsn_lease_bg_task_for_static(&compute); - Ok(WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), - }) -} - -struct WaitSpecResult { - compute: Arc, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option, + Ok(compute) } fn start_postgres( - // need to allow unused because `matches` is only used if target_os = "linux" - #[allow(unused_variables)] matches: &clap::ArgMatches, - WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs, - }: WaitSpecResult, + cli: &Cli, + compute: Arc, ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); @@ -437,7 +411,7 @@ fn start_postgres( let mut delay_exit = false; // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion // *before* starting postgres. // @@ -464,9 +438,9 @@ fn start_postgres( // Set disk quota if the compute spec says so if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, set_disk_quota_for_fs) + (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) { - match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) { + match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { Ok(()) => { let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. info!(%disk_quota_bytes, %size_mib, "set disk quota"); @@ -509,13 +483,7 @@ fn start_postgres( if #[cfg(target_os = "linux")] { use std::env; use tokio_util::sync::CancellationToken; - let vm_monitor_addr = matches - .get_one::("vm-monitor-addr") - .expect("--vm-monitor-addr should always be set because it has a default arg"); - let file_cache_connstr = matches.get_one::("filecache-connstr"); - let cgroup = matches.get_one::("cgroup"); - // Only make a runtime if we need to. // Note: it seems like you can make a runtime in an inner scope and // if you start a task in it it won't be dropped. However, make it // in the outermost scope just to be safe. @@ -538,15 +506,15 @@ fn start_postgres( let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { None } else { - file_cache_connstr.cloned() + Some(cli.filecache_connstr.clone()) }; let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { - cgroup: cgroup.cloned(), + cgroup: Some(cli.cgroup.clone()), pgconnstr, - addr: vm_monitor_addr.clone(), + addr: cli.vm_monitor_addr.clone(), })), token.clone(), )) @@ -702,105 +670,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! 
{ exit(exit_code.unwrap_or(1)) } -fn cli() -> clap::Command { - // Env variable is set by `cargo` - let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); - clap::Command::new("compute_ctl") - .version(version) - .arg( - Arg::new("http-port") - .long("http-port") - .value_name("HTTP_PORT") - .default_value("3080") - .value_parser(clap::value_parser!(u16)) - .required(false), - ) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .default_value("postgres") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .arg( - Arg::new("compute-id") - .short('i') - .long("compute-id") - .value_name("COMPUTE_ID"), - ) - .arg( - Arg::new("control-plane-uri") - .short('p') - .long("control-plane-uri") - .value_name("CONTROL_PLANE_API_BASE_URI"), - ) - .arg( - Arg::new("remote-ext-config") - .short('r') - .long("remote-ext-config") - .value_name("REMOTE_EXT_CONFIG"), - ) - // TODO(fprasx): we currently have default arguments because the cloud PR - // to pass them in hasn't been merged yet. We should get rid of them once - // the PR is merged. - .arg( - Arg::new("vm-monitor-addr") - .long("vm-monitor-addr") - .default_value("0.0.0.0:10301") - .value_name("VM_MONITOR_ADDR"), - ) - .arg( - Arg::new("cgroup") - .long("cgroup") - .default_value("neon-postgres") - .value_name("CGROUP"), - ) - .arg( - Arg::new("filecache-connstr") - .long("filecache-connstr") - .default_value( - "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor", - ) - .value_name("FILECACHE_CONNSTR"), - ) - .arg( - Arg::new("resize-swap-on-bind") - .long("resize-swap-on-bind") - .action(clap::ArgAction::SetTrue), - ) - .arg( - Arg::new("set-disk-quota-for-fs") - .long("set-disk-quota-for-fs") - .value_name("SET_DISK_QUOTA_FOR_FS") - ) -} - /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. @@ -810,7 +679,14 @@ fn handle_exit_signal(sig: i32) { exit(1); } -#[test] -fn verify_cli() { - cli().debug_assert() +#[cfg(test)] +mod test { + use clap::CommandFactory; + + use super::Cli; + + #[test] + fn verify_cli() { + Cli::command().debug_assert() + } } From ad1a41157affa94c9e818239b7c2d9fd26bb3de6 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 31 Jan 2025 19:14:27 +0000 Subject: [PATCH 17/77] feat(proxy): optimizing the chances of large write in copy_bidirectional (#10608) We forked copy_bidirectional to solve some issues like fast-shutdown (disallowing half-open connections) and to introduce better error tracking (which side of the conn closed down). A change recently made its way upstream offering performance improvements: https://github.com/tokio-rs/tokio/pull/6532. These seem applicable to our fork, thus it makes sense to apply them here as well. 
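For readers skimming the diff below, the core idea of the upstream tokio change is easy to state outside the async machinery: instead of draining the copy buffer completely before issuing the next read, keep topping it up whenever spare capacity remains, so each write is handed as much data as possible. The following standalone sketch is only an illustration, not the forked proxy code: it uses blocking `std::io`, the `pump` helper name is made up, and it omits the async subtlety that the real code only tops up when the reader is immediately ready and otherwise writes what it already has.

```rust
use std::io::{Read, Result, Write};

/// Copy `reader` into `writer` through a fixed buffer, preferring large writes:
/// keep reading while the buffer has spare capacity, then write the whole
/// buffered region in one go. A simplified, blocking analogue of the change.
fn pump<R: Read, W: Write>(mut reader: R, mut writer: W) -> Result<u64> {
    let mut buf = vec![0u8; 8 * 1024];
    let mut cap = 0usize;      // bytes currently buffered
    let mut read_done = false; // reader reached EOF
    let mut amt = 0u64;        // total bytes copied

    loop {
        // Top up the buffer while there is room, maximizing the next write.
        while cap < buf.len() && !read_done {
            let n = reader.read(&mut buf[cap..])?;
            if n == 0 {
                read_done = true;
            } else {
                cap += n;
            }
        }

        // Nothing buffered and nothing left to read: flush and finish.
        if cap == 0 && read_done {
            writer.flush()?;
            return Ok(amt);
        }

        // Write out everything buffered; the buffer is then empty again.
        writer.write_all(&buf[..cap])?;
        amt += cap as u64;
        cap = 0;
    }
}

fn main() -> Result<()> {
    let src: &[u8] = b"hello, large writes";
    let mut dst = Vec::new();
    let copied = pump(src, &mut dst)?;
    assert_eq!(copied as usize, dst.len());
    println!("copied {copied} bytes");
    Ok(())
}
```

The actual patch applies the same "read while space remains" rule inside `CopyBuffer::poll_write_buf`, and additionally resets `pos`/`cap` only after a full write completes, as shown in the hunks that follow.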
--- proxy/src/proxy/copy_bidirectional.rs | 35 +++++++++++++++------------ 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 3336a9556a..861f1766e8 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -201,25 +201,26 @@ impl CopyBuffer { W: AsyncWrite + ?Sized, { loop { - // If our buffer is empty, then we need to read some data to - // continue. - if self.pos == self.cap && !self.read_done { - self.pos = 0; - self.cap = 0; - + // If there is some space left in our buffer, then we try to read some + // data to continue, thus maximizing the chances of a large write. + if self.cap < self.buf.len() && !self.read_done { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => { - // Try flushing when the reader has no progress to avoid deadlock - // when the reader depends on buffered writer. - if self.need_flush { - ready!(writer.as_mut().poll_flush(cx)) - .map_err(ErrorDirection::Write)?; - self.need_flush = false; - } + // Ignore pending reads when our buffer is not empty, because + // we can try to write data immediately. + if self.pos == self.cap { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; + self.need_flush = false; + } - return Poll::Pending; + return Poll::Pending; + } } } } @@ -246,9 +247,13 @@ impl CopyBuffer { "writer returned length larger than input slice" ); + // All data has been written, the buffer can be considered empty again + self.pos = 0; + self.cap = 0; + // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. 
- if self.pos == self.cap && self.read_done { + if self.read_done { ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } From 6dd48ba148af2eaf90c9d8b5505a760a9995f173 Mon Sep 17 00:00:00 2001 From: Stefan Radig Date: Fri, 31 Jan 2025 21:32:57 +0100 Subject: [PATCH 18/77] feat(proxy): Implement access control with VPC endpoint checks and block for public internet / VPC (#10143) - Wired up filtering on VPC endpoints - Wired up block access from public internet / VPC depending on per project flag - Added cache invalidation for VPC endpoints (partially based on PR from Raphael) - Removed BackendIpAllowlist trait --------- Co-authored-by: Ivan Efremov --- proxy/src/auth/backend/console_redirect.rs | 32 ++- proxy/src/auth/backend/mod.rs | 154 ++++++++---- proxy/src/auth/mod.rs | 25 ++ proxy/src/bin/local_proxy.rs | 1 + proxy/src/bin/proxy.rs | 1 + proxy/src/cache/project_info.rs | 224 +++++++++++++++++- proxy/src/cancellation.rs | 55 ++++- proxy/src/config.rs | 1 + proxy/src/console_redirect_proxy.rs | 3 +- proxy/src/context/mod.rs | 11 +- .../control_plane/client/cplane_proxy_v1.rs | 174 +++++++++++++- proxy/src/control_plane/client/mock.rs | 42 +++- proxy/src/control_plane/client/mod.rs | 51 +++- proxy/src/control_plane/messages.rs | 26 +- proxy/src/control_plane/mod.rs | 33 ++- proxy/src/intern.rs | 22 +- proxy/src/metrics.rs | 13 + proxy/src/proxy/mod.rs | 3 +- proxy/src/proxy/tests/mod.rs | 16 +- proxy/src/redis/notifications.rs | 48 +++- proxy/src/serverless/backend.rs | 42 +++- proxy/src/types.rs | 2 + 22 files changed, 845 insertions(+), 134 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 1cbf91d3ae..9be29c38c9 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -7,8 +7,8 @@ use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use super::{ComputeCredentialKeys, ControlPlaneApi}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use super::ComputeCredentialKeys; +use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; @@ -84,26 +84,15 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -#[async_trait] -impl BackendIpAllowlist for ConsoleRedirectBackend { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - self.api - .get_allowed_ips_and_secret(ctx, user_info) - .await - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) - } -} - impl ConsoleRedirectBackend { pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { Self { console_uri, api } } + pub(crate) fn get_api(&self) -> &cplane_proxy_v1::NeonControlPlaneClient { + &self.api + } + pub(crate) async fn authenticate( &self, ctx: &RequestContext, @@ -191,6 +180,15 @@ async fn authenticate( } } + // Check if the access over the public internet is allowed, otherwise block. Note that + // the console redirect is not behind the VPC service endpoint, so we don't need to check + // the VPC endpoint ID. 
+ if let Some(public_access_allowed) = db_info.public_access_allowed { + if !public_access_allowed { + return Err(auth::AuthError::NetworkNotAllowed); + } + } + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; // This config should be self-contained, because we won't diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index d17d91a56d..7ef096207a 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -26,10 +26,12 @@ use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::{ - self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, + self, AccessBlockerFlags, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, }; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; @@ -99,6 +101,13 @@ impl Backend<'_, T> { Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } + + pub(crate) fn get_api(&self) -> &ControlPlaneClient { + match self { + Self::ControlPlane(api, _) => api, + Self::Local(_) => panic!("Local backend has no API"), + } + } } impl<'a, T> Backend<'a, T> { @@ -247,15 +256,6 @@ impl AuthenticationConfig { } } -#[async_trait::async_trait] -pub(crate) trait BackendIpAllowlist { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result>; -} - /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -282,23 +282,51 @@ async fn auth_quirks( Ok(info) => (info, None), }; - debug!("fetching user's authentication info"); - let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; + debug!("fetching authentication info and allowlists"); // check allowed list - if config.ip_allowlist_check_enabled - && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) - { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + let allowed_ips = if config.ip_allowlist_check_enabled { + let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + } + allowed_ips + } else { + Cached::new_uncached(Arc::new(vec![])) + }; + + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = api.get_block_public_or_vpc_access(ctx, &info).await?; + if config.is_vpc_acccess_proxy { + if access_blocks.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(AuthError::MissingEndpointName), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed( + incoming_vpc_endpoint_id, + )); + } + } else if access_blocks.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => api.get_role_secret(ctx, &info).await?, - }; + let cached_secret = api.get_role_secret(ctx, &info).await?; let (cached_entry, secret) = cached_secret.take_value(); let secret = if let Some(secret) = secret { @@ -440,34 +468,38 @@ impl Backend<'_, ComputeUserInfo> { } } - pub(crate) async fn get_allowed_ips_and_secret( + pub(crate) async fn get_allowed_ips( &self, ctx: &RequestContext, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { match self { - Self::ControlPlane(api, user_info) => { - api.get_allowed_ips_and_secret(ctx, user_info).await - } - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::ControlPlane(api, user_info) => api.get_allowed_ips(ctx, user_info).await, + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), } } -} -#[async_trait::async_trait] -impl BackendIpAllowlist for Backend<'_, ()> { - async fn get_allowed_ips( + pub(crate) async fn get_allowed_vpc_endpoint_ids( &self, ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - let auth_data = match self { - Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), - }; + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => { + api.get_allowed_vpc_endpoint_ids(ctx, user_info).await + } + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), + } + } - auth_data - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) + pub(crate) async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => { + api.get_block_public_or_vpc_access(ctx, user_info).await + } + Self::Local(_) => Ok(Cached::new_uncached(AccessBlockerFlags::default())), + } } } @@ -514,7 +546,10 @@ mod tests { use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; - use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; + use crate::control_plane::{ + self, AccessBlockerFlags, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, + }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; use crate::scram::threadpool::ThreadPool; @@ -523,6 +558,8 @@ mod tests { struct Auth { ips: Vec, + vpc_endpoint_ids: Vec, + access_blocker_flags: AccessBlockerFlags, secret: AuthSecret, } @@ -535,17 +572,31 @@ mod tests { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, - ) -> Result< - (CachedAllowedIps, Option), - control_plane::errors::GetAuthInfoError, - > { - Ok(( - CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), - Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + ) -> Result { + 
Ok(CachedAllowedIps::new_uncached(Arc::new(self.ips.clone()))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAllowedVpcEndpointIds::new_uncached(Arc::new( + self.vpc_endpoint_ids.clone(), + ))) + } + + async fn get_block_public_or_vpc_access( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAccessBlockerFlags::new_uncached( + self.access_blocker_flags.clone(), )) } @@ -575,6 +626,7 @@ mod tests { rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: false, console_redirect_confirmation_timeout: std::time::Duration::from_secs(5), @@ -642,6 +694,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -722,6 +776,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -774,6 +830,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0198cc306e..6082695a6b 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -55,6 +55,12 @@ pub(crate) enum AuthError { )] MissingEndpointName, + #[error( + "VPC endpoint ID is not specified. \ + This endpoint requires a VPC endpoint ID to connect." + )] + MissingVPCEndpointId, + #[error("password authentication failed for user '{0}'")] PasswordFailed(Box), @@ -69,6 +75,15 @@ pub(crate) enum AuthError { )] IpAddressNotAllowed(IpAddr), + #[error("This connection is trying to access this endpoint from a blocked network.")] + NetworkNotAllowed, + + #[error( + "This VPC endpoint id {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console." + )] + VpcEndpointIdNotAllowed(String), + #[error("Too many connections to this endpoint. 
Please try again later.")] TooManyConnections, @@ -95,6 +110,10 @@ impl AuthError { AuthError::IpAddressNotAllowed(ip) } + pub(crate) fn vpc_endpoint_id_not_allowed(id: String) -> Self { + AuthError::VpcEndpointIdNotAllowed(id) + } + pub(crate) fn too_many_connections() -> Self { AuthError::TooManyConnections } @@ -122,8 +141,11 @@ impl UserFacingError for AuthError { Self::BadAuthMethod(_) => self.to_string(), Self::MalformedPassword(_) => self.to_string(), Self::MissingEndpointName => self.to_string(), + Self::MissingVPCEndpointId => self.to_string(), Self::Io(_) => "Internal error".to_string(), Self::IpAddressNotAllowed(_) => self.to_string(), + Self::NetworkNotAllowed => self.to_string(), + Self::VpcEndpointIdNotAllowed(_) => self.to_string(), Self::TooManyConnections => self.to_string(), Self::UserTimeout(_) => self.to_string(), Self::ConfirmationTimeout(_) => self.to_string(), @@ -142,8 +164,11 @@ impl ReportableError for AuthError { Self::BadAuthMethod(_) => crate::error::ErrorKind::User, Self::MalformedPassword(_) => crate::error::ErrorKind::User, Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::MissingVPCEndpointId => crate::error::ErrorKind::User, Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::NetworkNotAllowed => crate::error::ErrorKind::User, + Self::VpcEndpointIdNotAllowed(_) => crate::error::ErrorKind::User, Self::TooManyConnections => crate::error::ErrorKind::RateLimit, Self::UserTimeout(_) => crate::error::ErrorKind::User, Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index ee8b3d4ef5..7a855bf54b 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -284,6 +284,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: true, console_redirect_confirmation_timeout: Duration::ZERO, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index e1affe8391..de685a82c6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -630,6 +630,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, accept_jwts: args.is_auth_broker, console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index a5e71f1a87..7651eb71a2 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -15,13 +15,16 @@ use tracing::{debug, info}; use super::{Cache, Cached}; use crate::auth::IpPattern; use crate::config::ProjectInfoCacheOptions; -use crate::control_plane::AuthSecret; -use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::control_plane::{AccessBlockerFlags, AuthSecret}; +use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn 
invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec); + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt); + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); async fn decrement_active_listeners(&self); async fn increment_active_listeners(&self); @@ -51,6 +54,8 @@ impl From for Entry { struct EndpointInfo { secret: std::collections::HashMap>>, allowed_ips: Option>>>, + block_public_or_vpc_access: Option>, + allowed_vpc_endpoint_ids: Option>>>, } impl EndpointInfo { @@ -92,9 +97,52 @@ impl EndpointInfo { } None } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Arc>, bool)> { + if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids { + if valid_since < allowed_vpc_endpoint_ids.created_at { + return Some(( + allowed_vpc_endpoint_ids.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + allowed_vpc_endpoint_ids.created_at, + ), + )); + } + } + None + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(AccessBlockerFlags, bool)> { + if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access { + if valid_since < block_public_or_vpc_access.created_at { + return Some(( + block_public_or_vpc_access.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + block_public_or_vpc_access.created_at, + ), + )); + } + } + None + } + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } + pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) { + self.allowed_vpc_endpoint_ids = None; + } + pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) { + self.block_public_or_vpc_access = None; + } pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { self.secret.remove(&role_name); } @@ -111,6 +159,8 @@ pub struct ProjectInfoCacheImpl { cache: ClashMap, project2ep: ClashMap>, + // FIXME(stefan): we need a way to GC the account2ep map. 
+ account2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -120,6 +170,63 @@ pub struct ProjectInfoCacheImpl { #[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec) { + info!( + "invalidating allowed vpc endpoint ids for projects `{}`", + project_ids + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(", ") + ); + for project_id in project_ids { + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + } + + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) { + info!( + "invalidating allowed vpc endpoint ids for org `{}`", + account_id + ); + let endpoints = self + .account2ep + .get(&account_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) { + info!( + "invalidating block public or vpc access for project `{}`", + project_id + ); + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } + } + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self @@ -178,6 +285,7 @@ impl ProjectInfoCacheImpl { Self { cache: ClashMap::new(), project2ep: ClashMap::new(), + account2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -226,6 +334,49 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + endpoint_id: &EndpointId, + ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_allowed_vpc_endpoint_ids(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_allowed_vpc_endpoint_ids(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + endpoint_id: &EndpointId, + ) -> Option> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_block_public_or_vpc_access(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_block_public_or_vpc_access(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn insert_role_secret( &self, project_id: ProjectIdInt, @@ -256,6 +407,43 @@ impl ProjectInfoCacheImpl { self.insert_project2endpoint(project_id, 
endpoint_id); self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } + pub(crate) fn insert_allowed_vpc_endpoint_ids( + &self, + account_id: Option, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + allowed_vpc_endpoint_ids: Arc>, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + if let Some(account_id) = account_id { + self.insert_account2endpoint(account_id, endpoint_id); + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .allowed_vpc_endpoint_ids = Some(allowed_vpc_endpoint_ids.into()); + } + pub(crate) fn insert_block_public_or_vpc_access( + &self, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + access_blockers: AccessBlockerFlags, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .block_public_or_vpc_access = Some(access_blockers.into()); + } + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { endpoints.insert(endpoint_id); @@ -264,6 +452,14 @@ impl ProjectInfoCacheImpl { .insert(project_id, HashSet::from([endpoint_id])); } } + fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) { + endpoints.insert(endpoint_id); + } else { + self.account2ep + .insert(account_id, HashSet::from([endpoint_id])); + } + } fn get_cache_times(&self) -> (Instant, Option) { let mut valid_since = Instant::now() - self.config.ttl; // Only ignore cache if ttl is disabled. 
@@ -334,11 +530,25 @@ impl CachedLookupInfo { lookup_type: LookupType::AllowedIps, } } + pub(self) fn new_allowed_vpc_endpoint_ids(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::AllowedVpcEndpointIds, + } + } + pub(self) fn new_block_public_or_vpc_access(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::BlockPublicOrVpcAccess, + } + } } enum LookupType { RoleSecret(RoleNameInt), AllowedIps, + AllowedVpcEndpointIds, + BlockPublicOrVpcAccess, } impl Cache for ProjectInfoCacheImpl { @@ -360,6 +570,16 @@ impl Cache for ProjectInfoCacheImpl { endpoint_info.invalidate_allowed_ips(); } } + LookupType::AllowedVpcEndpointIds => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + LookupType::BlockPublicOrVpcAccess => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } } } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 9a0b954341..4d919f374a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -12,13 +12,15 @@ use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use crate::auth::backend::ComputeUserInfo; use crate::auth::{check_peer_addr_is_in_list, AuthError}; use crate::config::ComputeConfig; use crate::context::RequestContext; +use crate::control_plane::ControlPlaneApi; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; +use crate::protocol2::ConnectionInfoExtra; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; @@ -133,6 +135,9 @@ pub(crate) enum CancelError { #[error("IP is not allowed")] IpNotAllowed, + #[error("VPC endpoint id is not allowed to connect")] + VpcEndpointIdNotAllowed, + #[error("Authentication backend error")] AuthError(#[from] AuthError), @@ -152,8 +157,9 @@ impl ReportableError for CancelError { } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, - CancelError::IpNotAllowed => crate::error::ErrorKind::User, - CancelError::NotFound => crate::error::ErrorKind::User, + CancelError::IpNotAllowed + | CancelError::VpcEndpointIdNotAllowed + | CancelError::NotFound => crate::error::ErrorKind::User, CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, CancelError::InternalError => crate::error::ErrorKind::Service, } @@ -265,11 +271,12 @@ impl CancellationHandler { /// Will fetch IP allowlist internally. 
/// /// return Result primarily for tests - pub(crate) async fn cancel_session( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, - check_allowed: bool, + check_ip_allowed: bool, + check_vpc_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { let subnet_key = match ctx.peer_addr() { @@ -304,11 +311,11 @@ impl CancellationHandler { return Err(CancelError::NotFound); }; - if check_allowed { + if check_ip_allowed { let ip_allowlist = auth_backend .get_allowed_ips(&ctx, &cancel_closure.user_info) .await - .map_err(CancelError::AuthError)?; + .map_err(|e| CancelError::AuthError(e.into()))?; if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { // log it here since cancel_session could be spawned in a task @@ -320,6 +327,40 @@ impl CancellationHandler { } } + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = auth_backend + .get_block_public_or_vpc_access(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + + if check_vpc_allowed { + if access_blocks.vpc_access_blocked { + return Err(CancelError::AuthError(AuthError::NetworkNotAllowed)); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + let allowed_vpc_endpoint_ids = auth_backend + .get_allowed_vpc_endpoint_ids(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + } else if access_blocks.public_access_blocked { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + Metrics::get() .proxy .cancellation_requests_total diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8502edcfab..1dcd37712e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -68,6 +68,7 @@ pub struct AuthenticationConfig { pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, pub ip_allowlist_check_enabled: bool, + pub is_vpc_acccess_proxy: bool, pub jwks_cache: JwkCache, pub is_auth_broker: bool, pub accept_jwts: bool, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 78bfb6deac..c4548a7ddd 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -182,7 +182,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - backend, + config.authentication_config.is_vpc_acccess_proxy, + backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index a9fb513d3c..3236b2e1bf 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; -use crate::protocol2::ConnectionInfo; +use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; @@ -312,6 +312,15 @@ impl RequestContext { .ip() } + pub(crate) fn extra(&self) -> Option { + self.0 + .try_lock() + .expect("should not deadlock") + .conn_info + .extra + .clone() + } + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index ece03156d1..ef6621fc59 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -22,7 +22,8 @@ use crate::control_plane::errors::{ use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; use crate::control_plane::{ - AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, + AccessBlockerFlags, AuthInfo, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; @@ -137,9 +138,6 @@ impl NeonControlPlaneClient { } }; - // Ivan: don't know where it will be used, so I leave it here - let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); - let secret = if body.role_secret.is_empty() { None } else { @@ -153,10 +151,23 @@ impl NeonControlPlaneClient { .proxy .allowed_ips_number .observe(allowed_ips.len() as f64); + let allowed_vpc_endpoint_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_vpc_endpoint_ids + .observe(allowed_vpc_endpoint_ids.len() as f64); + let block_public_connections = body.block_public_connections.unwrap_or_default(); + let block_vpc_connections = body.block_vpc_connections.unwrap_or_default(); 
Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids, project_id: body.project_id, + account_id: body.account_id, + access_blocker_flags: AccessBlockerFlags { + public_access_blocked: block_public_connections, + vpc_access_blocked: block_vpc_connections, + }, }) } .inspect_err(|e| tracing::debug!(error = ?e)) @@ -299,6 +310,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -312,24 +324,35 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, Arc::new(auth_info.allowed_ips), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_vpc_endpoint_ids), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + auth_info.access_blocker_flags, + ); ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy - .allowed_ips_cache_misses + .allowed_ips_cache_misses // TODO SR: Should we rename this variable to something like allowed_ip_cache_stats? .inc(CacheOutcome::Hit); - return Ok((allowed_ips, None)); + return Ok(allowed_ips); } Metrics::get() .proxy @@ -337,7 +360,10 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; let user = &user_info.user; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -351,12 +377,136 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, allowed_ips.clone(), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); ctx.set_project_id(project_id); } - Ok(( - Cached::new_uncached(allowed_ips), - Some(Cached::new_uncached(auth_info.secret)), - )) + Ok(Cached::new_uncached(allowed_ips)) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_vpc_endpoint_ids) = self + .caches + .project_info + .get_allowed_vpc_endpoint_ids(normalized_ep) + { + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Hit); + return Ok(allowed_vpc_endpoint_ids); + } + + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, 
user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(allowed_vpc_endpoint_ids)) + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(access_blocker_flags) = self + .caches + .project_info + .get_block_public_or_vpc_access(normalized_ep) + { + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Hit); + return Ok(access_blocker_flags); + } + + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(access_blocker_flags)) } #[tracing::instrument(skip_all)] diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 5f8bda0f35..1e6cde8fb0 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -13,12 +13,14 @@ use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::context::RequestContext; -use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; +use crate::control_plane::client::{ + CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedRoleSecret, +}; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::messages::MetricsAuxInfo; -use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; use crate::error::io_error; use crate::intern::RoleNameInt; use 
crate::types::{BranchId, EndpointId, ProjectId, RoleName}; @@ -121,7 +123,10 @@ impl MockControlPlane { Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids: vec![], project_id: None, + account_id: None, + access_blocker_flags: AccessBlockerFlags::default(), }) } @@ -214,16 +219,35 @@ impl super::ControlPlaneApi for MockControlPlane { )) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, _ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - Ok(( - Cached::new_uncached(Arc::new( - self.do_get_auth_info(user_info).await?.allowed_ips, - )), - None, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + ))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info) + .await? + .allowed_vpc_endpoint_ids, + ))) + } + + async fn get_block_public_or_vpc_access( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached( + self.do_get_auth_info(user_info).await?.access_blocker_flags, )) } diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index b879f3a59f..a06943726e 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -17,7 +17,8 @@ use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, + CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; @@ -55,17 +56,45 @@ impl ControlPlaneApi for ControlPlaneClient { } } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { + ) -> Result { match self { - Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::ProxyV1(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::PostgresMock(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(test)] - Self::Test(api) => api.get_allowed_ips_and_secret(), + Self::Test(api) => api.get_allowed_ips(), + } + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_allowed_vpc_endpoint_ids(), + } + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => 
api.get_block_public_or_vpc_access(), } } @@ -102,9 +131,15 @@ impl ControlPlaneApi for ControlPlaneClient { pub(crate) trait TestControlPlaneClient: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result; + + fn get_allowed_vpc_endpoint_ids( &self, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; + + fn get_block_public_or_vpc_access( + &self, + ) -> Result; fn dyn_clone(&self) -> Box; } diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index d068614b24..5883d02b92 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -4,7 +4,7 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use crate::auth::IpPattern; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. @@ -227,8 +227,11 @@ pub(crate) struct UserFacingMessage { pub(crate) struct GetEndpointAccessControl { pub(crate) role_secret: Box, pub(crate) allowed_ips: Option>, + pub(crate) allowed_vpc_endpoint_ids: Option>, pub(crate) project_id: Option, - pub(crate) allowed_vpc_endpoint_ids: Option>, + pub(crate) account_id: Option, + pub(crate) block_public_connections: Option, + pub(crate) block_vpc_connections: Option, } /// Response which holds compute node's `host:port` pair. @@ -282,6 +285,10 @@ pub(crate) struct DatabaseInfo { pub(crate) aux: MetricsAuxInfo, #[serde(default)] pub(crate) allowed_ips: Option>, + #[serde(default)] + pub(crate) allowed_vpc_endpoint_ids: Option>, + #[serde(default)] + pub(crate) public_access_allowed: Option, } // Manually implement debug to omit sensitive info. @@ -293,6 +300,7 @@ impl fmt::Debug for DatabaseInfo { .field("dbname", &self.dbname) .field("user", &self.user) .field("allowed_ips", &self.allowed_ips) + .field("allowed_vpc_endpoint_ids", &self.allowed_vpc_endpoint_ids) .finish_non_exhaustive() } } @@ -457,7 +465,7 @@ mod tests { #[test] fn parse_get_role_secret() -> anyhow::Result<()> { - // Empty `allowed_ips` field. + // Empty `allowed_ips` and `allowed_vpc_endpoint_ids` field. 
let json = json!({ "role_secret": "secret", }); @@ -467,9 +475,21 @@ mod tests { "allowed_ips": ["8.8.8.8"], }); serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], + }); + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], + }); + serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], "project_id": "project", }); serde_json::from_str::(&json.to_string())?; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 1dca26d686..f92e4f3f60 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,6 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; +use crate::intern::AccountIdInt; use crate::intern::ProjectIdInt; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; @@ -52,8 +53,14 @@ pub(crate) struct AuthInfo { pub(crate) secret: Option, /// List of IP addresses allowed for the autorization. pub(crate) allowed_ips: Vec, + /// List of VPC endpoints allowed for the autorization. + pub(crate) allowed_vpc_endpoint_ids: Vec, /// Project ID. This is used for cache invalidation. pub(crate) project_id: Option, + /// Account ID. This is used for cache invalidation. + pub(crate) account_id: Option, + /// Are public connections or VPC connections blocked? + pub(crate) access_blocker_flags: AccessBlockerFlags, } /// Info for establishing a connection to a compute node. @@ -95,11 +102,21 @@ impl NodeInfo { } } +#[derive(Clone, Default, Eq, PartialEq, Debug)] +pub(crate) struct AccessBlockerFlags { + pub public_access_blocked: bool, + pub vpc_access_blocked: bool, +} + pub(crate) type NodeInfoCache = TimedLru>>; pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAllowedVpcEndpointIds = + Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAccessBlockerFlags = + Cached<&'static ProjectInfoCacheImpl, AccessBlockerFlags>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. 
@@ -113,11 +130,23 @@ pub(crate) trait ControlPlaneApi { user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; async fn get_endpoint_jwks( &self, diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 79c6020302..0d1382679c 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -7,7 +7,7 @@ use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; -use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; +use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; @@ -206,6 +206,26 @@ impl From for ProjectIdInt { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct AccountIdTag; +impl InternId for AccountIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type AccountIdInt = InternedString; +impl From<&AccountId> for AccountIdInt { + fn from(value: &AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(value) + } +} +impl From for AccountIdInt { + fn from(value: AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(&value) + } +} + #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f3d281a26b..25bcc81108 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -96,6 +96,16 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, + /// Number of cache hits/misses for VPC endpoint IDs. + pub vpc_endpoint_id_cache_stats: CounterVec>, + + /// Number of cache hits/misses for access blocker flags. + pub access_blocker_flags_cache_stats: CounterVec>, + + /// Number of allowed VPC endpoints IDs + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_vpc_endpoint_ids: Histogram<10>, + /// Number of connections (per sni). 
pub accepted_connections_by_sni: CounterVec>, @@ -570,6 +580,9 @@ pub enum RedisEventsCount { CancelSession, PasswordUpdate, AllowedIpsUpdate, + AllowedVpcEndpointIdsUpdateForProjects, + AllowedVpcEndpointIdsUpdateForAllProjectsInOrg, + BlockPublicOrVpcAccessUpdate, } pub struct ThreadPoolWorkers(usize); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index ab173bd0d0..8a407c8119 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -283,7 +283,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - auth_backend, + config.authentication_config.is_vpc_acccess_proxy, + auth_backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 10db2bcb30..d8c00a9b41 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -26,7 +26,7 @@ use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{ - self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, + self, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; use crate::tls::client_config::compute_client_config_with_certs; @@ -526,9 +526,19 @@ impl TestControlPlaneClient for TestConnectMechanism { } } - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result { + unimplemented!("not used in tests") + } + + fn get_allowed_vpc_endpoint_ids( &self, - ) -> Result<(CachedAllowedIps, Option), control_plane::errors::GetAuthInfoError> + ) -> Result { + unimplemented!("not used in tests") + } + + fn get_block_public_or_vpc_access( + &self, + ) -> Result { unimplemented!("not used in tests") } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 19fdd3280d..1a7024588a 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -86,9 +86,7 @@ pub(crate) struct BlockPublicOrVpcAccessUpdated { #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { - // TODO: change type once the implementation is more fully fledged. - // See e.g. https://github.com/neondatabase/neon/pull/10073. - account_id: ProjectIdInt, + account_id: AccountIdInt, } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -205,6 +203,24 @@ impl MessageHandler { .proxy .redis_events_count .inc(RedisEventsCount::PasswordUpdate); + } else if matches!( + msg, + Notification::AllowedVpcEndpointsUpdatedForProjects { .. } + ) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects); + } else if matches!(msg, Notification::AllowedVpcEndpointsUpdatedForOrg { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg); + } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdated { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate); } // TODO: add additional metrics for the other event types. @@ -230,20 +246,26 @@ fn invalidate_cache(cache: Arc, msg: Notification) { Notification::AllowedIpsUpdate { allowed_ips_update } => { cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); } + Notification::BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated, + } => cache.invalidate_block_public_or_vpc_access_for_project( + block_public_or_vpc_access_updated.project_id, + ), + Notification::AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_org( + allowed_vpc_endpoints_updated_for_org.account_id, + ), + Notification::AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects( + allowed_vpc_endpoints_updated_for_projects.project_ids, + ), Notification::PasswordUpdate { password_update } => cache .invalidate_role_secret_for_project( password_update.project_id, password_update.role_name, ), - Notification::BlockPublicOrVpcAccessUpdated { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForOrg { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } Notification::UnknownTopic => unreachable!(), } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6d5fb13681..0fb4a8a6cc 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -30,6 +30,7 @@ use crate::control_plane::locks::ApiLocks; use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; @@ -57,23 +58,52 @@ impl PoolingBackend { let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); - let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + let allowed_ips = backend.get_allowed_ips(ctx).await?; + if self.config.authentication_config.ip_allowlist_check_enabled && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } + + let access_blocker_flags = backend.get_block_public_or_vpc_access(ctx).await?; + if self.config.authentication_config.is_vpc_acccess_proxy { + if access_blocker_flags.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let extra = ctx.extra(); + let incoming_endpoint_id = match extra { + None => String::new(), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + if incoming_endpoint_id.is_empty() { + return Err(AuthError::MissingVPCEndpointId); + } + + let allowed_vpc_endpoint_ids = 
backend.get_allowed_vpc_endpoint_ids(ctx).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. + if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed(incoming_endpoint_id)); + } + } else if access_blocker_flags.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + if !self .endpoint_rate_limiter .check(user_info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => backend.get_role_secret(ctx).await?, - }; - + let cached_secret = backend.get_role_secret(ctx).await?; let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, diff --git a/proxy/src/types.rs b/proxy/src/types.rs index 6e0bd61c94..d5952d1d8b 100644 --- a/proxy/src/types.rs +++ b/proxy/src/types.rs @@ -97,6 +97,8 @@ smol_str_wrapper!(EndpointId); smol_str_wrapper!(BranchId); // 90% of project strings are 23 characters or less. smol_str_wrapper!(ProjectId); +// 90% of account strings are 23 characters or less. +smol_str_wrapper!(AccountId); // will usually equal endpoint ID smol_str_wrapper!(EndpointCacheKey); From 6318828c639ba85e66da321f7b32828ee945de12 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 31 Jan 2025 21:52:17 +0100 Subject: [PATCH 19/77] Update rust to 1.84.1 (#10618) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://releases.rs/docs/1.84.1/). Prior update was in https://github.com/neondatabase/neon/pull/10328. Co-authored-by: Arpad Müller --- build-tools.Dockerfile | 2 +- rust-toolchain.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index dfcc7d06b4..f744b44808 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -253,7 +253,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.0 +ENV RUSTC_VERSION=1.84.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 06746d3e1d..38a7f202ba 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.0" +channel = "1.84.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From b9e1a6724628aad5cf62737f6acab60ec23ff09b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Sat, 1 Feb 2025 12:09:45 +0100 Subject: [PATCH 20/77] fix generate matrix for olap for saturdays (#10622) ## Problem when introducing pg17 for job step `Generate matrix for OLAP benchmarks` I introduced a syntax error that only hits on Saturdays. 
## Summary of changes

Remove trailing comma

## Successful test run

https://github.com/neondatabase/neon/actions/runs/13086363907
---
 .github/workflows/benchmarking.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 20a8a6e2c9..413af90dec 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -341,7 +341,7 @@ jobs:
             ],
             "pg_version" : [
               16,17
-            ],
+            ]
           }'

           if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then

From 8ae6f656a694a2d6892ce6ebd1475d1b831ba917 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Mon, 3 Feb 2025 05:11:06 +0100
Subject: [PATCH 21/77] Don't require partial backup semaphore capacity for
 deletions (#10628)

In the safekeeper, we block deletions on the timeline's gate closing,
and any `WalResidentTimeline` keeps the gate open (because it owns a
gate lock object). Thus, as long as the `main_task` function of a
partial backup doesn't return, we can't delete the associated timeline.

In order to make these tasks exit early, we call the cancellation token
of the timeline upon its shutdown. However, the partial backup task
wasn't looking for the cancellation while waiting to acquire a partial
backup permit.

On a staging safekeeper we have been in a situation in the past where
the semaphore was already empty for a duration of many hours, rendering
all attempted deletions unable to proceed until a restart where the
semaphore was reset:
https://neondb.slack.com/archives/C03H1K0PGKH/p1738416586442029
---
 safekeeper/src/wal_backup_partial.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 4e5b34a9bf..5ecb23e8e0 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -535,6 +535,10 @@ pub async fn main_task(
     // limit concurrent uploads
     let _upload_permit = tokio::select!
{ acq = limiter.acquire_partial_backup() => acq, + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return None; + } _ = cancel.cancelled() => { info!("task canceled"); return None; From 4dfe60e2ad08ad0e99bfd938bcbe18271deb4f84 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 3 Feb 2025 10:00:23 +0100 Subject: [PATCH 22/77] revert https://github.com/neondatabase/neon/pull/10616 (#10631) ## Problem https://github.com/neondatabase/neon/pull/10616 was only intended temparily during the weekend, want to reset to prior state ## Summary of changes revert https://github.com/neondatabase/neon/pull/10616 but keep fixes in https://github.com/neondatabase/neon/pull/10622 --- .github/workflows/benchmarking.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 413af90dec..b36ac46f35 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,8 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - # - cron: '0 3 * * *' # run once a day, timezone is utc - - cron: '0 */10 * * *' # Runs every 10 hours at minute 0 + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -551,7 +550,6 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: - if: false permissions: contents: write statuses: write @@ -685,8 +683,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write @@ -814,7 +811,6 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false permissions: contents: write statuses: write @@ -934,7 +930,6 @@ jobs: user-examples-compare: # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false permissions: contents: write statuses: write From f071800979fba434ea0708f22e454c513efe47b2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Feb 2025 09:02:21 +0000 Subject: [PATCH 23/77] tests: stabilize shard locations earlier in test_scrubber_tenant_snapshot (#10606) ## Problem This test would sometimes emit unexpected logs from the storage controller's requests to do migrations, which overlap with the test's restarts of pageservers, where those migrations are happening some time after a shard split as the controller moves load around. 
Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10602/13067323736/index.html#testresult/f66f1329557a1fc5/retries ## Summary of changes - Do a reconcile_until_idle after shard split, so that the rest of the test doesn't run concurrently with migrations --- test_runner/regress/test_storage_scrubber.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 7e92cc01cd..0f4e5688a9 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -71,6 +71,10 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: else: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable + # is it won't overlap with migrations + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) From 89b9f7407706353bf256ac6eeea0edd31c3829d1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 3 Feb 2025 09:40:12 +0000 Subject: [PATCH 24/77] CI(pre-merge-checks): do not run `conclusion` job for PRs (#10619) ## Problem While working on https://github.com/neondatabase/neon/pull/10617 I (unintentionally) merged the PR before the main CI pipeline has finished. I suspect this happens because we have received all the required job results from the pre-merge-checks workflow, which runs on PRs that include changes to relevant files. ## Summary of changes - Skip the `conclusion` job in `pre-merge-checks` workflows for PRs --- .github/workflows/pre-merge-checks.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index d39ccecac9..c47b3fe0de 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -95,7 +95,8 @@ jobs: # - conclusion # - neon-cloud-e2e conclusion: - if: always() + # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow + if: always() && github.event_name == 'merge_group' permissions: statuses: write # for `github.repos.createCommitStatus(...)` contents: write From 87ad50c92547891f6c1161c3e3c255067e6c0276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 12:53:51 +0100 Subject: [PATCH 25/77] storcon: use diesel-async again, now with tls support (#10614) Successor of #10280 after it was reverted in #10592. Re-introduce the usage of diesel-async again, but now also add TLS support so that we connect to the storcon database using TLS. By default, diesel-async doesn't support TLS, so add some code to make us explicitly request TLS. 
cc https://github.com/neondatabase/cloud/issues/23583 --- .github/workflows/_build-and-test-locally.yml | 4 - .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Cargo.lock | 72 +- Dockerfile | 2 +- Makefile | 2 - storage_controller/Cargo.toml | 9 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 833 +++++++++++------- 9 files changed, 549 insertions(+), 379 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index e9483492c9..1dec8106b4 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,8 +158,6 @@ jobs: - name: Run cargo build run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -217,8 +215,6 @@ jobs: env: NEXTEST_RETRIES: 3 run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 01d82a1ed2..347a511e98 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + run: cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5b5910badf..f077e04d1c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Cargo.lock b/Cargo.lock index cdc620e485..0133c83564 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -932,6 +932,18 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bb8" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" +dependencies = [ + "async-trait", + "futures-util", + "parking_lot 0.12.1", + "tokio", +] + [[package]] name = "bcder" version = "0.7.4" @@ -1790,11 +1802,24 @@ dependencies = [ "chrono", "diesel_derives", "itoa", - "pq-sys", - "r2d2", "serde_json", ] +[[package]] +name = "diesel-async" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" +dependencies = [ + "async-trait", + "bb8", + "diesel", + "futures-util", + "scoped-futures", + "tokio", + "tokio-postgres", +] + [[package]] name = "diesel_derives" version = "2.2.1" @@ -4645,15 +4670,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 
-[[package]] -name = "pq-sys" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" -dependencies = [ - "vcpkg", -] - [[package]] name = "pq_proto" version = "0.1.0" @@ -4966,17 +4982,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" -dependencies = [ - "log", - "parking_lot 0.12.1", - "scheduled-thread-pool", -] - [[package]] name = "rand" version = "0.7.3" @@ -5797,12 +5802,12 @@ dependencies = [ ] [[package]] -name = "scheduled-thread-pool" -version = "0.2.7" +name = "scoped-futures" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" dependencies = [ - "parking_lot 0.12.1", + "pin-project-lite", ] [[package]] @@ -6337,6 +6342,7 @@ dependencies = [ "clap", "control_plane", "diesel", + "diesel-async", "diesel_migrations", "fail", "futures", @@ -6351,10 +6357,12 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", - "r2d2", "rand 0.8.5", "reqwest", "routerify", + "rustls 0.23.18", + "rustls-native-certs 0.8.0", + "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6362,6 +6370,8 @@ dependencies = [ "strum_macros", "thiserror 1.0.69", "tokio", + "tokio-postgres", + "tokio-postgres-rustls", "tokio-util", "tracing", "utils", @@ -6604,7 +6614,7 @@ dependencies = [ "fastrand 2.2.0", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -7562,12 +7572,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" diff --git a/Dockerfile b/Dockerfile index f80666529b..7ba54c8ca5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Makefile b/Makefile index 22ebfea7d5..d1238caebf 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) -CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index caaa22d0a5..63f43cdf62 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,7 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -39,18 +40,20 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +rustls.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true +tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", - "postgres", - "r2d2", "chrono", ] } +diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -r2d2 = { version = "0.8.10" } +scoped-futures = "0.1.4" utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 801409d612..659c088d51 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); + let persistence = Arc::new(Persistence::new(secrets.database_url).await); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 37bfaf1139..880f203064 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1,13 +1,20 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel::Connection; +use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; +use diesel_async::pooled_connection::bb8::Pool; +use diesel_async::pooled_connection::AsyncDieselConnectionManager; +use diesel_async::pooled_connection::ManagerConfig; +use diesel_async::AsyncPgConnection; +use diesel_async::RunQueryDsl; +use 
futures::future::BoxFuture; +use futures::FutureExt; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -20,6 +27,8 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use rustls::crypto::ring; +use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -60,7 +69,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: diesel::r2d2::Pool>, + connection_pool: Pool, } /// Legacy format, for use in JSON compat objects in test environment @@ -76,7 +85,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] r2d2::Error), + ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -124,6 +133,7 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard +#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -136,6 +146,11 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } +// A generous allowance for how many times we may retry serializable transactions +// before giving up. This is not expected to be hit: it is a defensive measure in case we +// somehow engineer a situation where duelling transactions might otherwise live-lock. +const MAX_RETRIES: usize = 128; + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -145,12 +160,18 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String) -> Self { - let manager = diesel::r2d2::ConnectionManager::::new(database_url); + pub async fn new(database_url: String) -> Self { + let mut mgr_config = ManagerConfig::default(); + mgr_config.custom_setup = Box::new(establish_connection_rustls); + + let manager = AsyncDieselConnectionManager::::new_with_config( + database_url, + mgr_config, + ); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = diesel::r2d2::Pool::builder() + let connection_pool = Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -158,6 +179,7 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) + .await .expect("Could not build connection pool"); Self { connection_pool } @@ -171,7 +193,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match PgConnection::establish(database_url) { + match establish_connection_rustls(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -192,57 +214,22 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - self.with_conn(move |conn| -> DatabaseResult<()> { - HarnessWithOutput::write_to_stdout(conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())) - }) - .await - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function in a tokio blocking thread, with a Diesel database connection. - async fn with_conn(&self, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - // A generous allowance for how many times we may retry serializable transactions - // before giving up. This is not expected to be hit: it is a defensive measure in case we - // somehow engineer a situation where duelling transactions might otherwise live-lock. - const MAX_RETRIES: usize = 128; - - let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { + // Can't use self.with_conn here as we do spawn_blocking which requires static. 
+ let conn = self + .connection_pool + .dedicated_connection() + .await + .map_err(|e| DatabaseError::Migration(e.to_string()))?; + let mut async_wrapper: AsyncConnectionWrapper = + AsyncConnectionWrapper::from(conn); + tokio::task::spawn_blocking(move || { let mut retry_count = 0; loop { - match conn.build_transaction().serializable().run(|c| func(c)) { + let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())); + match result { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -271,33 +258,112 @@ impl Persistence { } }) .await - .expect("Task panic") + .map_err(|e| DatabaseError::Migration(e.to_string()))??; + Ok(()) + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn<'a, 'b, F, R>( + &self, + op: DatabaseOperation, + func: F, + ) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function with a Diesel database connection in a retry loop + async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let mut retry_count = 0; + loop { + let mut conn = self.connection_pool.get().await?; + match conn + .build_transaction() + .serializable() + .run(|c| func(c)) + .await + { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. 
increment_generation for unrelated shards can collide) + tracing::debug!("Retrying transaction on serialization failure {err:?}"); + continue; + } + } + Err(e) => break Err(e), + } + } } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = node.to_persistent(); - self.with_measured_conn( - DatabaseOperation::InsertNode, - move |conn| -> DatabaseResult<()> { + let np = &node.to_persistent(); + self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { + Box::pin(async move { diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; + .values(np) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -313,11 +379,14 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn)?; - Ok(updated) + Box::pin(async move { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn) + .await?; + Ok(updated) + }) }) .await?; @@ -339,17 +408,16 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { + Box::pin(async move { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -359,15 +427,14 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::LoadTenant, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { + Box::pin(async move { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -393,19 +460,22 @@ impl Persistence { }) .collect::>(); - self.with_measured_conn( - DatabaseOperation::InsertTenantShards, - move |conn| -> DatabaseResult<()> { + let shards = &shards; + let metadata_health_records = &metadata_health_records; + self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { + Box::pin(async move { diesel::insert_into(tenant_shards::table) - .values(&shards) - .execute(conn)?; + .values(shards) + .execute(conn) + .await?; diesel::insert_into(metadata_health::table) - .values(&metadata_health_records) - .execute(conn)?; + .values(metadata_health_records) + 
.execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -413,31 +483,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteTenant, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { + Box::pin(async move { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteNode, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -454,34 +524,41 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn)?; + Box::pin(async move { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn) + .await?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn)?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn) + .await?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn)?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -518,19 +595,22 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -562,12 +642,15 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn)?; - Ok(result) + Box::pin(async move { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn) + .await?; + Ok(result) + }) }) .await?; @@ -615,15 +698,18 @@ impl Persistence { break; } + let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn)?; + Box::pin(async move { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
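                    // Shape of the composed statement, with purely hypothetical values
                    // (the real `in_clause` is assembled just above from the strongly typed
                    // tenant/shard identifiers):
                    //   SELECT * from tenant_shards
                    //     where (tenant_id, shard_number, shard_count)
                    //       in (('3a71be02...', 0, 4), ('3a71be02...', 1, 4));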
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn).await?; - Ok(result) + Ok(result) + }) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -657,51 +743,58 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; + let tenant = &tenant; + let input_placement_policy = &input_placement_policy; + let input_config = &input_config; + let input_generation = &input_generation; + let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + Box::pin(async move { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
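                // Note on the nested Option used just below: with diesel's AsChangeset, an
                // outer `None` on a changeset field means "leave this column unchanged", while
                // `Some(None)` on an `Option<Option<i64>>` writes SQL NULL. So Detached and
                // Secondary null out generation_pageserver, and Attached (or no policy change)
                // leaves it untouched.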
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn)?; + query.set(update).execute(conn).await?; - Ok(()) + Ok(()) + }) }) .await?; @@ -715,23 +808,27 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; + let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - let mut shards_updated = Vec::default(); + Box::pin(async move { + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn)?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn) + .await?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } } - } - Ok(shards_updated) + Ok(shards_updated) + }) }) .await } @@ -739,17 +836,21 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - 
generation_pageserver.eq(Option::::None), - placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy + .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -768,14 +869,16 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + let parent_to_children = parent_to_children.as_slice(); + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { + Box::pin(async move { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn)?; + .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -788,7 +891,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); + let parent_to_children = parent_to_children.to_vec(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -796,7 +899,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; + .load::(conn).await?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -811,12 +914,13 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn)?; + .execute(conn).await?; } } Ok(()) }) + }) .await } @@ -828,25 +932,26 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::CompleteShardSplit, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { + Box::pin(async move { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; debug_assert!(updated > 0); Ok(()) - }, - ) + }) + }) .await } @@ -858,15 +963,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::AbortShardSplit, - move |conn| -> DatabaseResult { + self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { + Box::pin(async move { // Clear the splitting state on parent shards let updated = 
diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -886,11 +991,12 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; Ok(AbortShardSplitStatus::Aborted) - }, - ) + }) + }) .await } @@ -906,25 +1012,28 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::UpdateMetadataHealth, - move |conn| -> DatabaseResult<_> { + let healthy_records = healthy_records.as_slice(); + let unhealthy_records = unhealthy_records.as_slice(); + self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { + Box::pin(async move { diesel::insert_into(metadata_health) - .values(&healthy_records) + .values(healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; diesel::insert_into(metadata_health) - .values(&unhealthy_records) + .values(unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -933,15 +1042,13 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListMetadataHealth, - move |conn| -> DatabaseResult<_> { - Ok( - crate::schema::metadata_health::table - .load::(conn)?, - ) - }, - ) + self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { + Box::pin(async { + Ok(crate::schema::metadata_health::table + .load::(conn) + .await?) + }) + }) .await } @@ -953,10 +1060,15 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn)?) + move |conn| { + Box::pin(async { + DatabaseResult::Ok( + crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn) + .await?, + ) + }) }, ) .await @@ -970,15 +1082,14 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListMetadataHealthOutdated, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { + Box::pin(async move { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn)?; + let res = query.load::(conn).await?; Ok(res) - }, - ) + }) + }) .await } @@ -986,12 +1097,13 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn( - DatabaseOperation::GetLeader, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::controllers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { + Box::pin(async move { + Ok(crate::schema::controllers::table + .load::(conn) + .await?) 
+ }) + }) .await?; if leader.len() > 1 { @@ -1014,26 +1126,33 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn( - DatabaseOperation::UpdateLeader, - move |conn| -> DatabaseResult { + .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { + let prev = prev.clone(); + let new = new.clone(); + Box::pin(async move { let updated = match &prev { - Some(prev) => diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn)?, - None => diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn)?, + Some(prev) => { + diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn) + .await? + } + None => { + diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn) + .await? + } }; Ok(updated) - }, - ) + }) + }) .await?; if updated == 0 { @@ -1048,12 +1167,13 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::safekeepers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::safekeepers::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1066,11 +1186,14 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| -> DatabaseResult { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn)?) + self.with_conn(move |conn| { + Box::pin(async move { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn) + .await?) 
+ }) }) .await } @@ -1081,26 +1204,30 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + self.with_conn(move |conn| { + let record = record.clone(); + Box::pin(async move { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn)?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn) + .await?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) + Ok(()) + }) }) .await } @@ -1112,31 +1239,73 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn)?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn) + .await?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) + Ok(()) + }) }) .await } } +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + anyhow::bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +/// Loads the root certificates and constructs a client config suitable for connecting. +/// This function is blocking. +pub fn client_config_with_root_certs() -> anyhow::Result { + Ok( + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(load_certs()?) + .with_no_client_auth(), + ) +} + +fn establish_connection_rustls(config: &str) -> BoxFuture> { + let fut = async { + // We first set up the way we want rustls to work. 
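        // Why a custom connector is needed here: diesel-async's stock
        // AsyncPgConnection::establish connects without TLS (tokio_postgres::NoTls), so TLS
        // has to come from a setup callback with exactly this signature. A rough sketch of
        // how such a callback is typically wired into the pool (assuming diesel_async's
        // ManagerConfig API; the wiring itself is not shown in this hunk):
        //
        //     let mut mgr_config = ManagerConfig::default();
        //     mgr_config.custom_setup = Box::new(establish_connection_rustls);
        //     let manager =
        //         AsyncDieselConnectionManager::<AsyncPgConnection>::new_with_config(url, mgr_config);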
+ let rustls_config = client_config_with_root_certs() + .map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?; + let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config); + let (client, conn) = tokio_postgres::connect(config, tls) + .await + .map_err(|e| ConnectionError::BadConnection(e.to_string()))?; + + AsyncPgConnection::try_from_client_and_connection(client, conn).await + }; + fut.boxed() +} + /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive( QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, From b1e451091ad45d3e274dc4119e94f0d0307650b5 Mon Sep 17 00:00:00 2001 From: OBBO67 <35974943+OBBO67@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:54:07 +0000 Subject: [PATCH 26/77] pageserver: clean up references to timeline delete marker, uninit marker (#5718) (#10627) ## Problem Since [#5580](https://github.com/neondatabase/neon/pull/5580) the delete and uninit file markers are no longer needed. ## Summary of changes Remove the remaining code for the delete and uninit markers. Additionally removes the `ends_with_suffix` function as it is no longer required. Closes [#5718](https://github.com/neondatabase/neon/issues/5718). --- pageserver/src/lib.rs | 27 --------------------------- pageserver/src/tenant.rs | 7 +------ 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ff6af3566c..f43cd08cf7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -263,14 +263,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// data directory at pageserver startup can be automatically removed. pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; -/// A marker file to mark that a timeline directory was not fully initialized. -/// If a timeline directory with this marker is encountered at pageserver startup, -/// the timeline directory and the marker file are both removed. -/// Full path: `tenants//timelines/___uninit`. -pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; - -pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; - pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), @@ -278,25 +270,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool { } } -fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { - match path.file_name() { - Some(name) => name.ends_with(suffix), - None => false, - } -} - -// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid -// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once -// from the name. - -pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) -} - -pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. 
/// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 657cc78e2c..1914a95562 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,7 +95,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; -use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::CONCURRENT_INITDBS; use crate::metrics::INITDB_RUN_TIME; @@ -1793,11 +1792,7 @@ impl Tenant { let entry = entry.context("read timeline dir entry")?; let entry_path = entry.path(); - let purge = if crate::is_temporary(entry_path) - // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) - || is_uninit_mark(entry_path) - || crate::is_delete_mark(entry_path) - { + let purge = if crate::is_temporary(entry_path) { true } else { match TimelineId::try_from(entry_path.file_name()) { From b1bc33eb4d23c77ce0b4a2c9c8d8453e1f7cd5ee Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 3 Feb 2025 12:44:47 +0000 Subject: [PATCH 27/77] Fix logical_replication_sync test fixture (#10531) Fixes flaky test_lr_with_slow_safekeeper test #10242 Fix query to `pg_catalog.pg_stat_subscription` catalog to handle table synchronization and parallel LR correctly. --- test_runner/fixtures/neon_fixtures.py | 56 +++++++++++++------ .../performance/test_logical_replication.py | 4 +- test_runner/regress/test_compute_catalog.py | 7 ++- test_runner/regress/test_layer_bloating.py | 2 +- .../regress/test_logical_replication.py | 28 +++++----- .../test_physical_and_logical_replicaiton.py | 4 +- .../regress/test_subscriber_branching.py | 3 +- 7 files changed, 65 insertions(+), 39 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7e3cc19829..8909f7f249 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4996,13 +4996,35 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) +# wait for subscriber to catch up with publisher def logical_replication_sync( subscriber: PgProtocol, publisher: PgProtocol, + # pass subname explicitly to avoid confusion + # when multiple subscriptions are present + subname: str, sub_dbname: str | None = None, pub_dbname: str | None = None, -) -> Lsn: +): """Wait logical replication subscriber to sync with publisher.""" + + def initial_sync(): + # first check if the subscription is active `s`=`synchronized`, `r` = `ready` + query = f"""SELECT 1 FROM pg_subscription_rel join pg_catalog.pg_subscription + on pg_subscription_rel.srsubid = pg_subscription.oid + WHERE srsubstate NOT IN ('r', 's') and subname='{subname}'""" + + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert (res is None) or (len(res) == 0) + + wait_until(initial_sync) + + # wait for the subscription to catch up with current state of publisher + # caller is responsible to call checkpoint before calling this function if pub_dbname is not None: publisher_lsn = Lsn( publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0] @@ -5010,23 +5032,23 @@ def logical_replication_sync( else: publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - while True: - if sub_dbname is not None: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription", dbname=sub_dbname - )[0][0] - 
else: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription" - )[0][0] + def subscriber_catch_up(): + query = f"select latest_end_lsn from pg_catalog.pg_stat_subscription where latest_end_lsn is NOT NULL and subname='{subname}'" - if res: - log.info(f"subscriber_lsn={res}") - subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") - if subscriber_lsn >= publisher_lsn: - return subscriber_lsn - time.sleep(0.5) + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert res is not None + + res_lsn = res[0][0] + log.info(f"subscriber_lsn={res_lsn}") + subscriber_lsn = Lsn(res_lsn) + log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") + assert subscriber_lsn >= publisher_lsn + + wait_until(subscriber_catch_up) def tenant_get_shards( diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 9d653d1a1e..fdc56cc496 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -44,13 +44,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()]) # Wait logical replication to sync start = time.time() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") log.info(f"Sync with master took {time.time() - start} seconds") sum_master = cast("int", endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index f0878b2631..50a922a616 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -183,6 +183,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") + cursor.execute("CHECKPOINT") # connect to the subscriber_db and create a subscription # Note that we need to create subscription with @@ -195,7 +196,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # wait for the subscription to be active logical_replication_sync( - endpoint, endpoint, sub_dbname="subscriber_db", pub_dbname="publisher_db" + endpoint, + endpoint, + "mysub", + sub_dbname="subscriber_db", + pub_dbname="publisher_db", ) # Check that replication is working diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index d9043fef7f..0260704ebf 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -63,7 +63,7 @@ def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): cur.execute("set statement_timeout=0") cur.execute("select create_snapshots(10000)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, 
timeline) env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 8908763109..3a92f0d1d1 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -55,13 +55,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # insert some data cur.execute("insert into t values (generate_series(1,1000), 0)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 1000 # now stop subscriber... @@ -78,7 +78,7 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.start() # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # Check that subscribers receives all data assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 2000 @@ -148,7 +148,7 @@ COMMIT; endpoint.start() vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") eq_q = "select testcolumn1, testcolumn2, testcolumn3 from replication_example order by 1, 2, 3" assert vanilla_pg.safe_psql(eq_q) == endpoint.safe_psql(eq_q) log.info("rewriteheap synced") @@ -285,7 +285,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of vanilla_pg.safe_psql("create table t(a int)") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() @@ -321,13 +321,13 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of sk_http = sk.http_client() sk_http.configure_failpoints([("sk-pause-send", "off")]) - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] # Check that local reads also work with endpoint.connect().cursor() as cur: cur.execute("insert into t values (3)") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] log_path = vanilla_pg.pgdatadir / "pg.log" @@ -365,7 +365,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) @@ -375,7 +375,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) # this should flush current wal page cur.execute("insert into 
replication_example values (3, 4)") vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") @@ -409,18 +409,18 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): # Test simple insert, update, delete. But with very large values value = random_string(10_000_000) cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] # Test delete, and reinsert another value cur.execute("DELETE FROM reptbl WHERE id = 1") cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] endpoint.stop() @@ -428,7 +428,7 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): cur = endpoint.connect().cursor() value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] @@ -608,7 +608,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van for i in range(0, 1000): pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) # wait until sub receives all data - logical_replication_sync(sub, vanilla_pg) + logical_replication_sync(sub, vanilla_pg, "sub") # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data # as flushed we'll now lose it if subscriber restars. 
That's why # logical_replication_wait_flush_lsn_sync is expected to hang while diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index 3f9824ee67..229439106b 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -43,7 +43,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records # Check that LR slot is not copied to replica @@ -87,7 +87,7 @@ def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg): s_con = secondary.connect() s_cur = s_con.cursor() - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records s_cur.execute("select count(*) from t") diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 645572da8e..849d4f024d 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -3,7 +3,7 @@ from __future__ import annotations import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar, wait_until @@ -208,7 +208,6 @@ def test_subscriber_branching(neon_simple_env: NeonEnv): # wake the sub and ensure that it catches up with the new data sub.start(create_test_user=True) with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: - logical_replication_sync(sub, pub) wait_until(check_that_changes_propagated) scur.execute("SELECT count(*) FROM t") res = scur.fetchall() From 23ca8b061ba4c3a53b38d3f49d3ae7be1ed696f8 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Mon, 3 Feb 2025 13:55:48 +0100 Subject: [PATCH 28/77] Use actions/checkout for checkout (#10630) ## Problem 1. First of all it's more correct 2. Current usage allows ` Time-of-Check-Time-of-Use (TOCTOU) 'Pwn Request' vulnerabilities`. Please check security slack channel or reach me for more details. I will update PR description after merge. ## Summary of changes 1. 
Use `actions/checkout` with `ref: ${{ github.event.pull_request.head.sha }}` Discovered by and Co-author: @varunsh-coder --- .github/workflows/approved-for-ci-run.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 0a0898d30c..fc2f36c74b 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -94,7 +94,9 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - run: gh pr checkout "${PR_NUMBER}" + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} - run: git checkout -b "${BRANCH}" From e617a3a075f4debfa2b50a280a50475f3abed32c Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Mon, 3 Feb 2025 05:34:11 -0800 Subject: [PATCH 29/77] vm-monitor: Improve error display (#10542) Logging errors with the debug format specifier causes multi-line errors, which are sometimes a pain to deal with. Instead, we should use anyhow's alternate display format, which shows the same information on a single line. Also adjusted a couple of error messages that were stale. Fixes neondatabase/cloud#14710. --- libs/vm_monitor/src/filecache.rs | 6 +++--- libs/vm_monitor/src/lib.rs | 14 +++++++------- libs/vm_monitor/src/runner.rs | 12 ++++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index fe71e11197..4f5bf1c1e3 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -177,8 +177,8 @@ impl FileCacheState { crate::spawn_with_cancel( token, |res| { - if let Err(error) = res { - error!(%error, "postgres error") + if let Err(e) = res { + error!(error = format_args!("{e:#}"), "postgres error"); } }, conn, @@ -205,7 +205,7 @@ impl FileCacheState { { Ok(rows) => Ok(rows), Err(e) => { - error!(error = ?e, "postgres error: {e} -> retrying"); + error!(error = format_args!("{e:#}"), "postgres error -> retrying"); let client = FileCacheState::connect(&self.conn_str, self.token.clone()) .await diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 1b13c8e0b2..0cd97d4ca1 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -191,15 +191,12 @@ async fn start_monitor( .await; let mut monitor = match monitor { Ok(Ok(monitor)) => monitor, - Ok(Err(error)) => { - error!(?error, "failed to create monitor"); + Ok(Err(e)) => { + error!(error = format_args!("{e:#}"), "failed to create monitor"); return; } Err(_) => { - error!( - ?timeout, - "creating monitor timed out (probably waiting to receive protocol range)" - ); + error!(?timeout, "creating monitor timed out"); return; } }; @@ -207,6 +204,9 @@ async fn start_monitor( match monitor.run().await { Ok(()) => info!("monitor was killed due to new connection"), - Err(e) => error!(error = ?e, "monitor terminated unexpectedly"), + Err(e) => error!( + error = format_args!("{e:#}"), + "monitor terminated unexpectedly" + ), } } diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 8605314ba9..8839f5803f 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -370,12 +370,16 @@ impl Runner { }), InboundMsgKind::InvalidMessage { error } => { warn!( - %error, id, "received notification of an invalid message we sent" + error = format_args!("{error:#}"), + id, "received notification of an invalid message we sent" ); 
Ok(None) } InboundMsgKind::InternalError { error } => { - warn!(error, id, "agent experienced an internal error"); + warn!( + error = format_args!("{error:#}"), + id, "agent experienced an internal error" + ); Ok(None) } InboundMsgKind::HealthCheck {} => { @@ -476,7 +480,7 @@ impl Runner { // gives the outermost cause, and the debug impl // pretty-prints the error, whereas {:#} contains all the // causes, but is compact (no newlines). - warn!(error = format!("{e:#}"), "error handling message"); + warn!(error = format_args!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { error: e.to_string(), @@ -492,7 +496,7 @@ impl Runner { .context("failed to send message")?; } Err(e) => warn!( - error = format!("{e}"), + error = format_args!("{e:#}"), msg = ?msg, "received error message" ), From 43682624b5548136f321254e88cd9b7f7453ae61 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 3 Feb 2025 13:41:41 +0000 Subject: [PATCH 30/77] CI(pg-clients): fix logical replication tests (#10623) ## Problem Tests for logical replication (on Staging) have been failing for some time because logical replication is not enabled for them. This issue occurred after switching to an org API key with a different default setting, where logical replication was not enabled by default. ## Summary of changes - Add `enable_logical_replication` input to `actions/neon-project-create` - Enable logical replication in `test-logical-replication` job --- .../actions/neon-project-create/action.yml | 12 +++++++---- .github/workflows/pg-clients.yml | 6 ++++-- test_runner/logical_repl/README.md | 20 +++++++++++-------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 11f46bce8e..c9f6b0832e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -41,7 +41,10 @@ inputs: description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' required: false default: '/tmp/neon/pg_install/v16/lib' - + project_settings: + description: 'A JSON object with project settings' + required: false + default: '{}' outputs: dsn: @@ -73,7 +76,7 @@ runs: \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, - \"settings\": { } + \"settings\": ${PROJECT_SETTINGS} } }") @@ -92,12 +95,12 @@ runs: if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then # determine tenant ID TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` - + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" - + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) curl -X PUT \ "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ @@ -118,3 +121,4 @@ runs: STRIPE_SIZE: ${{ inputs.stripe_size }} PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} + PROJECT_SETTINGS: ${{ inputs.project_settings }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 
4947907eb0..abc90c7fe1 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -12,8 +12,8 @@ on: pull_request: paths: - '.github/workflows/pg-clients.yml' - - 'test_runner/pg_clients/**' - - 'test_runner/logical_repl/**' + - 'test_runner/pg_clients/**/*.py' + - 'test_runner/logical_repl/**/*.py' - 'poetry.lock' workflow_dispatch: @@ -104,6 +104,8 @@ jobs: with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} + project_settings: >- + {"enable_logical_replication": true} - name: Run tests uses: ./.github/actions/run-python-test-set diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md index 8eca056dda..449e56e21d 100644 --- a/test_runner/logical_repl/README.md +++ b/test_runner/logical_repl/README.md @@ -1,13 +1,18 @@ # Logical replication tests +> [!NOTE] +> Neon project should have logical replication enabled: +> +> https://neon.tech/docs/guides/logical-replication-postgres#enable-logical-replication-in-the-source-neon-project + ## Clickhouse ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f clickhouse/docker-compose.yml up -d -pytest -m remote_cluster -k test_clickhouse -docker compose -f clickhouse/docker-compose.yml down +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_clickhouse +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml down ``` ## Debezium @@ -15,8 +20,7 @@ docker compose -f clickhouse/docker-compose.yml down ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f debezium/docker-compose.yml up -d -pytest -m remote_cluster -k test_debezium -docker compose -f debezium/docker-compose.yml down - -``` \ No newline at end of file +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_debezium +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml down +``` From 628a9616c4f0836a8d06dce34f8b4a525a5d7985 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 3 Feb 2025 15:12:41 +0100 Subject: [PATCH 31/77] fix(proxy): Don't use --is-private-access-proxy to disable IP check (#10633) ## Problem * The behavior of this flag changed. Plus, it's not necessary to disable the IP check as long as there are no IPs listed in the local postgres. ## Summary of changes * Drop the flag from the command in the README.md section. * Change the postgres URL passed to proxy to not use the endpoint hostname. * Also swap postgres creation and proxy startup, so the DB is running when proxy comes up. --- proxy/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index 4b98342d72..ecd54fbbd8 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -106,17 +106,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. 
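(A quick way to sanity-check the wildcard DNS, assuming `dig` is available: `dig +short endpoint.localtest.me` should print `127.0.0.1`.)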
-Let's create self-signed certificate by running: -```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" -``` - -Then we need to build proxy with 'testing' feature and run, e.g.: -```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://proxy:password@endpoint.localtest.me:5432/postgres' --is-private-access-proxy true -c server.crt -k server.key -``` - -We will also need to have a postgres instance. Assuming that we have setted up docker we can set it up as follows: +We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows: ```sh docker run \ --detach \ @@ -133,8 +123,18 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` +Let's create self-signed certificate by running: +```sh +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +``` + +Then we need to build proxy with 'testing' feature and run, e.g.: +```sh +RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +``` + Now from client you can start a new session: ```sh PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" -``` \ No newline at end of file +``` From c774f0a14710846c65f6b82b3838c1496d03af8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 19:21:01 +0100 Subject: [PATCH 32/77] storcon db: allow accepting any TLS certificate (#10640) We encountered some TLS validation errors for the storcon since applying #10614. Add an option to downgrade them to logged errors instead to allow us to debug with more peace. cc issue https://github.com/neondatabase/cloud/issues/23583 --- storage_controller/src/persistence.rs | 93 +++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 880f203064..45f3108d6b 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -27,6 +27,8 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use rustls::client::danger::ServerCertVerifier; +use rustls::client::WebPkiServerVerifier; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; @@ -1281,14 +1283,95 @@ pub(crate) fn load_certs() -> anyhow::Result> { /// Loads the root certificates and constructs a client config suitable for connecting. /// This function is blocking. 
-pub fn client_config_with_root_certs() -> anyhow::Result { - Ok( +fn client_config_with_root_certs() -> anyhow::Result { + let client_config = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") + .expect("ring should support the default protocol versions"); + static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); + let do_cert_checks = + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok()); + Ok(if *do_cert_checks { + client_config .with_root_certificates(load_certs()?) - .with_no_client_auth(), - ) + .with_no_client_auth() + } else { + use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified}; + #[derive(Debug)] + struct AcceptAll(Arc); + impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = self.0.verify_server_cert( + end_entity, + intermediates, + server_name, + ocsp_response, + now, + ); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result + { + let r = self.0.verify_tls12_signature(message, cert, dss); + if let Err(err) = r { + tracing::info!( + "ignoring db connection 1.2 signature TLS validation error: {err:?}" + ); + return Ok(HandshakeSignatureValid::assertion()); + } + r + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result + { + let r = self.0.verify_tls13_signature(message, cert, dss); + if let Err(err) = r { + tracing::info!( + "ignoring db connection 1.3 signature TLS validation error: {err:?}" + ); + return Ok(HandshakeSignatureValid::assertion()); + } + r + } + fn supported_verify_schemes(&self) -> Vec { + self.0.supported_verify_schemes() + } + } + let verifier = AcceptAll( + WebPkiServerVerifier::builder_with_provider( + load_certs()?, + Arc::new(ring::default_provider()), + ) + .build()?, + ); + client_config + .dangerous() + .with_custom_certificate_verifier(Arc::new(verifier)) + .with_no_client_auth() + }) } fn establish_connection_rustls(config: &str) -> BoxFuture> { From 715e20343a64a6194feffba0e67d4f6a47d4c83a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Feb 2025 19:01:16 +0000 Subject: [PATCH 33/77] storage controller: improve scheduling of tenants created in PlacementPolicy::Secondary (#10590) ## Problem I noticed when onboarding lots of tenants that the AZ scheduling violation stat was climbing, before falling later as optimisations happened. This was happening because we first add the tenant with PlacementPolicy::Secondary, and then later go to PlacementPolicy::Attached, and the scheduler's behavior led to a bad AZ choice: 1. Create a secondary location in the non-preferred AZ 2. Upgrade to Attached where we promote that non-preferred-AZ location to attached and then create another secondary 3. 
Optimiser later realises we're in the wrong AZ and moves us ## Summary of changes - Extend some logging to give more information about AZs - When scheduling secondary location in PlacementPolicy::Secondary, select it as if we were attached: in this mode, our business goal is to have a warm pageserver location that we can make available as attached quickly if needed, therefore we want it to be in the preferred AZ. - Make optimize_secondary logic the same, so that it will consider a secondary location in the preferred AZ to be optimal when in PlacementPolicy::Secondary - When transitioning to from PlacementPolicy::Attached(N) to PlacementPolicy::Secondary, instead of arbitrarily picking a location to keep, prefer to keep the location in the preferred AZ --- storage_controller/src/scheduler.rs | 5 +- storage_controller/src/tenant_shard.rs | 155 ++++++++++++++++-- .../regress/test_storage_controller.py | 7 + 3 files changed, 155 insertions(+), 12 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index f5cab9dd57..f9e72862ae 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -774,8 +774,9 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", - scores.iter().map(|i| i.node_id().0).collect::>() + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, ); } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d344e27e31..302104dc97 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -710,7 +710,11 @@ impl TenantShard { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard::( + // + // We use [`AttachedShardTag`] because when a secondary location is the only one + // a shard has, we expect that its next use will be as an attached location: we want + // the tenant to be ready to warm up and run fast in their preferred AZ. + let node_id = scheduler.schedule_shard::( &[], &self.intent.preferred_az_id, context, @@ -719,9 +723,17 @@ impl TenantShard { modified = true; } while self.intent.secondary.len() > 1 { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); + // If we have multiple secondaries (e.g. when transitioning from Attached to Secondary and + // having just demoted our attached location), then we should prefer to keep the location + // in our preferred AZ. Tenants in Secondary mode want to be in the preferred AZ so that + // they have a warm location to become attached when transitioning back into Attached. 
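A short note on the sort-and-pop idiom in the hunk that follows: Rust orders `false` before `true`, so keying the sort on "is outside the preferred AZ" moves non-preferred secondaries to the back, where `pop()` removes them first. A self-contained illustration (node names and AZs are made up):

```rust
fn main() {
    let preferred_az = "az-a";
    // (node, az) pairs standing in for secondary locations; values are made up.
    let mut candidates = vec![("node-1", "az-b"), ("node-2", "az-a"), ("node-3", "az-c")];
    // false < true, so secondaries in the preferred AZ sort to the front.
    candidates.sort_by_key(|(_, az)| *az != preferred_az);
    // pop() removes from the back, i.e. a secondary outside the preferred AZ if one exists.
    let dropped = candidates.pop().unwrap();
    assert_ne!(dropped.1, preferred_az);
    println!("dropping secondary {:?}", dropped);
}
```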
+ + let mut candidates = self.intent.get_secondary().clone(); + // Sort to get secondaries outside preferred AZ last + candidates + .sort_by_key(|n| scheduler.get_node_az(n).as_ref() != self.preferred_az()); + let secondary_to_remove = candidates.pop().unwrap(); + self.intent.remove_secondary(scheduler, secondary_to_remove); modified = true; } } @@ -1079,12 +1091,31 @@ impl TenantShard { None => vec![], }; - let replacement = self.find_better_location::( - scheduler, - &schedule_context, - *secondary, - &exclude, - ); + let replacement = match &self.policy { + PlacementPolicy::Attached(_) => { + // Secondaries for an attached shard should be scheduled using `SecondaryShardTag` + // to avoid placing them in the preferred AZ. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Secondary => { + // In secondary-only mode, we want our secondary locations in the preferred AZ, + // so that they're ready to take over as an attached location when we transition + // into PlacementPolicy::Attached. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Detached => None, + }; + assert!(replacement != Some(*secondary)); if let Some(replacement) = replacement { // We have found a candidate and confirmed that its score is preferable @@ -2687,4 +2718,108 @@ pub(crate) mod tests { } Ok(()) } + + /// Check how the shard's scheduling behaves when in PlacementPolicy::Secondary mode. + #[test] + fn tenant_secondary_scheduling() -> anyhow::Result<()> { + let az_a = AvailabilityZone("az-a".to_string()); + let nodes = make_test_nodes( + 3, + &[ + az_a.clone(), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Secondary); + tenant_shard.intent.preferred_az_id = Some(az_a.clone()); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + + // Should have scheduled into the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch to PlacementPolicy::Attached + tenant_shard.policy = PlacementPolicy::Attached(1); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); + // Secondary should now be in non-preferred AZ + assert_ne!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + // Attached should be in preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.attached.unwrap()) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch back to PlacementPolicy::Secondary + 
tenant_shard.policy = PlacementPolicy::Secondary; + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + // When we picked a location to keep, we should have kept the one in the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + tenant_shard.intent.clear(&mut scheduler); + + Ok(()) + } } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 350fe31099..11a4d09202 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -373,6 +373,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. """ + neon_env_builder.num_azs = 3 env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -409,6 +410,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up "node_secondary" ][0] + # Check that the secondary's scheduling is stable + assert env.storage_controller.reconcile_all() == 0 + # Call into storage controller to onboard the tenant generation += 1 r = virtual_ps_http.tenant_location_conf( @@ -460,6 +464,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + # Check that onboarding did not result in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + # We should see the tenant is now attached to the pageserver managed # by the sharding service origin_tenants = origin_ps.http_client().tenant_list() From 06b45fd0fd38515cd431aab0d9baae7e13a52058 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 3 Feb 2025 20:23:12 +0100 Subject: [PATCH 34/77] utils/logging: add `critical!` macro and metric (#10641) ## Problem We don't currently have good alerts for critical errors, e.g. data loss/corruption. Touches #10094. ## Summary of changes Add a `critical!` macro and corresponding `libmetrics_tracing_event_count{level="critical"}` metric. This will: * Emit an `ERROR` log message with prefix `"CRITICAL:"` and a backtrace. * Increment `libmetrics_tracing_event_count{level="critical"}`, and indirectly `level="error"`. * Trigger a pageable alert (via the metric above). * In debug builds, panic the process. I'll add uses of the macro separately. --- libs/utils/src/logging.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index e205d60d74..753f05b6fd 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,6 +5,24 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +/// Logs a critical error, similarly to `tracing::error!`. This will: +/// +/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". 
+/// * Trigger a pageable alert (via the metric above). +/// * In debug builds, panic the process. +#[macro_export] +macro_rules! critical { + ($($arg:tt)*) => { + if cfg!(debug_assertions) { + panic!($($arg)*); + } + $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); + let backtrace = std::backtrace::Backtrace::capture(); + tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); + }; +} + #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { @@ -25,7 +43,10 @@ impl LogFormat { } } -struct TracingEventCountMetric { +pub struct TracingEventCountMetric { + /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro, + /// and also emit it as a regular error. These are thus double-counted, but that seems fine. + critical: IntCounter, error: IntCounter, warn: IntCounter, info: IntCounter, @@ -33,7 +54,7 @@ struct TracingEventCountMetric { trace: IntCounter, } -static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { +pub static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", @@ -46,6 +67,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| impl TracingEventCountMetric { fn new(vec: IntCounterVec) -> Self { Self { + critical: vec.with_label_values(&["critical"]), error: vec.with_label_values(&["error"]), warn: vec.with_label_values(&["warn"]), info: vec.with_label_values(&["info"]), @@ -54,6 +76,11 @@ impl TracingEventCountMetric { } } + // Allow public access from `critical!` macro. + pub fn inc_critical(&self) { + self.critical.inc(); + } + fn inc_for_level(&self, level: tracing::Level) { let counter = match level { tracing::Level::ERROR => &self.error, From d80cbb244312aacfacbc5b4b8b26efbe63752a56 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 19:42:40 +0000 Subject: [PATCH 35/77] build(deps): bump openssl from 0.10.66 to 0.10.70 in /test_runner/pg_clients/rust/tokio-postgres in the cargo group across 1 directory (#10642) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- test_runner/pg_clients/rust/tokio-postgres/Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 354fc15745..0b138bf167 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.103" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" 
dependencies = [
 "cc",
 "libc",

From c1be84197eb6f0f8c793bd251428501097b7c580 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 3 Feb 2025 15:55:47 -0500
Subject: [PATCH 36/77] feat(pageserver): preempt image layer generation if L0
 piles up (#10572)

## Problem

Image layer generation could block L0 compactions for a long time.

## Summary of changes

* Refactored the return value of `create_image_layers_for_*` functions
to make it self-explanatory.
* Preempt image layer generation in `Try` mode if L0 piles up.

Note that we might potentially run into a state where only the beginning
of the keyspace gets image coverage. In that case, we either need
to implement something to prioritize some keyspaces with image coverage,
or tune the image_creation_threshold to ensure that the frequency of
image creation can keep up with L0 compaction.

---------

Signed-off-by: Alex Chi Z
Co-authored-by: Erik Grinaker
---
 control_plane/src/pageserver.rs               |   5 +
 libs/pageserver_api/src/config.rs             |   9 +
 libs/pageserver_api/src/models.rs             |   8 +
 pageserver/src/tenant.rs                      |   3 +
 pageserver/src/tenant/config.rs               |  12 ++
 .../storage_layer/batch_split_writer.rs       |   4 +
 pageserver/src/tenant/timeline.rs             | 197 +++++++++++++-----
 pageserver/src/tenant/timeline/compaction.rs  |  30 ++-
 .../regress/test_attach_tenant_config.py      |   1 +
 9 files changed, 209 insertions(+), 60 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 383c174684..dd37bfc407 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -388,6 +388,11 @@ impl PageServerNode {
                 .map(|x| x.parse::())
                 .transpose()
                 .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+            image_creation_preempt_threshold: settings
+                .remove("image_creation_preempt_threshold")
+                .map(|x| x.parse::())
+                .transpose()
+                .context("Failed to parse 'image_creation_preempt_threshold' as integer")?,
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 422da0dc95..a0b5feea94 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -323,6 +323,10 @@ pub struct TenantConfigToml {
     // Expresed in multiples of checkpoint distance.
     pub image_layer_creation_check_threshold: u8,
 
+    // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
+    // Set to 0 to disable preemption.
+    pub image_creation_preempt_threshold: usize,
+
     /// The length for an explicit LSN lease request.
     /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
     #[serde(with = "humantime_serde")]
@@ -547,6 +551,10 @@ pub mod tenant_conf_defaults {
     // Relevant: https://github.com/neondatabase/neon/issues/3394
     pub const DEFAULT_GC_PERIOD: &str = "1 hr";
     pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
+    // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
+    // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
+    // want to enable this feature.
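To make the arithmetic described above concrete, here is a small standalone sketch of how the preempt threshold combines with the L0 `compaction_threshold` (parameter names follow the patch; the function itself is illustrative, not pageserver code):

```rust
// Illustrative only: mirrors the check described above, where a value of 0 disables
// preemption and the effective limit is preempt_threshold * compaction_threshold.
fn should_preempt_image_creation(
    image_creation_preempt_threshold: usize,
    compaction_threshold: usize,
    num_l0_layers: usize,
) -> bool {
    let limit = image_creation_preempt_threshold * compaction_threshold;
    limit != 0 && num_l0_layers >= limit
}

fn main() {
    // With the intended default of 3 and an L0 compaction_threshold of 10,
    // image layer creation yields once 30 or more L0 layers have piled up.
    assert!(should_preempt_image_creation(3, 10, 30));
    // A threshold of 0 disables preemption entirely.
    assert!(!should_preempt_image_creation(0, 10, 1_000));
    println!("preemption limit behaves as described");
}
```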
+ pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; @@ -605,6 +613,7 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 43447c67bd..19beb37ab3 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -498,6 +498,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub image_layer_creation_check_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_preempt_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length_for_ts: FieldPatch, @@ -544,6 +546,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub image_creation_preempt_threshold: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, @@ -581,6 +584,7 @@ impl TenantConfig { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -635,6 +639,9 @@ impl TenantConfig { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch.lsn_lease_length.apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts @@ -679,6 +686,7 @@ impl TenantConfig { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1914a95562..80a61eba92 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5486,6 +5486,9 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + image_creation_preempt_threshold: Some( + tenant_conf.image_creation_preempt_threshold, + ), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 139ed27bd2..972837dc44 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -357,6 +357,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_creation_preempt_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] 
#[serde(default)] @@ -453,6 +456,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + image_creation_preempt_threshold: self + .image_creation_preempt_threshold + .unwrap_or(global_conf.image_creation_preempt_threshold), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -504,6 +510,7 @@ impl TenantConfOpt { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -578,6 +585,9 @@ impl TenantConfOpt { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch .lsn_lease_length .map(|v| humantime::parse_duration(&v))? @@ -626,6 +636,7 @@ impl TenantConfOpt { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, @@ -689,6 +700,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + image_creation_preempt_threshold: value.image_creation_preempt_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: value.timeline_offloading, diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 22d8b81bcc..7da51c27df 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -166,6 +166,10 @@ impl BatchLayerWriter { // END: catch every error and do the recovery in the above section Ok(generated_layers) } + + pub fn pending_layer_num(&self) -> usize { + self.generated_layer_writers.len() + } } /// An image writer that takes images and produces multiple image layers. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6a8eaa4d9..d65b382e50 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -189,6 +189,14 @@ pub enum ImageLayerCreationMode { Initial, } +#[derive(Clone, Debug, Default)] +pub enum LastImageLayerCreationStatus { + Incomplete, // TODO: record the last key being processed + Complete, + #[default] + Initial, +} + impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -347,6 +355,8 @@ pub struct Timeline { // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, + pub(crate) last_image_layer_creation_status: ArcSwap, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. 
// It is needed in checks when we want to error on some operations @@ -936,9 +946,16 @@ pub(crate) enum ShutdownMode { Hard, } -struct ImageLayerCreationOutcome { - unfinished_image_layer: Option, - next_start_key: Key, +enum ImageLayerCreationOutcome { + /// We generated an image layer + Generated { + unfinished_image_layer: ImageLayerWriter, + }, + /// The key range is empty + Empty, + /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip + /// the image layer creation. + Skip, } /// Public interface functions @@ -2349,6 +2366,18 @@ impl Timeline { ) } + fn get_image_creation_preempt_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_creation_preempt_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_creation_preempt_threshold, + ) + } + /// Resolve the effective WAL receiver protocol to use for this tenant. /// /// Priority order is: @@ -2499,6 +2528,10 @@ impl Timeline { gc_info: std::sync::RwLock::new(GcInfo::default()), + last_image_layer_creation_status: ArcSwap::new(Arc::new( + LastImageLayerCreationStatus::default(), + )), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -4042,15 +4075,20 @@ impl Timeline { } let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( + let (generated_image_layers, is_complete) = self + .create_image_layers( &partitions, self.initdb_lsn, ImageLayerCreationMode::Initial, ctx, + LastImageLayerCreationStatus::Initial, ) - .await?, + .await?; + debug_assert!( + matches!(is_complete, LastImageLayerCreationStatus::Complete), + "init image generation mode must fully cover the keyspace" ); + layers_to_upload.extend(generated_image_layers); (layers_to_upload, None) } else { @@ -4370,7 +4408,6 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, img_range: Range, - start: Key, io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4458,26 +4495,23 @@ impl Timeline { lsn }, ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } /// Create an image layer for metadata keys. This function produces one image layer for all metadata /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it /// would not be too large to fit in a single image layer. + /// + /// Creating image layers for metadata keys are different from relational keys. Firstly, instead of + /// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse + /// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics + /// for metadata keys than relational keys, which is the number of delta files visited during the scan. 
#[allow(clippy::too_many_arguments)] async fn create_image_layer_for_metadata_keys( self: &Arc, @@ -4487,12 +4521,13 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, - start: Key, io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4517,10 +4552,7 @@ impl Timeline { ); if !trigger_generation && mode == ImageLayerCreationMode::Try { - return Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: img_range.end, - }); + return Ok(ImageLayerCreationOutcome::Skip); } if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4551,20 +4583,12 @@ impl Timeline { lsn } ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } @@ -4620,6 +4644,8 @@ impl Timeline { decision } + /// Returns the image layers generated and an enum indicating whether the process is fully completed. + /// true = we have generate all image layers, false = we preempt the process for L0 compaction. #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4627,7 +4653,8 @@ impl Timeline { lsn: Lsn, mode: ImageLayerCreationMode, ctx: &RequestContext, - ) -> Result, CreateImageLayersError> { + last_status: LastImageLayerCreationStatus, + ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); // We need to avoid holes between generated image layers. @@ -4641,10 +4668,23 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. 
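The comment above states the key invariant: consecutive image layers must tile the covered key range with no holes, which is why the code below only advances `start` to `img_range.end` once a layer for that range has been written or deliberately skipped. A toy illustration, with `u64` ranges standing in for pageserver keys:

```rust
// Toy model (u64 ranges instead of pageserver Keys): returns true if the image
// layers cover a contiguous span starting at 0 with no holes between them.
fn covers_without_holes(image_ranges: &[std::ops::Range<u64>]) -> bool {
    let mut next_start = 0u64;
    for r in image_ranges {
        if r.start != next_start {
            return false; // a hole: keys in next_start..r.start have no image coverage
        }
        next_start = r.end;
    }
    true
}

fn main() {
    assert!(covers_without_holes(&[0..100, 100..250, 250..300]));
    assert!(!covers_without_holes(&[0..100, 200..300])); // gap at 100..200
    println!("image layers must tile the keyspace");
}
```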
let mut start = Key::MIN; - let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status { + info!( + "resuming image layer creation: last_status={:?}", + last_status + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + let mut all_generated = true; + + let mut partition_processed = 0; + let total_partitions = partitioning.parts.len(); + for partition in partitioning.parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4717,17 +4757,13 @@ impl Timeline { .map_err(|_| CreateImageLayersError::Cancelled)?, ); - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key, - } = if !compact_metadata { + let outcome = if !compact_metadata { self.create_image_layer_for_rel_blocks( partition, image_layer_writer, lsn, ctx, img_range.clone(), - start, io_concurrency, ) .await? @@ -4739,18 +4775,58 @@ impl Timeline { ctx, img_range.clone(), mode, - start, io_concurrency, ) .await? }; - start = next_start_key; - if let Some(unfinished_image_layer) = unfinished_image_layer { - batch_image_writer.add_unfinished_image_writer( + match outcome { + ImageLayerCreationOutcome::Empty => { + // No data in this partition, so we don't need to create an image layer (for now). + // The next image layer should cover this key range, so we don't advance the `start` + // key. + } + ImageLayerCreationOutcome::Generated { unfinished_image_layer, - img_range, - lsn, - ); + } => { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range.clone(), + lsn, + ); + // The next image layer should be generated right after this one. + start = img_range.end; + } + ImageLayerCreationOutcome::Skip => { + // We don't need to create an image layer for this partition. + // The next image layer should NOT cover this range, otherwise + // the keyspace becomes empty (reads don't go past image layers). + start = img_range.end; + } + } + + partition_processed += 1; + + if let ImageLayerCreationMode::Try = mode { + // We have at least made some progress + if batch_image_writer.pending_layer_num() >= 1 { + // The `Try` mode is currently only used on the compaction path. We want to avoid + // image layer generation taking too long time and blocking L0 compaction. So in this + // mode, we also inspect the current number of L0 layers and skip image layer generation + // if there are too many of them. + let num_of_l0_layers = { + let layers = self.layers.read().await; + layers.layer_map()?.level0_deltas().len() + }; + let image_preempt_threshold = self.get_image_creation_preempt_threshold() + * self.get_compaction_threshold(); + if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { + tracing::info!( + "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}", + ); + all_generated = false; + break; + } + } } } @@ -4765,14 +4841,35 @@ impl Timeline { .open_mut()? 
.track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); - timer.stop_and_record(); + let duration = timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered if !image_layers.is_empty() { self.update_layer_visibility().await?; } - Ok(image_layers) + let total_layer_size = image_layers + .iter() + .map(|l| l.metadata().file_size) + .sum::(); + + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + + Ok(( + image_layers, + if all_generated { + LastImageLayerCreationStatus::Complete + } else { + LastImageLayerCreationStatus::Incomplete + }, + )) } /// Wait until the background initial logical size calculation is complete, or diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7244e946cb..9bd61bbac5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,7 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - RecordedDuration, Timeline, + LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -709,7 +709,7 @@ impl Timeline { .extend(sparse_partitioning.into_dense().parts); // 3. Create new image layers for partitions that have been modified "enough". - let image_layers = self + let (image_layers, outcome) = self .create_image_layers( &partitioning, lsn, @@ -722,10 +722,22 @@ impl Timeline { ImageLayerCreationMode::Try }, &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), ) .await?; + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(true); + } partitioning.parts.len() } Err(err) => { @@ -3232,11 +3244,7 @@ impl TimelineAdaptor { ranges: self.get_keyspace(key_range, lsn, ctx).await?, }; // TODO set proper (stateful) start. 
The create_image_layer_for_rel_blocks function mostly - let start = Key::MIN; - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key: _, - } = self + let outcome = self .timeline .create_image_layer_for_rel_blocks( &keyspace, @@ -3244,13 +3252,15 @@ impl TimelineAdaptor { lsn, ctx, key_range.clone(), - start, IoConcurrency::sequential(), ) .await?; - if let Some(image_layer_writer) = unfinished_image_layer { - let (desc, path) = image_layer_writer.finish(ctx).await?; + if let ImageLayerCreationOutcome::Generated { + unfinished_image_layer, + } = outcome + { + let (desc, path) = unfinished_image_layer.finish(ctx).await?; let image_layer = Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e88d245c8f..a4b9eabf8e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -184,6 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_compaction_enabled": True, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, + "image_creation_preempt_threshold": 5, } vps_http = env.storage_controller.pageserver_api() From e219d48bfe9991f40eb0d6c4f8713c9f8dc9eb05 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Feb 2025 16:56:55 -0500 Subject: [PATCH 37/77] refactor(pageserver): clearify compaction return value (#10643) ## Problem ## Summary of changes Make the return value of the set of compaction functions less confusing. Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 35 +++++++------ pageserver/src/tenant/tasks.rs | 5 +- pageserver/src/tenant/timeline.rs | 13 ++--- pageserver/src/tenant/timeline/compaction.rs | 55 +++++++++++++------- 4 files changed, 67 insertions(+), 41 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 80a61eba92..c1b408ed72 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -46,6 +46,7 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::compaction::CompactionOutcome; use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; @@ -2907,10 +2908,10 @@ impl Tenant { self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(false); + return Ok(CompactionOutcome::Done); } { @@ -2924,7 +2925,7 @@ impl Tenant { // to AttachedSingle state. 
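Stepping back from the hunk for a moment, the gist of this patch is that a named outcome reads better at call sites than a bare bool. A simplified sketch of the before/after (not the actual pageserver signatures):

```rust
// Simplified sketch, not the actual pageserver types: the scheduling decision that
// used to read `if has_pending_task { 0 } else { period }` becomes an explicit match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CompactionOutcome {
    Done,
    Pending,
}

fn next_compaction_delay_secs(outcome: CompactionOutcome, period_secs: u64) -> u64 {
    match outcome {
        // Pending work left: re-run compaction immediately.
        CompactionOutcome::Pending => 0,
        // Nothing left to do: wait for the regular period.
        CompactionOutcome::Done => period_secs,
    }
}

fn main() {
    assert_eq!(next_compaction_delay_secs(CompactionOutcome::Pending, 20), 0);
    assert_eq!(next_compaction_delay_secs(CompactionOutcome::Done, 20), 20);
}
```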
if !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(false); + return Ok(CompactionOutcome::Done); } } @@ -2967,7 +2968,7 @@ impl Tenant { // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("Skipping compaction due to previous failures"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let mut has_pending_task = false; @@ -2975,10 +2976,10 @@ impl Tenant { for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { // pending_task_left == None: cannot compact, maybe still pending tasks - // pending_task_left == Some(true): compaction task left - // pending_task_left == Some(false): no compaction task left + // pending_task_left == Some(Pending): compaction task left + // pending_task_left == Some(Done): no compaction task left let pending_task_left = if *can_compact { - let has_pending_l0_compaction_task = timeline + let compaction_outcome = timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await @@ -2996,27 +2997,27 @@ impl Tenant { .fail(&CIRCUIT_BREAKERS_BROKEN, e); } })?; - if has_pending_l0_compaction_task { - Some(true) + if let CompactionOutcome::Pending = compaction_outcome { + Some(CompactionOutcome::Pending) } else { let queue = { let guard = self.scheduled_compaction_tasks.lock().unwrap(); guard.get(timeline_id).cloned() }; if let Some(queue) = queue { - let has_pending_tasks = queue + let outcome = queue .iteration(cancel, ctx, &self.gc_block, timeline) .await?; - Some(has_pending_tasks) + Some(outcome) } else { - Some(false) + Some(CompactionOutcome::Done) } } } else { None }; - has_pending_task |= pending_task_left.unwrap_or(false); - if pending_task_left == Some(false) && *can_offload { + has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending); + if pending_task_left == Some(CompactionOutcome::Done) && *can_offload { pausable_failpoint!("before-timeline-auto-offload"); match offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) @@ -3036,7 +3037,11 @@ impl Tenant { .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(has_pending_task) + Ok(if has_pending_task { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } /// Cancel scheduled compaction tasks diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 3725e2f7fc..b6b64d02dd 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -11,6 +11,7 @@ use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::throttle::Stats; +use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use rand::Rng; @@ -206,10 +207,10 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { .run(tenant.compaction_iteration(&cancel, &ctx)) .await; match output { - Ok(has_pending_task) => { + Ok(outcome) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if has_pending_task { + sleep_duration = if let CompactionOutcome::Pending = outcome { Duration::ZERO } else { period diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d65b382e50..11c0bbdfe5 100644 --- 
a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -18,6 +18,7 @@ use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; +use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; use futures::{stream::FuturesUnordered, StreamExt}; @@ -1679,7 +1680,7 @@ impl Timeline { cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result { + ) -> Result { self.compact_with_options( cancel, CompactOptions { @@ -1701,7 +1702,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1721,8 +1722,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(false), - _ = cancel.cancelled() => return Ok(false), + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Done), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1730,13 +1731,13 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; - Ok(false) + Ok(CompactionOutcome::Done) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, }; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9bd61bbac5..7dd37d7232 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -262,13 +262,13 @@ impl GcCompactionQueue { ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, - ) -> Result { + ) -> Result { let _one_op_at_a_time_guard = self.consumer_lock.lock().await; let has_pending_tasks; let (id, item) = { let mut guard = self.inner.lock().unwrap(); let Some((id, item)) = guard.queued.pop_front() else { - return Ok(false); + return Ok(CompactionOutcome::Done); }; guard.running = Some((id, item.clone())); has_pending_tasks = !guard.queued.is_empty(); @@ -323,7 +323,11 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(has_pending_tasks) + Ok(if has_pending_tasks { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } #[allow(clippy::type_complexity)] @@ -589,6 +593,17 @@ impl CompactionStatistics { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompactionOutcome { + #[default] + /// No layers need to be compacted after this round. Compaction doesn't need + /// to be immediately scheduled. + Done, + /// Still has pending layers to be compacted after this round. Ideally, the scheduler + /// should immediately schedule another compaction. 
+ Pending, +} + impl Timeline { /// TODO: cancellation /// @@ -598,7 +613,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { if options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) @@ -606,7 +621,7 @@ impl Timeline { self.compact_with_gc(cancel, options, ctx) .await .map_err(CompactionError::Other)?; - return Ok(false); + return Ok(CompactionOutcome::Done); } if options.flags.contains(CompactFlags::DryRun) { @@ -666,9 +681,9 @@ impl Timeline { // Define partitioning schema if needed // 1. L0 Compact - let fully_compacted = { + let l0_compaction_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self + let l0_compaction_outcome = self .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), @@ -676,15 +691,15 @@ impl Timeline { ) .await?; timer.stop_and_record(); - fully_compacted + l0_compaction_outcome }; - if !fully_compacted { + if let CompactionOutcome::Pending = l0_compaction_outcome { // Yield and do not do any other kind of compaction. True means // that we have pending L0 compaction tasks and the compaction scheduler // will prioritize compacting this tenant/timeline again. info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); - return Ok(true); + return Ok(CompactionOutcome::Pending); } // 2. Repartition and create image layers if necessary @@ -736,7 +751,7 @@ impl Timeline { if let LastImageLayerCreationStatus::Incomplete = outcome { // Yield and do not do any other kind of compaction. info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(true); + return Ok(CompactionOutcome::Pending); } partitioning.parts.len() } @@ -765,7 +780,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(false) + Ok(CompactionOutcome::Done) } /// Check for layers that are elegible to be rewritten: @@ -1022,11 +1037,11 @@ impl Timeline { target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - fully_compacted, + outcome, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -1055,12 +1070,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(true); + return Ok(CompactionOutcome::Done); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(fully_compacted) + Ok(outcome) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. @@ -1602,7 +1617,11 @@ impl Timeline { .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), - fully_compacted, + outcome: if fully_compacted { + CompactionOutcome::Done + } else { + CompactionOutcome::Pending + }, }) } } @@ -1613,7 +1632,7 @@ struct CompactLevel0Phase1Result { deltas_to_compact: Vec, // Whether we have included all L0 layers, or selected only part of them due to the // L0 compaction size limit. - fully_compacted: bool, + outcome: CompactionOutcome, } #[derive(Default)] From 8107140f7f8145148f8b9ffd007046c81c724c25 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Feb 2025 11:35:43 +0100 Subject: [PATCH 38/77] Refactor compute dockerfile (#10371) Refactor how extensions are built in compute Dockerfile 1. 
Rename some of the extension layers, so that names correspond more precisely to the upstream repository name and the source directory name. For example, instead of "pg-jsonschema-pg-build", spell it "pg_jsonschema-build". Some of the layer names had the extra "pg-" part, and some didn't; harmonize on not having it. And use an underscore if the upstream project name uses an underscore. 2. Each extension now consists of two dockerfile targets: [extension]-src and [extension]-build. By convention, the -src target downloads the sources and applies any neon-specific patches if necessary. The source tarball is downloaded and extracted under /ext-src. For example, the 'pgvector' extension creates the following files and directory: /ext-src/pgvector.tar.gz # original tarball /ext-src/pgvector.patch # neon-specific patch, copied from patches/ dir /ext-src/pgvector-src/ # extracted tarball, with patch applied This separation avoids re-downloading the sources every time the extension is recompiled. The 'extension-tests' target also uses the [extension]-src layers, by copying the /ext-src/ dirs from all the extensions together into one image. This refactoring came about when I was experimenting with different ways of splitting up the Dockerfile so that each extension would be in a separate file. That's not part of this PR yet, but this is a good step in modularizing the extensions. --- .github/workflows/build_and_test.yml | 2 +- compute/compute-node.Dockerfile | 896 ++++++++++++++++++--------- 2 files changed, 599 insertions(+), 299 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1274543429..5a4bdecb99 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -682,7 +682,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - target: neon-pg-ext-test + target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 32226c56a5..ea29630001 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1,3 +1,81 @@ +# +# This Dockerfile builds the compute image. It is built multiple times to produce +# different images for each PostgreSQL major version. +# +# We use Debian as the base for all the steps. The production images use Debian bookworm +# for v17, and Debian bullseye for older PostgreSQL versions. +# +# ## Intermediary layers +# +# build-tools: This contains Rust compiler toolchain and other tools needed at compile +# time. This is also used for the storage builds. This image is defined in +# build-tools.Dockerfile. +# +# build-deps: Contains C compiler, other build tools, and compile-time dependencies +# needed to compile PostgreSQL and most extensions. (Some extensions need +# extra tools and libraries that are not included in this image. They are +# installed in the extension-specific build stages.) +# +# pg-build: Result of compiling PostgreSQL. The PostgreSQL binaries are copied from +# this to the final image. This is also used as the base for compiling all +# the extensions. +# +# compute-tools: This contains compute_ctl, the launcher program that starts Postgres +# in Neon. 
It also contains a few other tools that are built from the +# sources from this repository and used in compute VMs: 'fast_import' and +# 'local_proxy' +# +# ## Extensions +# +# By convention, the build of each extension consists of two layers: +# +# {extension}-src: Contains the source tarball, possible neon-specific patches, and +# the extracted tarball with the patches applied. All of these are +# under the /ext-src/ directory. +# +# {extension}-build: Contains the installed extension files, under /usr/local/pgsql +# (in addition to the PostgreSQL binaries inherited from the pg-build +# image). A few extensions need extra libraries or other files +# installed elsewhere in the filesystem. They are installed by ONBUILD +# directives. +# +# These are merged together into two layers: +# +# all-extensions: All the extension -build layers merged together +# +# extension-tests: All the extension -src layers merged together. This is used by the +# extension tests. The tests are executed against the compiled image, +# but the tests need test scripts, expected result files etc. from the +# original sources, which are not included in the binary image. +# +# ## Extra components +# +# These are extra included in the compute image, but are not directly used by PostgreSQL +# itself. +# +# pgbouncer: pgbouncer and its configuration +# +# sql_exporter: Metrics exporter daemon. +# +# postgres_exporter: Another metrics exporter daemon, for different sets of metrics. +# +# The configuration files for the metrics exporters are under etc/ directory. We use +# a templating system to handle variations between different PostgreSQL versions, +# building slightly different config files for each PostgreSQL version. +# +# +# ## Final image +# +# The final image puts together the PostgreSQL binaries (pg-build), the compute tools +# (compute-tools), all the extensions (all-extensions) and the extra components into +# one image. +# +# VM image: The final image built by this dockerfile isn't actually the final image that +# we use in computes VMs. There's an extra step that adds some files and makes other +# small adjustments, and builds the QCOV2 filesystem image suitable for using in a VM. +# That step is done by the 'vm-builder' tool. See the vm-compute-node-image job in the +# build_and_test.yml github workflow for how that's done. + ARG PG_VERSION ARG REPOSITORY=neondatabase ARG IMAGE=build-tools @@ -122,17 +200,9 @@ ENV PATH="/usr/local/pgsql/bin:$PATH" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### -FROM pg-build AS postgis-build +FROM build-deps AS postgis-src ARG DEBIAN_VERSION ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ - libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ - libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc \ - && apt clean && rm -rf /var/lib/apt/lists/* - # Postgis 3.5.0 requires SFCGAL 1.4+ # @@ -141,6 +211,7 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. 
# # Use new version only for v17 +WORKDIR /ext-src RUN case "${DEBIAN_VERSION}" in \ "bookworm") \ export SFCGAL_VERSION=1.4.1 \ @@ -154,15 +225,12 @@ RUN case "${DEBIAN_VERSION}" in \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ - mkdir -p /sfcgal && \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ - ninja clean && cp -R /sfcgal/* / + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . # Postgis 3.5.0 supports v17 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ @@ -178,8 +246,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - ./autogen.sh && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . + +# This is reused for pgrouting +FROM pg-build AS postgis-build-deps +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ + libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* + +FROM postgis-build-deps AS postgis-build +COPY --from=postgis-src /ext-src/ /ext-src/ +WORKDIR /ext-src/sfcgal-src +RUN cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / + +WORKDIR /ext-src/postgis-src +RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -202,12 +289,23 @@ RUN case "${PG_VERSION}" in \ cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis +######################################################################################### +# +# Layer "pgrouting-build" +# Build pgrouting. Note: This depends on the postgis-build-deps layer built above +# +######################################################################################### + # Uses versioned libraries, i.e. libpgrouting-3.4 # and may introduce function signature changes between releases # i.e. 
release 3.5.0 has new signature for pg_dijkstra function # # Use new version only for v17 # last release v3.6.2 - Mar 30, 2024 +FROM build-deps AS pgrouting-src +ARG DEBIAN_VERSION +ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ @@ -223,8 +321,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/pgRouting/pgrouting/archive/v${PGROUTING_VERSION}.tar.gz -O pgrouting.tar.gz && \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . + +FROM postgis-build-deps AS pgrouting-build +COPY --from=pgrouting-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgrouting-src +RUN mkdir build && cd build && \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -236,15 +338,11 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM pg-build AS plv8-build +FROM build-deps AS plv8-src ARG PG_VERSION +WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch - -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - ninja-build python3-dev libncurses5 binutils clang \ - && apt clean && rm -rf /var/lib/apt/lists/* +COPY compute/patches/plv8-3.1.10.patch . # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -268,9 +366,20 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ + if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + +FROM pg-build AS plv8-build +ARG PG_VERSION +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=plv8-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plv8-src +RUN \ # generate and copy upgrade scripts - mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ + make generate_upgrades && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ @@ -298,16 +407,28 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM pg-build AS h3-pg-build +FROM build-deps AS h3-pg-src ARG PG_VERSION +WORKDIR /ext-src # not version-specific # last release v4.1.0 - Jan 18, 2023 RUN mkdir -p /h3/usr/ && \ wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . 
+ +# not version-specific +# last release v4.1.3 - Jul 26, 2023 +WORKDIR /ext-src +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ + echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . + +FROM pg-build AS h3-pg-build +COPY --from=h3-pg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/h3-src +RUN mkdir build && cd build && \ cmake .. -GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ && ninja -j $(getconf _NPROCESSORS_ONLN) && \ @@ -315,11 +436,8 @@ RUN mkdir -p /h3/usr/ && \ cp -R /h3/usr / && \ rm -rf build -# not version-specific -# last release v4.1.3 - Jul 26, 2023 -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ - echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ +WORKDIR /ext-src/h3-pg-src +RUN ls -l && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -327,19 +445,24 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 ######################################################################################### # -# Layer "unit-pg-build" +# Layer "postgresql-unit-build" # compile unit extension # ######################################################################################### -FROM pg-build AS unit-pg-build +FROM build-deps AS postgresql-unit-src ARG PG_VERSION # not version-specific # last release 7.9 - Sep 15, 2024 +WORKDIR /ext-src RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . + +FROM pg-build AS postgresql-unit-build +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/postgresql-unit-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. @@ -350,14 +473,15 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - ######################################################################################### # -# Layer "vector-pg-build" +# Layer "pgvector-build" # compile pgvector extension # ######################################################################################### -FROM pg-build AS vector-pg-build +FROM build-deps AS pgvector-src ARG PG_VERSION -COPY compute/patches/pgvector.patch /pgvector.patch +WORKDIR /ext-src +COPY compute/patches/pgvector.patch . # By default, pgvector Makefile uses `-march=native`. 
We don't want that, # because we build the images on different machines than where we run them. @@ -370,74 +494,94 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ - patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + patch -p1 < /ext-src/pgvector.patch + +FROM pg-build AS pgvector-build +COPY --from=pgvector-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgvector-src +RUN make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### # -# Layer "pgjwt-pg-build" +# Layer "pgjwt-build" # compile pgjwt extension # ######################################################################################### -FROM pg-build AS pgjwt-pg-build +FROM build-deps AS pgjwt-src ARG PG_VERSION # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 +WORKDIR /ext-src RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgjwt-build +COPY --from=pgjwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgjwt-src +RUN make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### # -# Layer "hypopg-pg-build" +# Layer "hypopg-build" # compile hypopg extension # ######################################################################################### -FROM pg-build AS hypopg-pg-build +FROM build-deps AS hypopg-src ARG PG_VERSION # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 +WORKDIR /ext-src RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS hypopg-build +COPY --from=hypopg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hypopg-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### # -# Layer "pg-hashids-pg-build" +# Layer "pg_hashids-build" # compile pg_hashids extension # ######################################################################################### -FROM pg-build AS pg-hashids-pg-build +FROM build-deps AS pg_hashids-src ARG PG_VERSION # not version-specific # last release v1.2.1 -Jan 12, 2018 +WORKDIR /ext-src RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hashids-build +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hashids-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### # -# Layer "rum-pg-build" +# Layer "rum-build" # compile rum extension # ######################################################################################### -FROM pg-build AS rum-pg-build +FROM build-deps AS rum-src ARG PG_VERSION -COPY compute/patches/rum.patch /rum.patch +WORKDIR /ext-src +COPY compute/patches/rum.patch . # supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25 # doesn't use releases since 1.3.13 - Sep 19, 2022 @@ -445,110 +589,140 @@ COPY compute/patches/rum.patch /rum.patch RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \ echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ - patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + patch -p1 < /ext-src/rum.patch + +FROM pg-build AS rum-build +COPY --from=rum-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rum-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### # -# Layer "pgtap-pg-build" +# Layer "pgtap-build" # compile pgTAP extension # ######################################################################################### -FROM pg-build AS pgtap-pg-build +FROM build-deps AS pgtap-src ARG PG_VERSION # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 +WORKDIR /ext-src RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgtap-build +COPY --from=pgtap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgtap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### # -# Layer "ip4r-pg-build" +# Layer "ip4r-build" # compile ip4r extension # ######################################################################################### -FROM pg-build AS ip4r-pg-build +FROM build-deps AS ip4r-src ARG PG_VERSION # not version-specific # last release v2.4.2 - Jul 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . + +FROM pg-build AS ip4r-build +COPY --from=ip4r-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ip4r-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### # -# Layer "prefix-pg-build" +# Layer "prefix-build" # compile Prefix extension # ######################################################################################### -FROM pg-build AS prefix-pg-build +FROM build-deps AS prefix-src ARG PG_VERSION # not version-specific # last release v1.2.10 - Jul 5, 2023 +WORKDIR /ext-src RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS prefix-build +COPY --from=prefix-src /ext-src/ /ext-src/ +WORKDIR /ext-src/prefix-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### # -# Layer "hll-pg-build" +# Layer "hll-build" # compile hll extension # ######################################################################################### -FROM pg-build AS hll-pg-build +FROM build-deps AS hll-src ARG PG_VERSION # not version-specific # last release v2.18 - Aug 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . + +FROM pg-build AS hll-build +COPY --from=hll-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hll-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### # -# Layer "plpgsql-check-pg-build" +# Layer "plpgsql_check-build" # compile plpgsql_check extension # ######################################################################################### -FROM pg-build AS plpgsql-check-pg-build +FROM build-deps AS plpgsql_check-src ARG PG_VERSION # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . + +FROM pg-build AS plpgsql_check-build +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plpgsql_check-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### # -# Layer "timescaledb-pg-build" +# Layer "timescaledb-build" # compile timescaledb extension # ######################################################################################### -FROM pg-build AS timescaledb-pg-build +FROM build-deps AS timescaledb-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ @@ -565,8 +739,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . + +FROM pg-build AS timescaledb-build +COPY --from=timescaledb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/timescaledb-src +RUN ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ @@ -574,14 +752,15 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-hint-plan-pg-build" +# Layer "pg_hint_plan-build" # compile pg_hint_plan extension # ######################################################################################### -FROM pg-build AS pg-hint-plan-pg-build +FROM build-deps AS pg_hint_plan-src ARG PG_VERSION # version-specific, has separate releases for each version +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ @@ -605,50 +784,51 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hint_plan-build +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hint_plan-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control ######################################################################################### # -# Layer "pg-cron-pg-build" +# Layer "pg_cron-build" # compile pg_cron extension # ######################################################################################### -FROM pg-build AS pg-cron-pg-build +FROM build-deps AS pg_cron-src ARG PG_VERSION # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. +WORKDIR /ext-src +COPY compute/patches/pg_cron.patch . RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + patch < /ext-src/pg_cron.patch + +FROM pg-build AS pg_cron-build +COPY --from=pg_cron-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_cron-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control ######################################################################################### # -# Layer "rdkit-pg-build" +# Layer "rdkit-build" # compile rdkit extension # ######################################################################################### -FROM pg-build AS rdkit-pg-build +FROM build-deps AS rdkit-src ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - libboost-iostreams1.74-dev \ - libboost-regex1.74-dev \ - libboost-serialization1.74-dev \ - libboost-system1.74-dev \ - libeigen3-dev \ - libboost-all-dev \ - && apt clean && rm -rf /var/lib/apt/lists/* - # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 # @@ -656,12 +836,7 @@ RUN apt update && \ # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find -# pg_config. For some reason the rdkit cmake script doesn't work with just that, -# however. By also adding /usr/local/pgsql, it works, which is weird because there -# are no executables in that directory. -ENV PATH="/usr/local/pgsql:$PATH" - +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -677,8 +852,28 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ - cmake \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . + +FROM pg-build AS rdkit-build +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + libboost-iostreams1.74-dev \ + libboost-regex1.74-dev \ + libboost-serialization1.74-dev \ + libboost-system1.74-dev \ + libeigen3-dev \ + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=rdkit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rdkit-src + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" +RUN cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ -D RDK_BUILD_AVALON_SUPPORT=ON \ @@ -710,47 +905,57 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-uuidv7-pg-build" +# Layer "pg_uuidv7-build" # compile pg_uuidv7 extension # ######################################################################################### -FROM pg-build AS pg-uuidv7-pg-build +FROM build-deps AS pg_uuidv7-src ARG PG_VERSION # not version-specific # last release v1.6.0 - Oct 9, 2024 +WORKDIR /ext-src RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_uuidv7-build +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_uuidv7-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control ######################################################################################### # -# Layer "pg-roaringbitmap-pg-build" +# Layer "pg_roaringbitmap-build" # compile pg_roaringbitmap extension # ######################################################################################### -FROM pg-build AS pg-roaringbitmap-pg-build +FROM build-deps AS pg_roaringbitmap-src ARG PG_VERSION # not version-specific # last release v0.5.4 - Jun 28, 2022 +WORKDIR /ext-src RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pg_roaringbitmap-build +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_roaringbitmap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control ######################################################################################### # -# Layer "pg-semver-pg-build" +# Layer "pg_semver-build" # compile pg_semver extension # ######################################################################################### -FROM pg-build AS pg-semver-pg-build +FROM build-deps AS pg_semver-src ARG PG_VERSION # Release 0.40.0 breaks backward compatibility with previous versions @@ -758,6 +963,7 @@ ARG PG_VERSION # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -773,22 +979,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/theory/pg-semver/archive/refs/tags/v${SEMVER_VERSION}.tar.gz -O pg_semver.tar.gz && \ echo "${SEMVER_CHECKSUM} pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_semver-build +COPY --from=pg_semver-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_semver-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control ######################################################################################### # -# Layer "pg-embedding-pg-build" +# Layer "pg_embedding-build" # compile pg_embedding extension # ######################################################################################### -FROM pg-build AS pg-embedding-pg-build +FROM build-deps AS pg_embedding-src +ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it -ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -799,29 +1010,44 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_embedding-build +COPY --from=pg_embedding-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pg_embedding-src ]; then \ + cd pg_embedding-src && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install; \ + fi ######################################################################################### # -# Layer "pg-anon-pg-build" +# Layer "pg_anon-build" # compile anon extension # ######################################################################################### -FROM pg-build AS pg-anon-pg-build +FROM build-deps AS pg_anon-src ARG PG_VERSION # This is an experimental extension, never got to real production. 
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +WORKDIR /ext-src RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_anon-build +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN if [ -d pg_anon-src ]; then \ + cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \ + fi ######################################################################################### # @@ -887,50 +1113,57 @@ USER root ######################################################################################### # -# Layers "pg-onnx-build" and "pgrag-pg-build" +# Layers "pg-onnx-build" and "pgrag-build" # Compile "pgrag" extensions # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-onnx-build +FROM build-deps AS pgrag-src +ARG PG_VERSION +WORKDIR /ext-src +RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + echo "#nothing to test here" > neon-test.sh + +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pgrag-build +COPY --from=pgrag-src /ext-src/ /ext-src/ + +# Install build-time dependencies # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise +WORKDIR /ext-src/onnxruntime-src RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ - python3 python3-pip python3-venv && \ + python3 python3-pip python3-venv protobuf-compiler && \ apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ - python3 -m pip install cmake==3.30.5 && \ - wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ - mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + python3 -m pip install cmake==3.30.5 + +RUN . 
venv/bin/activate && \ ./build.sh --config Release --parallel --cmake_generator Ninja \ --skip_submodule_sync --skip_tests --allow_running_as_root - -FROM pg-onnx-build AS pgrag-pg-build - -RUN apt update && apt install --no-install-recommends --no-install-suggests -y protobuf-compiler \ - && apt clean && rm -rf /var/lib/apt/lists/* && \ - wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ - mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ - \ - cd exts/rag && \ +WORKDIR /ext-src/pgrag-src +RUN cd exts/rag && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ - \ - cd ../rag_bge_small_en_v15 && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control + +RUN cd exts/rag_bge_small_en_v15 && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ - \ - cd ../rag_jina_reranker_v1_tiny_en && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control + +RUN cd exts/rag_jina_reranker_v1_tiny_en && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control @@ -938,17 +1171,23 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p ######################################################################################### # -# Layer "pg-jsonschema-pg-build" +# Layer "pg_jsonschema-build" # Compile "pg_jsonschema" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build +FROM build-deps AS pg_jsonschema-src ARG PG_VERSION # last release v0.3.3 - Oct 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pg_jsonschema-build +COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_jsonschema-src +RUN \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). 
@@ -961,55 +1200,69 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar. ######################################################################################### # -# Layer "pg-graphql-pg-build" +# Layer "pg_graphql-build" # Compile "pg_graphql" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build +FROM build-deps AS pg_graphql-src ARG PG_VERSION # last release v1.5.9 - Oct 16, 2024 +WORKDIR /ext-src +COPY compute/patches/pg_graphql.patch . RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + patch -p1 < /ext-src/pg_graphql.patch + + +FROM rust-extensions-build-pgrx12 AS pg_graphql-build +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_graphql-src +RUN cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control ######################################################################################### # -# Layer "pg-tiktoken-build" +# Layer "pg_tiktoken-build" # Compile "pg_tiktoken" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build +FROM build-deps AS pg_tiktoken-src ARG PG_VERSION # doesn't use releases # 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024 +WORKDIR /ext-src RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_tiktoken-build +COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_tiktoken-src +RUN cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control ######################################################################################### # -# Layer "pg-pgx-ulid-build" +# Layer "pgx_ulid-build" # Compile "pgx_ulid" extension for v16 and below # ######################################################################################### -FROM rust-extensions-build AS pg-pgx-ulid-build +FROM build-deps AS pgx_ulid-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15" | "v16") \ ;; \ @@ -1020,20 +1273,28 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build AS pgx_ulid-build +COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control; \ + fi ######################################################################################### # -# Layer "pg-pgx-ulid-pgrx12-build" +# Layer "pgx_ulid-pgrx12-build" # Compile "pgx_ulid" extension for v17 and up # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +FROM build-deps AS pgx_ulid-pgrx12-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ ;; \ @@ -1044,23 +1305,32 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pgx_ulid-pgrx12-build +ARG PG_VERSION +WORKDIR /ext-src +COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control; \ + fi ######################################################################################### # -# Layer "pg-session-jwt-build" +# Layer "pg_session_jwt-build" # Compile "pg_session_jwt" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build +FROM build-deps AS pg_session_jwt-src ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs +WORKDIR /ext-src RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ @@ -1068,8 +1338,12 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - cargo pgrx install --release + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_session_jwt-build +COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_session_jwt-src +RUN cargo pgrx install --release ######################################################################################### # @@ -1078,15 +1352,20 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 # ######################################################################################### -FROM pg-build AS wal2json-pg-build +FROM build-deps AS wal2json-src ARG PG_VERSION # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 +WORKDIR /ext-src RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS wal2json-build +COPY --from=wal2json-src /ext-src/ /ext-src/ +WORKDIR /ext-src/wal2json-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### @@ -1095,15 +1374,20 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. # compile pg_ivm extension # ######################################################################################### -FROM pg-build AS pg-ivm-build +FROM build-deps AS pg_ivm-src ARG PG_VERSION # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 +WORKDIR /ext-src RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ - mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_ivm-build +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_ivm-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control @@ -1113,15 +1397,20 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM pg-build AS pg-partman-build +FROM build-deps AS pg_partman-src ARG PG_VERSION # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 +WORKDIR /ext-src RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ - mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_partman-build +COPY --from=pg_partman-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_partman-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control @@ -1131,13 +1420,19 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz # compile pg_mooncake extension # ######################################################################################### -FROM rust-extensions-build AS pg-mooncake-build +FROM build-deps AS pg_mooncake-src ARG PG_VERSION - +WORKDIR /ext-src RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . 
&& \ - make release -j $(getconf _NPROCESSORS_ONLN) && \ + echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ + chmod a+x neon-test.sh + +FROM rust-extensions-build AS pg_mooncake-build +COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_mooncake-src +RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control @@ -1148,80 +1443,92 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/p # ######################################################################################### -FROM pg-build AS pg-repack-build +FROM build-deps AS pg_repack-src ARG PG_VERSION - +WORKDIR /ext-src RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ - mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build AS pg_repack-build +COPY --from=pg_repack-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_repack-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # -# Layer "neon-pg-ext-build" +# Layer "neon-ext-build" # compile neon extensions # ######################################################################################### -FROM build-deps AS neon-pg-ext-build +FROM pg-build AS neon-ext-build +ARG PG_VERSION + +COPY pgxn/ pgxn/ +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon_utils \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon_test_utils \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon_rmgr \ + -s install + +######################################################################################### +# +# Layer "all-extensions" +# Bundle together all the extensions +# +######################################################################################### +FROM build-deps AS all-extensions ARG PG_VERSION # Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=postgis-build /sfcgal/* / +COPY --from=pgrouting-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / -COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgtap-pg-build 
/usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql -COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-repack-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY pgxn/ pgxn/ +COPY --from=postgresql-unit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgvector-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=ip4r-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=prefix-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hll-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plpgsql_check-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hint_plan-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_session_jwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN 
make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon_test_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon_rmgr \ - -s install +COPY --from=neon-ext-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### # @@ -1302,8 +1609,8 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 # Clean up postgres folder before inclusion # ######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql +FROM neon-ext-build AS postgres-cleanup-layer +COPY --from=all-extensions /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp @@ -1332,66 +1639,59 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # -# Layer neon-pg-ext-test +# Layer extension-tests # ######################################################################################### -FROM neon-pg-ext-build AS neon-pg-ext-test +FROM pg-build AS extension-tests ARG PG_VERSION RUN mkdir /ext-src COPY --from=pg-build /postgres /postgres -#COPY --from=postgis-build /postgis.tar.gz /ext-src/ -#COPY --from=postgis-build /sfcgal/* /usr -COPY --from=plv8-build /plv8.tar.gz /ext-src/ -#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ -COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.patch /ext-src/ -COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src -#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src -COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src -COPY compute/patches/pg_graphql.patch /ext-src -#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src -COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src -COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -COPY --from=rum-pg-build /rum.tar.gz /ext-src -COPY compute/patches/rum.patch /ext-src -#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src -COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src -COPY --from=prefix-pg-build /prefix.tar.gz /ext-src -COPY --from=hll-pg-build /hll.tar.gz /ext-src -COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src -#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src -COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +#COPY --from=postgis-src /ext-src/ /ext-src/ +COPY --from=plv8-src /ext-src/ /ext-src/ +#COPY --from=h3-pg-src /ext-src/ /ext-src/ +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +COPY --from=pgvector-src /ext-src/ /ext-src/ +COPY --from=pgjwt-src /ext-src/ /ext-src/ +#COPY --from=pgrag-src /ext-src/ /ext-src/ +#COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +COPY --from=pg_graphql-src /ext-src/ /ext-src/ 
+#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +COPY --from=hypopg-src /ext-src/ /ext-src/ +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +COPY --from=rum-src /ext-src/ /ext-src/ +#COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=ip4r-src /ext-src/ /ext-src/ +COPY --from=prefix-src /ext-src/ /ext-src/ +COPY --from=hll-src /ext-src/ /ext-src/ +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +#COPY --from=timescaledb-src /ext-src/ /ext-src/ +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src -COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src -COPY compute/patches/pg_cron.patch /ext-src -#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src -COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src -COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src -COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src -#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src -#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src -COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ - do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ - rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ - || exit 1; rm -f $f; done -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch +COPY --from=pg_cron-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +#COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +#COPY --from=rdkit-src /ext-src/ /ext-src/ +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +COPY --from=pg_semver-src /ext-src/ /ext-src/ +#COPY --from=pg_embedding-src /ext-src/ /ext-src/ +#COPY --from=wal2json-src /ext-src/ /ext-src/ +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +COPY --from=pg_partman-src /ext-src/ /ext-src/ +#COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +#COPY --from=pg_repack-src /ext-src/ /ext-src/ + COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 Date: Tue, 4 Feb 2025 14:49:44 +0100 Subject: [PATCH 39/77] Add support for pgjwt test (#10611) ## Problem We don't currently test pgjwt, while it is based on pg_prove and can be easily added ## Summary of changes The test for pgjwt was added. 
--- docker-compose/docker_compose_test.sh | 1 + docker-compose/ext-src/pgjwt-src/neon-test.sh | 4 ++++ .../ext-src/pgjwt-src/test-upgrade.patch | 15 +++++++++++++++ docker-compose/ext-src/pgjwt-src/test-upgrade.sh | 5 +++++ docker-compose/test_extensions_upgrade.sh | 5 +++-- 5 files changed, 28 insertions(+), 2 deletions(-) create mode 100755 docker-compose/ext-src/pgjwt-src/neon-test.sh create mode 100644 docker-compose/ext-src/pgjwt-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pgjwt-src/test-upgrade.sh diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e0c537edf3..c4ff86ab66 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do if [ $pg_version -ge 16 ]; then docker cp ext-src $TEST_CONTAINER_NAME:/ + docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config diff --git a/docker-compose/ext-src/pgjwt-src/neon-test.sh b/docker-compose/ext-src/pgjwt-src/neon-test.sh new file mode 100755 index 0000000000..95af0be77b --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/neon-test.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +pg_prove test.sql \ No newline at end of file diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.patch b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch new file mode 100644 index 0000000000..85b3565480 --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch @@ -0,0 +1,15 @@ +diff --git a/test.sql b/test.sql +index d7a0ca8..f15bc76 100644 +--- a/test.sql ++++ b/test.sql +@@ -9,9 +9,7 @@ + \set ON_ERROR_STOP true + \set QUIET 1 + +-CREATE EXTENSION pgcrypto; +-CREATE EXTENSION pgtap; +-CREATE EXTENSION pgjwt; ++CREATE EXTENSION IF NOT EXISTS pgtap; + + BEGIN; + SELECT plan(23); diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh new file mode 100755 index 0000000000..b7158d2340 --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 Date: Tue, 4 Feb 2025 15:01:57 +0100 Subject: [PATCH 40/77] storcon: only allow errrors of the server cert verification (#10644) This PR does a bunch of things: * only allow errors of the server cert verification, not of the TLS handshake. The TLS handshake doesn't cause any errors for us so we can just always require it to be valid. This simplifies the code a little. * As the solution is more permanent than originally anticipated, I think it makes sense to move the `AcceptAll` verifier outside. * log the connstr information. this helps with figuring out which domain names are configured in the connstr, etc. I think it is generally useful to print it. make extra sure that the password is not leaked. 
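As a side note on the connstr logging: it relies on the parsed config type's `Debug` output never containing the password, which the unit test added below asserts. A minimal sketch of the idea, assuming the connection string parses into `tokio_postgres::Config` (the function name here is illustrative):

```rust
// Sketch only: depends on the parsed Config's Debug impl redacting the
// password, which the accompanying unit test verifies.
fn log_connstr_info(connstr: &str) -> anyhow::Result<()> {
    let config: tokio_postgres::Config = connstr
        .parse()
        .map_err(|_| anyhow::anyhow!("couldn't parse connection string"))?;
    // Debug formatting prints hosts, port and user, but not the raw password.
    tracing::info!("database connection config: {config:?}");
    Ok(())
}
```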
Follow-up of #10640 --- storage_controller/src/persistence.rs | 138 ++++++++++++++------------ 1 file changed, 72 insertions(+), 66 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 45f3108d6b..c4e5b39589 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -27,7 +27,7 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use rustls::client::danger::ServerCertVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::client::WebPkiServerVerifier; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; @@ -194,6 +194,8 @@ impl Persistence { timeout: Duration, ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); + log_postgres_connstr_info(database_url) + .map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?; loop { match establish_connection_rustls(database_url).await { Ok(_) => { @@ -1281,6 +1283,51 @@ pub(crate) fn load_certs() -> anyhow::Result> { Ok(Arc::new(store)) } +#[derive(Debug)] +/// A verifier that accepts all certificates (but logs an error still) +struct AcceptAll(Arc); +impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = + self.0 + .verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls12_signature(message, cert, dss) + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls13_signature(message, cert, dss) + } + fn supported_verify_schemes(&self) -> Vec { + self.0.supported_verify_schemes() + } +} + /// Loads the root certificates and constructs a client config suitable for connecting. /// This function is blocking. fn client_config_with_root_certs() -> anyhow::Result { @@ -1290,76 +1337,12 @@ fn client_config_with_root_certs() -> anyhow::Result { .expect("ring should support the default protocol versions"); static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); let do_cert_checks = - DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok()); + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok()); Ok(if *do_cert_checks { client_config .with_root_certificates(load_certs()?) 
.with_no_client_auth() } else { - use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified}; - #[derive(Debug)] - struct AcceptAll(Arc); - impl ServerCertVerifier for AcceptAll { - fn verify_server_cert( - &self, - end_entity: &rustls::pki_types::CertificateDer<'_>, - intermediates: &[rustls::pki_types::CertificateDer<'_>], - server_name: &rustls::pki_types::ServerName<'_>, - ocsp_response: &[u8], - now: rustls::pki_types::UnixTime, - ) -> Result { - let r = self.0.verify_server_cert( - end_entity, - intermediates, - server_name, - ocsp_response, - now, - ); - if let Err(err) = r { - tracing::info!( - ?server_name, - "ignoring db connection TLS validation error: {err:?}" - ); - return Ok(ServerCertVerified::assertion()); - } - r - } - fn verify_tls12_signature( - &self, - message: &[u8], - cert: &rustls::pki_types::CertificateDer<'_>, - dss: &rustls::DigitallySignedStruct, - ) -> Result - { - let r = self.0.verify_tls12_signature(message, cert, dss); - if let Err(err) = r { - tracing::info!( - "ignoring db connection 1.2 signature TLS validation error: {err:?}" - ); - return Ok(HandshakeSignatureValid::assertion()); - } - r - } - fn verify_tls13_signature( - &self, - message: &[u8], - cert: &rustls::pki_types::CertificateDer<'_>, - dss: &rustls::DigitallySignedStruct, - ) -> Result - { - let r = self.0.verify_tls13_signature(message, cert, dss); - if let Err(err) = r { - tracing::info!( - "ignoring db connection 1.3 signature TLS validation error: {err:?}" - ); - return Ok(HandshakeSignatureValid::assertion()); - } - r - } - fn supported_verify_schemes(&self) -> Vec { - self.0.supported_verify_schemes() - } - } let verifier = AcceptAll( WebPkiServerVerifier::builder_with_provider( load_certs()?, @@ -1389,6 +1372,29 @@ fn establish_connection_rustls(config: &str) -> BoxFuture().unwrap(); + assert!(format!("{has_pw_cfg:?}").contains("specialuser")); + // Ensure that the password is not leaked by the debug impl + assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG")); +} + +fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> { + let config = config_str + .parse::() + .map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?; + // We use debug formatting here, and use a unit test to ensure that we don't leak the password. + // To make extra sure the test gets ran, run it every time the function is called + // (this is rather cold code, we can afford it). + #[cfg(not(test))] + test_config_debug_censors_password(); + tracing::info!("database connection config: {config:?}"); + Ok(()) +} + /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive( QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, From dcf335a25195bdd2d8fa8dd5080ca661ea4396e3 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 4 Feb 2025 15:50:53 +0100 Subject: [PATCH 41/77] proxy: Switch proxy to JSON logging (#9857) ## Problem We want to switch proxy and ideally all Rust services to structured JSON logging to support better filtering and cross-referencing with tracing. ## Summary of changes * Introduce a custom tracing-subscriber to write the JSON. In a first attempt a customized tracing::fmt::FmtSubscriber was used, but it's very inefficient and can still generate invalid JSON. It's also doesn't allow us to add important fields to the root object. * Make this opt in: the `LOGFMT` env var can be set to `"json"` to enable to new logger at startup. 
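The opt-in wiring boils down to installing exactly one formatting layer, selected from `LOGFMT`, with the inactive one collapsing to a no-op because `Option<L>` implements `Layer`. A minimal sketch of that selection, using tracing-subscriber's stock JSON formatter (behind its `json` feature) purely as a stand-in for the custom `JsonLoggingLayer` this patch adds:

```rust
use tracing_subscriber::prelude::*;

// Sketch of the env-var toggle only; the real JSON layer is hand-rolled
// because the fmt-based formatter is slow and can emit invalid JSON.
fn init_logging() -> anyhow::Result<()> {
    let use_json = matches!(std::env::var("LOGFMT").as_deref(), Ok("json"));

    // `Option<L>` implements `Layer`, so whichever variant is `None` is a no-op.
    let text_layer = (!use_json).then(|| {
        tracing_subscriber::fmt::layer()
            .with_ansi(false)
            .with_writer(std::io::stderr)
            .with_target(false)
    });
    let json_layer = use_json.then(|| {
        tracing_subscriber::fmt::layer()
            .json()
            .with_writer(std::io::stderr)
    });

    tracing_subscriber::registry()
        .with(text_layer)
        .with(json_layer)
        .try_init()?;
    Ok(())
}
```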
--- Cargo.lock | 55 +++ Cargo.toml | 3 + deny.toml | 1 + proxy/Cargo.toml | 8 + proxy/src/logging.rs | 904 +++++++++++++++++++++++++++++++++++++- workspace_hack/Cargo.toml | 1 + 6 files changed, 964 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0133c83564..de1b1218ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,6 +206,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -1010,6 +1020,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "boxcar" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" + [[package]] name = "bstr" version = "1.5.0" @@ -2433,6 +2449,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "gettid" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "gimli" version = "0.31.1" @@ -4212,6 +4238,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "papaya" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "parking" version = "2.1.1" @@ -4839,6 +4875,7 @@ dependencies = [ "ahash", "anyhow", "arc-swap", + "assert-json-diff", "async-compression", "async-trait", "atomic-take", @@ -4846,6 +4883,7 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", + "boxcar", "bstr", "bytes", "camino", @@ -4862,6 +4900,7 @@ dependencies = [ "flate2", "framed-websockets", "futures", + "gettid", "hashbrown 0.14.5", "hashlink", "hex", @@ -4884,7 +4923,9 @@ dependencies = [ "measured", "metrics", "once_cell", + "opentelemetry", "p256 0.13.2", + "papaya", "parking_lot 0.12.1", "parquet", "parquet_derive", @@ -4931,6 +4972,9 @@ dependencies = [ "tokio-tungstenite 0.21.0", "tokio-util", "tracing", + "tracing-log", + "tracing-opentelemetry", + "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", @@ -5884,6 +5928,16 @@ dependencies = [ "libc", ] +[[package]] +name = "seize" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "semver" version = "1.0.17" @@ -8145,6 +8199,7 @@ dependencies = [ "tower 0.4.13", "tracing", "tracing-core", + "tracing-log", "url", "zerocopy", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 267a91d773..76b54ae1d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" backtrace = "0.3.74" flate2 = "1.0.26" +assert-json-diff = "2" async-stream = "0.3" async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } @@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" +tracing-log = "0.2" 
tracing-opentelemetry = "0.28" +tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/deny.toml b/deny.toml index df00a34c60..b551405568 100644 --- a/deny.toml +++ b/deny.toml @@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] allow = [ + "0BSD", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 35574e945c..d7880ea7b9 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,6 +19,7 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true +boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -42,6 +43,7 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } +gettid = "0.1.3" indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true @@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true +opentelemetry = { workspace = true, features = ["trace"] } +papaya = "0.1.8" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -89,6 +93,9 @@ tokio = { workspace = true, features = ["signal"] } tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +tracing-log.workspace = true +tracing-serde.workspace = true +tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true url.workspace = true @@ -112,6 +119,7 @@ rsa = "0.9" workspace_hack.workspace = true [dev-dependencies] +assert-json-diff.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true flate2.workspace = true diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 41f10f052f..97c9f5a59c 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,10 +1,23 @@ -use tracing::Subscriber; +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; +use std::hash::BuildHasher; +use std::{env, io}; + +use chrono::{DateTime, Utc}; +use opentelemetry::trace::TraceContextExt; +use scopeguard::defer; +use serde::ser::{SerializeMap, Serializer}; +use tracing::span; +use tracing::subscriber::Interest; +use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; use tracing_subscriber::fmt::time::SystemTime; use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; -use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::registry::{LookupSpan, SpanRef}; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -15,6 +28,8 @@ use tracing_subscriber::registry::LookupSpan; /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
/// See pub async fn init() -> anyhow::Result { + let logfmt = LogFormat::from_env()?; + let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() @@ -29,17 +44,36 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let fmt_layer = tracing_subscriber::fmt::layer() - .with_ansi(false) - .with_writer(std::io::stderr) - .with_target(false); - let otlp_layer = tracing_utils::init_tracing("proxy").await; + let json_log_layer = if logfmt == LogFormat::Json { + Some(JsonLoggingLayer { + clock: RealClock, + skipped_field_indices: papaya::HashMap::default(), + writer: StderrWriter { + stderr: std::io::stderr(), + }, + }) + } else { + None + }; + + let text_log_layer = if logfmt == LogFormat::Text { + Some( + tracing_subscriber::fmt::layer() + .with_ansi(false) + .with_writer(std::io::stderr) + .with_target(false), + ) + } else { + None + }; + tracing_subscriber::registry() .with(env_filter) .with(otlp_layer) - .with(fmt_layer) + .with(json_log_layer) + .with(text_log_layer) .try_init()?; Ok(LoggingGuard) @@ -94,3 +128,857 @@ impl Drop for LoggingGuard { tracing_utils::shutdown_tracing(); } } + +// TODO: make JSON the default +#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] +enum LogFormat { + #[default] + Text = 1, + Json, +} + +impl LogFormat { + fn from_env() -> anyhow::Result { + let logfmt = env::var("LOGFMT"); + Ok(match logfmt.as_deref() { + Err(_) => LogFormat::default(), + Ok("text") => LogFormat::Text, + Ok("json") => LogFormat::Json, + Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"), + }) + } +} + +trait MakeWriter { + fn make_writer(&self) -> impl io::Write; +} + +struct StderrWriter { + stderr: io::Stderr, +} + +impl MakeWriter for StderrWriter { + #[inline] + fn make_writer(&self) -> impl io::Write { + self.stderr.lock() + } +} + +// TODO: move into separate module or even separate crate. +trait Clock { + fn now(&self) -> DateTime; +} + +struct RealClock; + +impl Clock for RealClock { + #[inline] + fn now(&self) -> DateTime { + Utc::now() + } +} + +/// Name of the field used by tracing crate to store the event message. +const MESSAGE_FIELD: &str = "message"; + +thread_local! { + /// Protects against deadlocks and double panics during log writing. + /// The current panic handler will use tracing to log panic information. + static REENTRANCY_GUARD: Cell = const { Cell::new(false) }; + /// Thread-local instance with per-thread buffer for log writing. + static EVENT_FORMATTER: RefCell = RefCell::new(EventFormatter::new()); + /// Cached OS thread ID. + static THREAD_ID: u64 = gettid::gettid(); +} + +/// Implements tracing layer to handle events specific to logging. +struct JsonLoggingLayer { + clock: C, + skipped_field_indices: papaya::HashMap, + writer: W, +} + +impl Layer for JsonLoggingLayer +where + S: Subscriber + for<'a> LookupSpan<'a>, +{ + fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { + use std::io::Write; + + // TODO: consider special tracing subscriber to grab timestamp very + // early, before OTel machinery, and add as event extension. 
+ let now = self.clock.now(); + + let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { + if entered.get() { + let mut formatter = EventFormatter::new(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + } else { + entered.set(true); + defer!(entered.set(false);); + + EVENT_FORMATTER.with_borrow_mut(move |formatter| { + formatter.reset(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + }) + } + }); + + // In case logging fails we generate a simpler JSON object. + if let Err(err) = res { + if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { + "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": format_args!("cannot log event: {err:?}"), + "fields": { + "event": format_args!("{event:?}"), + }, + })) { + line.push(b'\n'); + self.writer.make_writer().write_all(&line).ok(); + } + } + } + + /// Registers a SpanFields instance as span extension. + fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let fields = SpanFields::default(); + fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing + // event handling and a read or write guard is still held. This includes + // the OTel subscriber. + span.extensions_mut().insert(fields); + } + + fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let ext = span.extensions(); + if let Some(data) = ext.get::() { + data.record_fields(values); + } + } + + /// Called (lazily) whenever a new log call is executed. We quickly check + /// for duplicate field names and record duplicates as skippable. Last one + /// wins. + fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { + if !metadata.is_event() { + // Must not be never because we wouldn't get trace and span data. + return Interest::always(); + } + + let mut field_indices = SkippedFieldIndices::default(); + let mut seen_fields = HashMap::<&'static str, usize>::new(); + for field in metadata.fields() { + use std::collections::hash_map::Entry; + match seen_fields.entry(field.name()) { + Entry::Vacant(entry) => { + // field not seen yet + entry.insert(field.index()); + } + Entry::Occupied(mut entry) => { + // replace currently stored index + let old_index = entry.insert(field.index()); + // ... and append it to list of skippable indices + field_indices.push(old_index); + } + } + } + + if !field_indices.is_empty() { + self.skipped_field_indices + .pin() + .insert(metadata.callsite(), field_indices); + } + + Interest::always() + } +} + +/// Stores span field values recorded during the spans lifetime. +#[derive(Default)] +struct SpanFields { + // TODO: Switch to custom enum with lasso::Spur for Strings? + fields: papaya::HashMap<&'static str, serde_json::Value>, +} + +impl SpanFields { + #[inline] + fn record_fields(&self, fields: R) { + fields.record(&mut SpanFieldsRecorder { + fields: self.fields.pin(), + }); + } +} + +/// Implements a tracing field visitor to convert and store values. 
+struct SpanFieldsRecorder<'m, S, G> { + fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>, +} + +impl tracing::field::Visit for SpanFieldsRecorder<'_, S, G> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if let Ok(value) = i64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if let Ok(value) = u64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value:?}"))); + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } +} + +/// List of field indices skipped during logging. Can list duplicate fields or +/// metafields not meant to be logged. +#[derive(Clone, Default)] +struct SkippedFieldIndices { + bits: u64, +} + +impl SkippedFieldIndices { + #[inline] + fn is_empty(&self) -> bool { + self.bits == 0 + } + + #[inline] + fn push(&mut self, index: usize) { + self.bits |= 1u64 + .checked_shl(index as u32) + .expect("field index too large"); + } + + #[inline] + fn contains(&self, index: usize) -> bool { + self.bits + & 1u64 + .checked_shl(index as u32) + .expect("field index too large") + != 0 + } +} + +/// Formats a tracing event and writes JSON to its internal buffer including a newline. 
+// TODO: buffer capacity management, truncate if too large +struct EventFormatter { + logline_buffer: Vec, +} + +impl EventFormatter { + #[inline] + fn new() -> Self { + EventFormatter { + logline_buffer: Vec::new(), + } + } + + #[inline] + fn buffer(&self) -> &[u8] { + &self.logline_buffer + } + + #[inline] + fn reset(&mut self) { + self.logline_buffer.clear(); + } + + fn format( + &mut self, + now: DateTime, + event: &Event<'_>, + ctx: &Context<'_, S>, + skipped_field_indices: &papaya::HashMap, + ) -> io::Result<()> + where + S: Subscriber + for<'a> LookupSpan<'a>, + { + let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true); + + use tracing_log::NormalizeEvent; + let normalized_meta = event.normalized_metadata(); + let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); + + let skipped_field_indices = skipped_field_indices.pin(); + let skipped_field_indices = skipped_field_indices.get(&meta.callsite()); + + let mut serialize = || { + let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer); + + let mut serializer = serializer.serialize_map(None)?; + + // Timestamp comes first, so raw lines can be sorted by timestamp. + serializer.serialize_entry("timestamp", ×tamp)?; + + // Level next. + serializer.serialize_entry("level", &meta.level().as_str())?; + + // Message next. + serializer.serialize_key("message")?; + let mut message_extractor = + MessageFieldExtractor::new(serializer, skipped_field_indices); + event.record(&mut message_extractor); + let mut serializer = message_extractor.into_serializer()?; + + let mut fields_present = FieldsPresent(false, skipped_field_indices); + event.record(&mut fields_present); + if fields_present.0 { + serializer.serialize_entry( + "fields", + &SerializableEventFields(event, skipped_field_indices), + )?; + } + + let pid = std::process::id(); + if pid != 1 { + serializer.serialize_entry("process_id", &pid)?; + } + + THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?; + + // TODO: tls cache? name could change + if let Some(thread_name) = std::thread::current().name() { + if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { + serializer.serialize_entry("thread_name", thread_name)?; + } + } + + if let Some(task_id) = tokio::task::try_id() { + serializer.serialize_entry("task_id", &format_args!("{task_id}"))?; + } + + serializer.serialize_entry("target", meta.target())?; + + if let Some(module) = meta.module_path() { + if module != meta.target() { + serializer.serialize_entry("module", module)?; + } + } + + if let Some(file) = meta.file() { + if let Some(line) = meta.line() { + serializer.serialize_entry("src", &format_args!("{file}:{line}"))?; + } else { + serializer.serialize_entry("src", file)?; + } + } + + { + let otel_context = Span::current().context(); + let otel_spanref = otel_context.span(); + let span_context = otel_spanref.span_context(); + if span_context.is_valid() { + serializer.serialize_entry( + "trace_id", + &format_args!("{}", span_context.trace_id()), + )?; + } + } + + serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + + serializer.end() + }; + + serialize().map_err(io::Error::other)?; + self.logline_buffer.push(b'\n'); + Ok(()) + } +} + +/// Extracts the message field that's mixed will other fields. 
+struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Option>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: None, + } + } + + #[inline] + fn into_serializer(mut self) -> Result { + match self.state { + Some(Ok(())) => {} + Some(Err(err)) => return Err(err), + None => self.serializer.serialize_value("")?, + } + Ok(self.serializer) + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_none() + && field.name() == MESSAGE_FIELD + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } +} + +impl tracing::field::Visit for MessageFieldExtractor<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}"))); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}"))); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value}"))); + } + } +} + +/// Checks if there's any fields and field values present. If not, the JSON subobject +/// can be skipped. +// This is entirely optional and only cosmetic, though maybe helps a +// bit during log parsing in dashboards when there's no field with empty object. +struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>); + +// Even though some methods have an overhead (error, bytes) it is assumed the +// compiler won't include this since we ignore the value entirely. 
+impl tracing::field::Visit for FieldsPresent<'_> { + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) { + if !self.1.is_some_and(|i| i.contains(field.index())) + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + { + self.0 |= true; + } + } +} + +/// Serializes the fields directly supplied with a log event. +struct SerializableEventFields<'a, 'event>( + &'a tracing::Event<'event>, + Option<&'a SkippedFieldIndices>, +); + +impl serde::ser::Serialize for SerializableEventFields<'_, '_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + use serde::ser::SerializeMap; + let serializer = serializer.serialize_map(None)?; + let mut message_skipper = MessageFieldSkipper::new(serializer, self.1); + self.0.record(&mut message_skipper); + let serializer = message_skipper.into_serializer()?; + serializer.end() + } +} + +/// A tracing field visitor that skips the message field. +struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Result<(), S::Error>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: Ok(()), + } + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_ok() + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } + + #[inline] + fn into_serializer(self) -> Result { + self.state?; + Ok(self.serializer) + } +} + +impl tracing::field::Visit for MessageFieldSkipper<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:x?}")); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { 
+ self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:?}")); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = self.serializer.serialize_value(&format_args!("{value}")); + } + } +} + +/// Serializes the span stack from root to leaf (parent of event) enumerated +/// inside an object where the keys are just the number padded with zeroes +/// to retain sorting order. +// The object is necessary because Loki cannot flatten arrays. +struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + if let Some(leaf_span) = self.0.lookup_current() { + for (i, span) in leaf_span.scope().from_root().enumerate() { + serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + } + } + + serializer.end() + } +} + +/// Serializes a single span. Include the span ID, name and its fields as +/// recorded up to this point. +struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) +where + Span: for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + // TODO: the span ID is probably only useful for debugging tracing. 
+ serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; + serializer.serialize_entry("span_name", self.0.metadata().name())?; + + let ext = self.0.extensions(); + if let Some(data) = ext.get::() { + for (key, value) in &data.fields.pin() { + serializer.serialize_entry(key, value)?; + } + } + + serializer.end() + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use std::sync::{Arc, Mutex, MutexGuard}; + + use assert_json_diff::assert_json_eq; + use tracing::info_span; + + use super::*; + + struct TestClock { + current_time: Mutex>, + } + + impl Clock for Arc { + fn now(&self) -> DateTime { + *self.current_time.lock().expect("poisoned") + } + } + + struct VecWriter<'a> { + buffer: MutexGuard<'a, Vec>, + } + + impl MakeWriter for Arc>> { + fn make_writer(&self) -> impl io::Write { + VecWriter { + buffer: self.lock().expect("poisoned"), + } + } + } + + impl io::Write for VecWriter<'_> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.buffer.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + } + + #[test] + fn test_field_collection() { + let clock = Arc::new(TestClock { + current_time: Mutex::new(Utc::now()), + }); + let buffer = Arc::new(Mutex::new(Vec::new())); + let log_layer = JsonLoggingLayer { + clock: clock.clone(), + skipped_field_indices: papaya::HashMap::default(), + writer: buffer.clone(), + }; + + let registry = tracing_subscriber::Registry::default().with(log_layer); + + tracing::subscriber::with_default(registry, || { + info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { + info_span!("span2").in_scope(|| { + tracing::error!( + a = 1, + a = 2, + a = 3, + message = "explicit message field", + "implicit message field" + ); + }); + }); + }); + + let buffer = Arc::try_unwrap(buffer) + .expect("no other reference") + .into_inner() + .expect("poisoned"); + let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON"); + let expected: serde_json::Value = serde_json::json!( + { + "timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": "explicit message field", + "fields": { + "a": 3, + }, + "spans": { + "00":{ + "span_id": "0000000000000001", + "span_name": "span1", + "x": 42, + }, + "01": { + "span_id": "0000000000000002", + "span_name": "span2", + } + }, + "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), + "target": "proxy::logging::tests", + "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), + "thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(), + "thread_name": "logging::tests::test_field_collection", + } + ); + + assert_json_eq!(actual, expected); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a3dffa8f19..2c65401154 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -92,6 +92,7 @@ tonic = { version = "0.12", default-features = false, features = ["codegen", "pr tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } From 06090bbccdf0d2b89f6a355afd42b352fadd40d6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 4 Feb 2025 
15:55:11 +0100 Subject: [PATCH 42/77] pageserver: log critical error on `ClearVmBits` for unknown pages (#10634) ## Problem In #9895, we fixed some issues where `ClearVmBits` were broadcast to all shards, even those not owning the VM relation. As part of that, we found some ancient code from #1417, which discarded spurious incorrect `ClearVmBits` records for pages outside of the VM relation. We added observability in #9911 to see how often this actually happens in the wild. After two months, we have not seen this happen once in production or staging. However, out of caution, we don't want a hard error and break WAL ingestion. Resolves #10067. ## Summary of changes Log a critical error when ingesting `ClearVmBits` for unknown VM relations or pages. --- pageserver/src/metrics.rs | 7 --- pageserver/src/walingest.rs | 117 +++++++++++++++--------------------- 2 files changed, 49 insertions(+), 75 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f9edf88553..48aed70826 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2379,7 +2379,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, - pub(crate) clear_vm_bits_unknown: IntCounterVec, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { @@ -2414,12 +2413,6 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Total number of zero gap blocks written on relation extends" ) .expect("failed to define a metric"), - clear_vm_bits_unknown: register_int_counter_vec!( - "pageserver_wal_ingest_clear_vm_bits_unknown", - "Number of ignored ClearVmBits operations due to unknown pages/relations", - &["entity"], - ) - .expect("failed to define a metric"), } }); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e0283d99e0..04edb3e3f4 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -28,17 +28,9 @@ use std::time::Duration; use std::time::Instant; use std::time::SystemTime; -use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::walrecord::*; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; -use wal_decoder::models::*; - use anyhow::{bail, Result}; use bytes::{Buf, Bytes}; use tracing::*; -use utils::failpoint_support; -use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -50,11 +42,18 @@ use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::fsm_logical_to_physical; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::walrecord::*; use postgres_ffi::TransactionId; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +use utils::rate_limit::RateLimit; +use utils::{critical, failpoint_support}; +use wal_decoder::models::*; enum_pgversion! 
{CheckPoint, pgv::CheckPoint} @@ -327,93 +326,75 @@ impl WalIngest { let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - // Sometimes, Postgres seems to create heap WAL records with the - // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is - // not set. In fact, it's possible that the VM page does not exist at all. - // In that case, we don't want to store a record to clear the VM bit; - // replaying it would fail to find the previous image of the page, because - // it doesn't exist. So check if the VM page(s) exist, and skip the WAL - // record if it doesn't. - // - // TODO: analyze the metrics and tighten this up accordingly. This logic - // implicitly assumes that VM pages see explicit WAL writes before - // implicit ClearVmBits, and will otherwise silently drop updates. + // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within + // its view of the VM relation size. Out of caution, error instead of failing WAL ingestion, + // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See: + // https://github.com/neondatabase/neon/pull/10634. let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["relation"]) - .inc(); + critical!("clear_vm_bits for unknown VM relation {vm_rel}"); return Ok(()); }; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["new_page"]) - .inc(); + critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); new_vm_blk = None; } } if let Some(blknum) = old_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["old_page"]) - .inc(); + critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); old_vm_blk = None; } } - if new_vm_blk.is_some() || old_vm_blk.is_some() { - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. + if new_vm_blk.is_none() && old_vm_blk.is_none() { + return Ok(()); + } else if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the new page, both of + // which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on different VM pages. + if let Some(new_vm_blk) = new_vm_blk { self.put_rel_wal_record( modification, vm_rel, - new_vm_blk.unwrap(), + new_vm_blk, NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, + old_heap_blkno: None, + flags, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, old_heap_blkno, flags, }, ctx, ) .await?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. 
- if let Some(new_vm_blk) = new_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags, - }, - ctx, - ) - .await?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - old_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } } } - Ok(()) } From cab60b6d9f97983ac8cbd904982cecd67edd2d92 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:11:31 -0500 Subject: [PATCH 43/77] fix(pagesever): stablize gc-compaction tests (#10621) ## Problem Hopefully this can resolve https://github.com/neondatabase/neon/issues/10517. The reason why the test is flaky is that after restart the compute node might write some data so that the pageserver flush some layers, and in the end, causing L0 compaction to run, and we cannot get the test scenario as we want. ## Summary of changes Ensure all L0 layers are compacted before starting the test. Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 763a63c2e5..c031d66dfb 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -250,6 +250,9 @@ def test_pageserver_gc_compaction_idempotent( workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 + ps_http.timeline_compact( + tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True + ) for _ in range(n_compactions): # Force refresh gc info to have gc_cutoff generated ps_http.timeline_gc(tenant_id, timeline_id, None) From f9009d6b80f5351454eabdc3df5cbd137f95d5e2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 4 Feb 2025 17:52:54 +0000 Subject: [PATCH 44/77] pageserver: write heatmap to disk after uploading it (#10650) ## Problem We wish to make heatmap generation additive in https://github.com/neondatabase/neon/pull/10597. However, if the pageserver restarts and has a heatmap on disk from when it was a secondary long ago, we can end up keeping extra layers on the secondary's disk. ## Summary of changes Persist the heatmap after a successful upload. 
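The on-disk write follows the usual temp-file-then-rename pattern (the patch itself goes through the pageserver's async `VirtualFile::crashsafe_overwrite` helper; the blocking sketch below, with illustrative names and suffix, only shows the shape of that pattern):

```rust
use std::fs;
use std::io::Write;
use std::path::Path;

// Sketch only; the real helper is async. A fully crash-safe variant would
// also fsync the parent directory after the rename.
fn persist_heatmap(final_path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    // Write the fresh heatmap to a temporary file in the same directory,
    // so the final rename stays within one filesystem.
    let temp_path = final_path.with_extension("json.tmp");
    let mut file = fs::File::create(&temp_path)?;
    file.write_all(bytes)?;
    // Flush the bytes to disk before publishing the file...
    file.sync_all()?;
    // ...then atomically move it into place, so a crash mid-write never
    // leaves a truncated heatmap behind for the next restart to read.
    fs::rename(&temp_path, final_path)?;
    Ok(())
}
```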
--- .../src/tenant/secondary/heatmap_uploader.rs | 22 ++++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 5 +++++ .../regress/test_pageserver_secondary.py | 10 ++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index c5e5e04945..d72c337369 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,13 +9,14 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, - mgr::GetTenantError, - mgr::TenantManager, + mgr::{GetTenantError, TenantManager}, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, tasks::{warn_when_period_overrun, BackgroundLoopKind}, Tenant, }, + virtual_file::VirtualFile, + TEMP_FILE_SUFFIX, }; use futures::Future; @@ -32,7 +33,10 @@ use super::{ }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; -use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, + yielding_loop::yielding_loop, +}; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, @@ -461,6 +465,18 @@ async fn upload_tenant_heatmap( } } + // After a successful upload persist the fresh heatmap to disk. + // When restarting, the tenant will read the heatmap from disk + // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]). + // If the heatmap is stale, the additive generation can lead to keeping previously + // evicted timelines on the secondarie's disk. + let tenant_shard_id = tenant.get_tenant_shard_id(); + let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id); + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await { + tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}"); + } + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8909f7f249..7c4991ffab 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2766,6 +2766,11 @@ class NeonPageserver(PgProtocol, LogUtils): log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") raise + def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any: + path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json" + with open(path) as f: + return json.load(f) + def tenant_create( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 1292682f9e..590093d23c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): workload.write_rows(256, env.pageservers[0].id) env.pageserver.http_client().tenant_heatmap_upload(tenant_id) - def validate_heatmap(heatmap): + def validate_heatmap(heatmap, on_disk_heatmap): assert len(heatmap["timelines"]) == 1 assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) assert len(heatmap["timelines"][0]["layers"]) > 0 @@ -452,10 +452,13 @@ def 
test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Each layer appears at most once assert len(set(layer["name"] for layer in layers)) == len(layers) + assert heatmap == on_disk_heatmap + # Download and inspect the heatmap that the pageserver uploaded heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_first}") - validate_heatmap(heatmap_first) + validate_heatmap(heatmap_first, heatmap_first_on_disk) # Do some more I/O to generate more layers workload.churn_rows(64, env.pageservers[0].id) @@ -463,9 +466,10 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Ensure that another heatmap upload includes the new layers heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first - validate_heatmap(heatmap_second) + validate_heatmap(heatmap_second, heatmap_second_on_disk) def list_elegible_layers( From 472007dd7ce5c5b861b29870a852ad7c99e1ea01 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 4 Feb 2025 19:58:02 +0100 Subject: [PATCH 45/77] ci: unify Dockerfiles, set bash as SHELL for debian layers, make cpan step as separate RUN (#10645) ## Problem Ref: https://github.com/neondatabase/cloud/issues/23461 and follow-up after: https://github.com/neondatabase/neon/pull/10553 we used `echo` to set-up `.wgetrc` and `.curlrc`, and there we used `\n` to make these multiline configs with one echo command. The problem is that Debian `/bin/sh`'s built-in echo command behaves differently from the `/bin/echo` executable and from the `echo` built-in in `bash`. Namely, it does not support the`-e` option, and while it does treat `\n` as a newline, passing `-e` here will add that `-e` to the output. At the same time, when we use different base images, for example `alpine/curl`, their `/bin/sh` supports and requires `-e` for treating escape sequences like `\n`. But having different `echo` and remembering difference in their behaviour isn't best experience for the developer and makes bad experience maintaining Dockerfiles. Work-arounds: - Explicitly use `/bin/bash` (like in this PR) - Use `/bin/echo` instead of the shell's built-in echo function - Use printf "foo\n" instead of echo -e "foo\n" ## Summary of changes 1. To fix that, we process with the option setting `/bin/bash` as a SHELL for the debian-baysed layers 2. With no changes for `alpine/curl` based layers. 3. And one more change here: in `extensions` layer split to the 2 steps: installing dependencies from `CPAN` and installing `lcov` from github, so upgrading `lcov` could reuse previous layer with installed cpan modules. --- build-tools.Dockerfile | 22 +++++++++++++++++----- compute/compute-node.Dockerfile | 10 +++++++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f744b44808..3ade57b175 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -3,8 +3,13 @@ ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ @@ -55,7 +60,8 @@ ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home -SHELL ["/bin/bash", "-c"] +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ @@ -66,7 +72,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc # System deps @@ -190,8 +196,14 @@ RUN set -e \ # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) # And patches from us: # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) -RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ - && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ +RUN set +o pipefail && \ + for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \ + yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\ + done && \ + set -o pipefail +# Split into separate step to debug flaky failures here +RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \ && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ && cd lcov \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index ea29630001..9379856eab 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -96,8 +96,10 @@ ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ @@ -1068,6 +1070,7 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +# See comment on the top of the file regading `echo` and `\n` RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ @@ -1584,6 +1587,7 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py +# See comment on the top of the file regading `echo`, `-e` and `\n` RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ @@ -1700,6 +1704,10 @@ ENV PGDATABASE=postgres ######################################################################################### FROM debian:$DEBIAN_FLAVOR ARG DEBIAN_VERSION + +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ From 47975d06d98bde792b61cf1d1d397567f16b0b49 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 5 Feb 2025 12:41:09 +0000 Subject: [PATCH 46/77] storcon: silence cplane 404s on tenant creation (#10665) ## Problem We get WARN log noise on tenant creations. Cplane creates tenants via /location_config. That returns the attached locations in the response and spawns a reconciliation which will also attempt to notify cplane. If the notification is attempted before cplane persists the shards to its database, storcon gets back a 404. The situation is harmless, but annoying. ## Summary of Changes * Add a tenant creation hint to the reconciler config * If the hint is true and we get back a 404 on the notification from cplane, ignore the error, but still queue the reconcile up for a retry. 
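As an illustration of the intended behaviour, here is a minimal, self-contained sketch of the decision this change introduces. The types are simplified stand-ins, not the storage controller's real ones: `NotifyError` is reduced to the two variants involved and the status code is a bare `u16`; the actual change is in the `reconciler.rs` diff below.

```rust
/// Stand-in for the storage controller's notify error type: only the variants
/// relevant to this change are modeled, and the status code is a plain u16.
enum NotifyError {
    Unexpected(u16),
    ShuttingDown,
}

/// Decide whether a failed compute notification deserves a WARN log.
/// The caller still re-queues the reconcile in every case (compute_notify_failure).
fn warn_on_notify_error(e: &NotifyError, tenant_creation_hint: bool) -> bool {
    match e {
        // Right after tenant creation, cplane may not have persisted the shards
        // yet, so a 404 is expected and should not produce log noise.
        NotifyError::Unexpected(404) if tenant_creation_hint => false,
        // Shutting down is never worth a warning either.
        NotifyError::ShuttingDown => false,
        _ => true,
    }
}
```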
Closes https://github.com/neondatabase/cloud/issues/20732 --- storage_controller/src/compute_hook.rs | 2 +- storage_controller/src/reconciler.rs | 50 ++++++++++++++++++++++---- storage_controller/src/service.rs | 7 +++- 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 3884a6df46..5bc3c81f02 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -225,7 +225,7 @@ pub(crate) enum NotifyError { // We shutdown while sending #[error("Shutting down")] ShuttingDown, - // A response indicates we will never succeed, such as 400 or 404 + // A response indicates we will never succeed, such as 400 or 403 #[error("Non-retryable error {0}")] Fatal(StatusCode), diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 03db947263..58bc0ba1cd 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder { } } + pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self { + Self { + config: ReconcilerConfig { + tenant_creation_hint: hint, + ..self.config + }, + } + } + pub(crate) fn build(self) -> ReconcilerConfig { self.config } @@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig { // During live migrations this is the amount of time that // the pagserver will hold our poll. secondary_download_request_timeout: Option, + + // A hint indicating whether this reconciliation is done on the + // creation of a new tenant. This only informs logging behaviour. + tenant_creation_hint: bool, } impl ReconcilerConfig { @@ -143,6 +156,10 @@ impl ReconcilerConfig { self.secondary_download_request_timeout .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) } + + pub(crate) fn tenant_creation_hint(&self) -> bool { + self.tenant_creation_hint + } } /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O @@ -934,16 +951,35 @@ impl Reconciler { ) .await; if let Err(e) = &result { - // It is up to the caller whether they want to drop out on this error, but they don't have to: - // in general we should avoid letting unavailability of the cloud control plane stop us from - // making progress. - if !matches!(e, NotifyError::ShuttingDown) { - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); - } - // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. self.compute_notify_failure = true; + + // It is up to the caller whether they want to drop out on this error, but they don't have to: + // in general we should avoid letting unavailability of the cloud control plane stop us from + // making progress. + match e { + // 404s from cplane during tenant creation are expected. + // Cplane only persists the shards to the database after + // creating the tenant and the timeline. If we notify before + // that, we'll get a 404. + // + // This is fine because tenant creations happen via /location_config + // and that returns the list of locations in the response. Hence, we + // silence the error and return Ok(()) here. Reconciliation will still + // be retried because we set [`Reconciler::compute_notify_failure`] above. 
+ NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND) + if self.reconciler_config.tenant_creation_hint() => + { + return Ok(()); + } + NotifyError::ShuttingDown => {} + _ => { + tracing::warn!( + "Failed to notify compute of attached pageserver {node}: {e}" + ); + } + } } result } else { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9ac9ee17ca..4028cd7023 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2238,9 +2238,14 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let config = ReconcilerConfigBuilder::new() + .tenant_creation_hint(true) + .build(); tenants .range_mut(TenantShardId::tenant_range(tenant_id)) - .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) + .filter_map(|(_shard_id, shard)| { + self.maybe_configured_reconcile_shard(shard, nodes, config) + }) .collect::>() }; From f07119cca798e24745b4eda21bde8c2376a22952 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 5 Feb 2025 15:33:04 +0100 Subject: [PATCH 47/77] pageserver: add `pageserver_wal_ingest_values_committed` metric (#10653) ## Problem We don't have visibility into the ratio of image vs. delta pages ingested in Pageservers. This might be useful to determine whether we should compress WAL records before storing them, which in turn might make compaction more efficient. ## Summary of changes Add `pageserver_wal_ingest_values_committed` metric with dimensions `class=metadata|data` and `kind=image|delta`. --- pageserver/src/metrics.rs | 35 +++++++++++++++++++ pageserver/src/pgdatadir_mapping.rs | 31 +++++++++++++++- .../walreceiver/walreceiver_connection.rs | 25 +++++++++---- 3 files changed, 84 insertions(+), 7 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 48aed70826..6ab1178a7b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -32,6 +32,7 @@ use utils::id::TimelineId; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; +use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; @@ -2378,10 +2379,40 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, + pub(crate) values_committed_metadata_images: IntCounter, + pub(crate) values_committed_metadata_deltas: IntCounter, + pub(crate) values_committed_data_images: IntCounter, + pub(crate) values_committed_data_deltas: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, } +impl WalIngestMetrics { + pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) { + if stats.metadata_images > 0 { + self.values_committed_metadata_images + .inc_by(stats.metadata_images); + } + if stats.metadata_deltas > 0 { + self.values_committed_metadata_deltas + .inc_by(stats.metadata_deltas); + } + if stats.data_images > 0 { + self.values_committed_data_images.inc_by(stats.data_images); + } + if stats.data_deltas > 0 { + self.values_committed_data_deltas.inc_by(stats.data_deltas); + } + } +} + pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { + let values_committed = register_int_counter_vec!( + "pageserver_wal_ingest_values_committed", + "Number of values committed to pageserver storage from WAL records", + &["class", "kind"], + ) + .expect("failed to 
define a metric"); + WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", @@ -2408,6 +2439,10 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), + values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), + values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), + values_committed_data_images: values_committed.with_label_values(&["data", "image"]), + values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]), gap_blocks_zeroed_on_rel_extend: register_int_counter!( "pageserver_gap_blocks_zeroed_on_rel_extend", "Total number of zero gap blocks written on relation extends" diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 40c657524d..dcbf62b56c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -48,7 +48,7 @@ use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::SerializedValueBatch; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -1297,6 +1297,26 @@ impl DatadirModification<'_> { .is_some_and(|b| b.has_data()) } + /// Returns statistics about the currently pending modifications. + pub(crate) fn stats(&self) -> DatadirModificationStats { + let mut stats = DatadirModificationStats::default(); + for (_, _, value) in self.pending_metadata_pages.values().flatten() { + match value { + Value::Image(_) => stats.metadata_images += 1, + Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1, + Value::WalRecord(_) => stats.metadata_deltas += 1, + } + } + for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) { + match valuemeta { + ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1, + ValueMeta::Serialized(_) => stats.data_deltas += 1, + ValueMeta::Observed(_) => {} + } + } + stats + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -2317,6 +2337,15 @@ impl DatadirModification<'_> { } } +/// Statistics for a DatadirModification. +#[derive(Default)] +pub struct DatadirModificationStats { + pub metadata_images: u64, + pub metadata_deltas: u64, + pub data_images: u64, + pub data_deltas: u64, +} + /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. /// diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d69e7dbd32..de917377cb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection( // advances it to its end LSN. 0 is just an initialization placeholder. 
let mut modification = timeline.begin_modification(Lsn(0)); + async fn commit( + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + uncommitted: &mut u64, + ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; + WAL_INGEST.records_committed.inc_by(*uncommitted); + WAL_INGEST.inc_values_committed(&stats); + *uncommitted = 0; + Ok(()) + } + if !records.is_empty() { timeline .metrics @@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection( if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } let local_next_record_lsn = interpreted.next_record_lsn; @@ -396,8 +408,7 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } } @@ -415,7 +426,7 @@ pub(super) async fn handle_walreceiver_connection( if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records - modification.commit(&ctx).await?; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } if !caught_up && streaming_lsn >= end_of_wal { @@ -442,10 +453,12 @@ pub(super) async fn handle_walreceiver_connection( filtered: &mut u64, ctx: &RequestContext, ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; WAL_INGEST .records_committed .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; + WAL_INGEST.inc_values_committed(&stats); *uncommitted = 0; *filtered = 0; Ok(()) From ebc55e6ae87723a95303e62e9e7b16dae218676c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 5 Feb 2025 08:58:33 -0600 Subject: [PATCH 48/77] Fix logic for checking if a compute can install a remote extension (#10656) Given a remote extensions manifest of the following: ```json { "public_extensions": [], "custom_extensions": null, "library_index": { "pg_search": "pg_search" }, "extension_data": { "pg_search": { "control_data": { "pg_search.control": "comment = 'pg_search: Full text search for PostgreSQL using BM25'\ndefault_version = '0.14.1'\nmodule_pathname = '$libdir/pg_search'\nrelocatable = false\nsuperuser = true\nschema = paradedb\ntrusted = true\n" }, "archive_path": "13117844657/v14/extensions/pg_search.tar.zst" } } } ``` We were allowing a compute to install a remote extension that wasn't listed in either public_extensions or custom_extensions. Signed-off-by: Tristan Partin --- libs/compute_api/src/spec.rs | 108 ++++++++++++++++-- .../regress/test_download_extensions.py | 2 + 2 files changed, 102 insertions(+), 8 deletions(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index b3f18dc6da..2fc95c47c6 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -204,14 +204,16 @@ impl RemoteExtSpec { // Check if extension is present in public or custom. // If not, then it is not allowed to be used by this compute. 
- if let Some(public_extensions) = &self.public_extensions { - if !public_extensions.contains(&real_ext_name.to_string()) { - if let Some(custom_extensions) = &self.custom_extensions { - if !custom_extensions.contains(&real_ext_name.to_string()) { - return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); - } - } - } + if !self + .public_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + && !self + .custom_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + { + return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } match self.extension_data.get(real_ext_name) { @@ -340,6 +342,96 @@ mod tests { use super::*; use std::fs::File; + #[test] + fn allow_installing_remote_extensions() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": null, + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": ["ext"], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d7e6e9de56..7f12c14073 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -95,6 +95,8 @@ def test_remote_extensions( # mock remote_extensions spec spec: dict[str, Any] = { + "public_extensions": ["anon"], + "custom_extensions": None, "library_index": { "anon": "anon", }, From 14e05276a3e0882777c4ee38252bed25324c93e8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 5 Feb 2025 16:05:12 +0000 Subject: [PATCH 49/77] storcon: fix a case where optimise could get stuck on unschedulable node (#10648) ## Problem When a shard has two secondary locations, but one of them is on a node with MaySchedule::No, the optimiser would get stuck, because it couldn't decide which secondary to remove. 
This is generally okay if a node is offline, but if a node is in Pause mode for a long period of time, it's a problem. Closes: https://github.com/neondatabase/neon/issues/10646 ## Summary of changes - Instead of insisting on finding a node in the wrong AZ to remove, find an available node in the _right_ AZ, and remove all the others. This ensures that if there is one live suitable node, then other offline/paused nodes cannot hold things up. --- storage_controller/src/tenant_shard.rs | 150 +++++++++++++++++++++++-- 1 file changed, 141 insertions(+), 9 deletions(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 302104dc97..219c0dffe7 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -707,6 +707,7 @@ impl TenantShard { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node self.intent.demote_attached(scheduler, *node_id); + modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node @@ -979,24 +980,51 @@ impl TenantShard { ), ) }) - .collect::>(); + .collect::>(); if secondary_scores.iter().any(|score| score.1.is_none()) { - // Don't have full list of scores, so can't make a good decision about which to drop unless - // there is an obvious one in the wrong AZ - for secondary in self.intent.get_secondary() { - if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + // Trivial case: if we only have one secondary, drop that one + if self.intent.get_secondary().len() == 1 { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary( + *self.intent.get_secondary().first().unwrap(), + ), + }); + } + + // Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state + // where its score can't be calculated), and drop the others. This enables us to make progress in + // most cases, even if some nodes are offline or have scheduling=pause set. + + debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this + // logic presumes we are in a mode where we want secondaries to be in non-home AZ + if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { + let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; + let is_available = secondary_scores + .get(n) + .expect("Built from same list of nodes") + .is_some(); + is_available && !in_home_az + }) { + // Great, we found one to retain. Pick some other to drop. + if let Some(victim) = self + .intent + .get_secondary() + .iter() + .find(|n| n != &retain_secondary) + { return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } // Fall through: we didn't identify one to remove. This ought to be rare. 
tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", - self.intent.get_secondary() - ); + self.intent.get_secondary() + ); } else { let victim = secondary_scores .iter() @@ -1005,7 +1033,7 @@ impl TenantShard { .0; return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(victim), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } @@ -2379,6 +2407,110 @@ pub(crate) mod tests { Ok(()) } + /// Test how the optimisation code behaves with an extra secondary + #[test] + fn optimize_removes_secondary() -> anyhow::Result<()> { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let mut nodes = make_test_nodes( + 4, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut schedule_context = ScheduleContext::default(); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(az_a_tag.clone()); + shard_a + .schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + + // Attached on node 1, secondary on node 2 + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]); + + // Initially optimiser is idle + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + // A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over + // to our new location + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary in the non-home AZ, and one of them is offline + shard_a.intent.push_secondary(&mut scheduler, NodeId(4)); + nodes + .get_mut(&NodeId(4)) + .unwrap() + .set_availability(NodeAvailability::Offline); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary when should have none + shard_a.policy = PlacementPolicy::Attached(0); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![]); + + // Check that in secondary mode, we preserve the secondary in the preferred AZ + let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule() + shard_a.policy = PlacementPolicy::Secondary; + shard_a + 
.schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + assert_eq!(shard_a.intent.get_attached(), &None); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + shard_a.intent.clear(&mut scheduler); + + Ok(()) + } + // Optimize til quiescent: this emulates what Service::optimize_all does, when // called repeatedly in the background. // Returns the applied optimizations From fba22a7123a444970dd053c09e942a9a2e237a10 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 5 Feb 2025 20:00:26 +0300 Subject: [PATCH 50/77] Record more timings in test_layer_map (#10670) ## Problem It it is not very clear how much time take different operations. ## Summary of changes Record more timings. ref https://github.com/neondatabase/neon/issues/10409 --- test_runner/performance/test_layer_map.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index efc7fa59db..6c00944005 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -34,16 +34,20 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): - cur.execute(f"insert into t values (generate_series(1,{n_records}))") + with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") time.sleep(1) - cur.execute("vacuum t") + with zenbenchmark.record_duration("vacuum t"): + cur.execute("vacuum t") - with zenbenchmark.record_duration("test_query"): + with zenbenchmark.record_duration("SELECT count(*) from t"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) - flush_ep_to_pageserver(env, endpoint, tenant, timeline) - env.pageserver.http_client().timeline_checkpoint( - tenant, timeline, compact=False, wait_until_uploaded=True - ) + with zenbenchmark.record_duration("flush_ep_to_pageserver"): + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + with zenbenchmark.record_duration("timeline_checkpoint"): + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) From 133b89a83db9d49a802efc5a66a7fde65b7c8d5e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:35:39 -0500 Subject: [PATCH 51/77] feat(pageserver): continue from last incomplete image layer creation (#10660) ## Problem close https://github.com/neondatabase/neon/issues/10651 ## Summary of changes * Image layer creation starts from the next partition of the last processed partition if the previous attempt was not complete. * Add tests. 
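To make the resume rule concrete, the sketch below shows the partition-skipping idea under simplified assumptions: keys are plain `u64` values instead of the pageserver's `Key`/`KeySpace` types, and a partition is represented only by its end key. The real logic is in the `timeline.rs` diff below.

```rust
/// Minimal sketch of the resume logic: given the (sorted) end key of each
/// partition and the last key processed by an incomplete previous attempt,
/// return the index of the partition to resume from, or None if image layer
/// creation should be considered complete.
fn resume_from_partition(partition_ends: &[u64], last_key: u64) -> Option<usize> {
    for (i, end) in partition_ends.iter().enumerate() {
        if last_key <= *end {
            // Skip partition `i` itself: new writes may have grown a relation so
            // that `last_key` now falls inside it, and we still want to make
            // progress instead of regenerating the same partition again.
            return if i + 1 < partition_ends.len() {
                Some(i + 1)
            } else {
                // last_key was in the final partition, so the previous attempt
                // already covered everything that matters.
                None
            };
        }
    }
    // last_key is beyond every partition: nothing left to do.
    None
}
```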
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 92 ++++++++++++++++---- pageserver/src/tenant/timeline/compaction.rs | 2 +- test_runner/regress/test_compaction.py | 56 +++++++++++- 3 files changed, 129 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 11c0bbdfe5..b6a349a209 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -192,7 +192,12 @@ pub enum ImageLayerCreationMode { #[derive(Clone, Debug, Default)] pub enum LastImageLayerCreationStatus { - Incomplete, // TODO: record the last key being processed + Incomplete { + /// The last key of the partition (exclusive) that was processed in the last + /// image layer creation attempt. We will continue from this key in the next + /// attempt. + last_key: Key, + }, Complete, #[default] Initial, @@ -4346,7 +4351,7 @@ impl Timeline { Ok(result) } - // Is it time to create a new image layer for the given partition? + // Is it time to create a new image layer for the given partition? True if we want to generate. async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); @@ -4658,6 +4663,11 @@ impl Timeline { ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); + if partitioning.parts.is_empty() { + warn!("no partitions to create image layers for"); + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one // image layer with hole between them. In this case such layer can not be utilized by GC. @@ -4669,28 +4679,65 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status { - info!( - "resuming image layer creation: last_status={:?}", - last_status - ); - true - } else { - self.should_check_if_image_layers_required(lsn) - }; + let check_for_image_layers = + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + info!( + "resuming image layer creation: last_status=incomplete, continue from {}", + last_key + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; let mut all_generated = true; let mut partition_processed = 0; - let total_partitions = partitioning.parts.len(); + let mut total_partitions = partitioning.parts.len(); + let mut last_partition_processed = None; + let mut partition_parts = partitioning.parts.clone(); - for partition in partitioning.parts.iter() { + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + // We need to skip the partitions that have already been processed. + let mut found = false; + for (i, partition) in partition_parts.iter().enumerate() { + if last_key <= partition.end().unwrap() { + // ```plain + // |------|--------|----------|------| + // ^last_key + // ^start from this partition + // ``` + // Why `i+1` instead of `i`? + // It is possible that the user did some writes after the previous image layer creation attempt so that + // a relation grows in size, and the last_key is now in the middle of the partition. 
In this case, we + // still want to skip this partition, so that we can make progress and avoid generating image layers over + // the same partition. Doing a mod to ensure we don't end up with an empty vec. + if i + 1 >= total_partitions { + // In general, this case should not happen -- if last_key is on the last partition, the previous + // iteration of image layer creation should return a complete status. + break; // with found=false + } + partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements + total_partitions = partition_parts.len(); + // Update the start key to the partition start. + start = partition_parts[0].start().unwrap(); + found = true; + break; + } + } + if !found { + // Last key is within the last partition, or larger than all partitions. + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + } + + for partition in partition_parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } - + partition_processed += 1; let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4725,6 +4772,8 @@ impl Timeline { lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), is_delta: false, }) { + // TODO: this can be processed with the BatchLayerWriter::finish_with_discard + // in the future. tracing::info!( "Skipping image layer at {lsn} {}..{}, already exists", img_range.start, @@ -4805,8 +4854,6 @@ impl Timeline { } } - partition_processed += 1; - if let ImageLayerCreationMode::Try = mode { // We have at least made some progress if batch_image_writer.pending_layer_num() >= 1 { @@ -4822,8 +4869,10 @@ impl Timeline { * self.get_compaction_threshold(); if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { tracing::info!( - "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}", + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", + partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers ); + last_partition_processed = Some(partition.clone()); all_generated = false; break; } @@ -4868,7 +4917,14 @@ impl Timeline { if all_generated { LastImageLayerCreationStatus::Complete } else { - LastImageLayerCreationStatus::Incomplete + LastImageLayerCreationStatus::Incomplete { + last_key: if let Some(last_partition_processed) = last_partition_processed { + last_partition_processed.end().unwrap_or(Key::MIN) + } else { + // This branch should be unreachable, but in case it happens, we can just return the start key. + Key::MIN + }, + } }, )) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7dd37d7232..466ebea783 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -748,7 +748,7 @@ impl Timeline { .store(Arc::new(outcome.clone())); self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete = outcome { + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. 
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); return Ok(CompactionOutcome::Pending); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c031d66dfb..f3347b594e 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -29,6 +29,21 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = { # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later } +PREEMPT_COMPACTION_TENANT_CONF = { + "gc_period": "5s", + "compaction_period": "5s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 1, + "image_creation_preempt_threshold": 1, + # compact more frequently + "compaction_threshold": 3, + "compaction_upper_limit": 6, + "lsn_lease_length": "0s", +} + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( @@ -36,7 +51,8 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = { [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], ) def test_pageserver_compaction_smoke( - neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol + neon_env_builder: NeonEnvBuilder, + wal_receiver_protocol: PageserverWalReceiverProtocol, ): """ This is a smoke test that compaction kicks in. The workload repeatedly churns @@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke( page_cache_size=10 """ - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF) + conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -113,6 +130,41 @@ page_cache_size=10 assert vectored_average < 8 +@skip_in_debug_build("only run with release build") +def test_pageserver_compaction_preempt( + neon_env_builder: NeonEnvBuilder, +): + # Ideally we should be able to do unit tests for this, but we need real Postgres + # WALs in order to do unit testing... + + conf = PREEMPT_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 200000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=False) + workload.validate(env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + # ensure image layer creation gets preempted and then resumed + env.pageserver.assert_log_contains("resuming image layer creation") + + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( "with_branches", From 6699a30a4948309703a90ed167980ef9d467bfad Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 5 Feb 2025 20:07:51 +0200 Subject: [PATCH 52/77] Make it easy to build only a subset of extensions into compute image (#10655) The full build of all extensions takes a long time. 
When working locally on parts that don't need extensions, you can iterate more quickly by skipping the unnecessary extensions. This adds a build argument to the dockerfile to specify extensions to build. There are three options: - EXTENSIONS=all (default) - EXTENSIONS=minimal: Build only a few extensions that are listed in shared_preload_libraries in the default neon config. - EXTENSIONS=none: Build no extensions (except for the mandatory 'neon' extension). --- compute/compute-node.Dockerfile | 44 +++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9379856eab..43910f2622 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -85,6 +85,10 @@ ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ARG ALPINE_CURL_VERSION=8.11.1 +# By default, build all PostgreSQL extensions. For quick local testing when you don't +# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal +ARG EXTENSIONS=all + ######################################################################################### # # Layer "build-deps" @@ -1484,12 +1488,35 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Layer "all-extensions" +# Layer "extensions-none" +# +######################################################################################### +FROM build-deps AS extensions-none + +RUN mkdir /usr/local/pgsql + +######################################################################################### +# +# Layer "extensions-minimal" +# +# This subset of extensions includes the extensions that we have in +# shared_preload_libraries by default. +# +######################################################################################### +FROM build-deps AS extensions-minimal + +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ + +######################################################################################### +# +# Layer "extensions-all" # Bundle together all the extensions # ######################################################################################### -FROM build-deps AS all-extensions -ARG PG_VERSION +FROM build-deps AS extensions-all # Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1531,7 +1558,13 @@ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=neon-ext-build /usr/local/pgsql/ /usr/local/pgsql/ +######################################################################################### +# +# Layer "neon-pg-ext-build" +# Includes Postgres and all the extensions chosen by EXTENSIONS arg. 
+# +######################################################################################### +FROM extensions-${EXTENSIONS} AS neon-pg-ext-build ######################################################################################### # @@ -1614,7 +1647,8 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 # ######################################################################################### FROM neon-ext-build AS postgres-cleanup-layer -COPY --from=all-extensions /usr/local/pgsql /usr/local/pgsql + +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp From 733a57247bb04893b62855d68ca54fbc85a58aa7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 15:44:28 -0500 Subject: [PATCH 53/77] fix(pageserver): disallow gc-compaction produce l0 layer (#10679) ## Problem Any compaction should never produce l0 layers. This never happened in my experiments, but would be good to guard it early. ## Summary of changes Disallow gc-compaction to produce l0 layers. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 466ebea783..cfde070442 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -33,6 +33,7 @@ use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; +use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -438,6 +439,11 @@ impl KeyHistoryRetention { if dry_run { return true; } + if LayerMap::is_l0(&key.key_range, key.is_delta) { + // gc-compaction should not produce L0 deltas, otherwise it will break the layer order. + // We should ignore such layers. + return true; + } let layer_generation; { let guard = tline.layers.read().await; From 0ceeec9be3acd07cda24d849d7cd71dc4c0ac3cc Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:11:50 -0500 Subject: [PATCH 54/77] fix(pageserver): schedule compaction immediately if pending (#10684) ## Problem The code is intended to reschedule compaction immediately if there are pending tasks. We set the duration to 0 before if there are pending tasks, but this will go through the `if period == Duration::ZERO {` branch and sleep for another 10 seconds. ## Summary of changes Set duration to 1 so that it doesn't sleep for too long. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/tasks.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b6b64d02dd..d65f099182 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -211,7 +211,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task sleep_duration = if let CompactionOutcome::Pending = outcome { - Duration::ZERO + Duration::from_secs(1) } else { period }; From 77f9e74d86d62bca524571aa50416680afb1f5ec Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 6 Feb 2025 02:14:29 +0100 Subject: [PATCH 55/77] pgxn: include socket send & recv queue size in slow response logs (#10673) # Problem When we see an apparent slow request, one possible cause is that the client is failing to consume responses, but we don't have a clear way to see that. # Solution - Log the socket queue depths on slow/stuck connections, so that we have an indication of whether the compute is keeping up with processing the connection's responses. refs - slack https://neondb.slack.com/archives/C036U0GRMRB/p1738652644396329 - refs https://github.com/neondatabase/cloud/issues/23515 - refs https://github.com/neondatabase/cloud/issues/23486 --- pgxn/neon/libpagestore.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 4460e3b40c..22aeb2e2d6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -36,6 +36,11 @@ #include "pagestore_client.h" #include "walproposer.h" +#ifdef __linux__ +#include +#include +#endif + #define PageStoreTrace DEBUG5 #define MIN_RECONNECT_INTERVAL_USEC 1000 @@ -728,11 +733,36 @@ retry: INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) { + int sndbuf = -1; + int recvbuf = -1; +#ifdef __linux__ + int socketfd; +#endif + since_start = now; INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)", + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + socketfd = PQsocket(pageserver_conn); + if (socketfd != -1) { + int ioctl_err; + ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); + if (ioctl_err!= 0) { + sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); + if (ioctl_err != 0) { + recvbuf = -errno; + } + } +#endif + neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received); + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); last_log_ts = now; logged = true; } From 7fc6953da41d536f8d785ea3a41640c80bb7c6bb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 07:42:14 +0200 Subject: [PATCH 56/77] Is neon superuser (#10625) ## Problem is_neon_superuser() fiunction is public in pg14/pg15 but statically defined in publicationcmd.c in pg16/pg17 ## Summary of changes Make this function public for all Postgres 
version. It is intended to be used not only in publicationcmd.c See https://github.com/neondatabase/postgres/pull/573 https://github.com/neondatabase/postgres/pull/576 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3cf7ce1afa..86d9ea96eb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3cf7ce1afab75027716d14223f95ddb300754162 +Subproject commit 86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index f0ffc8279d..8dfd5a7030 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit f0ffc8279dbcbbc439981a4fd001a9687e5d665d +Subproject commit 8dfd5a7030d3e8a98b60265ebe045788892ac7f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index c3eaeac927..efddaef46a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v17": [ "17.2", - "f0ffc8279dbcbbc439981a4fd001a9687e5d665d" + "8dfd5a7030d3e8a98b60265ebe045788892ac7f3" ], "v16": [ "16.6", - "3cf7ce1afab75027716d14223f95ddb300754162" + "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c" ], "v15": [ "15.10", From 81cd30e4d6e2f9e43a92e59c6ffce4f0890980ca Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 07:47:56 +0200 Subject: [PATCH 57/77] Use #ifdef instead of #if USE_ASSERT_CHECKING (#10683) ## Problem USE_ASSERT _CHECKING is defined as empty entity. but it is checked using #if ## Summary of changes Replace `#if USE_ASSERT _CHECKING` with `#ifdef USE_ASSERT _CHECKING` as done in other places in Postgres Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 6 +++--- pgxn/neon/pagestore_smgr.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 08b7652175..01da61f84b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -563,8 +563,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); -#if USE_ASSERT_CHECKING - do { +#ifdef USE_ASSERT_CHECKING + { int count = 0; for (int j = 0; j < nblocks; j++) @@ -574,7 +574,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } Assert(count == found); - } while (false); + } #endif return found; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 54cacea984..012bd479bc 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, { uint64 min_ring_index; PrefetchRequest hashkey; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING bool any_hits = false; #endif /* We will never read further ahead than our buffer can store. 
*/ @@ -955,7 +955,7 @@ Retry: else lsns = NULL; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING any_hits = true; #endif From 01f0be03b55e97fd3d1679e25415e1474513419e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 09:00:00 +0200 Subject: [PATCH 58/77] Fix bugs in lfc_cache_containsv (#10682) ## Problem Incorrect manipulations with iteration index in `lfc_cache_containsv` ## Summary of changes ``` - int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); + int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); int this_chunk = ``` - if (i + 1 >= nblocks) + if (i >= nblocks) ``` Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 01da61f84b..a61dc9f4c6 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -509,47 +509,44 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); LWLockAcquire(lfc_lock, LW_SHARED); + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return 0; + } while (true) { - int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); - if (LFC_ENABLED()) - { - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - if (entry != NULL) + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { - for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + if ((entry->bitmap[chunk_offs >> 5] & + ((uint32)1 << (chunk_offs & 31))) != 0) { - if ((entry->bitmap[chunk_offs >> 5] & - ((uint32)1 << (chunk_offs & 31))) != 0) - { - BITMAP_SET(bitmap, i); - found++; - } + BITMAP_SET(bitmap, i); + found++; } } - else - { - i += this_chunk; - } } else { - LWLockRelease(lfc_lock); - return found; + i += this_chunk; } /* * Break out of the iteration before doing expensive stuff for * a next iteration */ - if (i + 1 >= nblocks) + if (i >= nblocks) break; /* From abcd00181c07a4e3427eb2645bab64500aa82d49 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Feb 2025 08:24:36 +0100 Subject: [PATCH 59/77] pageserver: set a concurrency limit for LocalFS (#10676) ## Problem The local filesystem backend for remote storage doesn't set a concurrency limit. While it can't/won't enforce a concurrency limit itself, this also bounds the upload queue concurrency. Some tests create thousands of uploads, which slows down the quadratic scheduling of the upload queue, and there is no point spawning that many Tokio tasks. Resolves #10409. ## Summary of changes Set a concurrency limit of 100 for the LocalFS backend. 
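For illustration, a simplified sketch of the per-backend limit lookup this ends up with; the real `RemoteStorageKind` variants wrap full config structs rather than a bare `usize`, and the S3/Azure limits are still read from configuration, as shown in the diff below.

```rust
/// Default applied to the local filesystem backend after this change.
const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100;

/// Simplified stand-in for the remote storage configuration variants.
enum RemoteStorageKind {
    LocalFs,
    AwsS3 { concurrency_limit: usize },
    AzureContainer { concurrency_limit: usize },
}

fn concurrency_limit(kind: &RemoteStorageKind) -> usize {
    match kind {
        // LocalFs used to report no limit at all, leaving the upload queue
        // effectively unbounded; it now shares a default of 100.
        RemoteStorageKind::LocalFs => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT,
        RemoteStorageKind::AwsS3 { concurrency_limit }
        | RemoteStorageKind::AzureContainer { concurrency_limit } => *concurrency_limit,
    }
}
```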
Before: `test_layer_map[release-pg17].test_query: 68.338 s` After: `test_layer_map[release-pg17].test_query: 5.209 s` --- libs/remote_storage/src/config.rs | 10 +++++----- libs/remote_storage/src/lib.rs | 6 ++++++ pageserver/src/tenant/remote_timeline_client.rs | 9 +++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index dae141bf77..ff34158c9c 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use crate::{ DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; /// External backup storage configuration, enough for creating a client for that storage. @@ -45,11 +45,11 @@ impl RemoteStorageKind { impl RemoteStorageConfig { /// Helper to fetch the configured concurrency limit. - pub fn concurrency_limit(&self) -> Option { + pub fn concurrency_limit(&self) -> usize { match &self.storage { - RemoteStorageKind::LocalFs { .. } => None, - RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()), - RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()), + RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, + RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(), + RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(), } } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 7a864151ec..69b522d63e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// Here, a limit of max 20k concurrent connections was noted. /// pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; +/// Set this limit analogously to the S3 limit. +/// +/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds +/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the +/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks. +pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. 
/// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bcba6d1f62..ad6d8dfae8 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -437,8 +437,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); @@ -461,8 +460,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); @@ -484,8 +482,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; From 1686d9e7332653ef44d8690d4764c582437eb6bf Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 6 Feb 2025 09:33:37 +0100 Subject: [PATCH 60/77] perf(page_service): dont `.instrument(span.clone())` the response flush (#10686) On my AX102 Hetzner box, removing this line removes about 20us from the `latency_mean` result in `test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant`. If the same 20us can be removed in the nightly benchmark run, this will be a ~10% improvement because there, mean latencies are about ~220us. This span was added during batching refactors, we didn't have it before, and I don't think it's terribly useful. refs - https://github.com/neondatabase/cloud/issues/21759 --- pageserver/src/page_service.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e103338c7c..679cf3b2d5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1280,8 +1280,6 @@ impl PageServerHandler { } Ok(()) } - // and log the info! line inside the request span - .instrument(span.clone()) .await?; } Ok(()) From 95588dab9874f33bd4ef637171b765a7b71455ca Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 6 Feb 2025 09:24:28 +0000 Subject: [PATCH 61/77] safekeeper: fix wal fan-out shard subscription data race (#10677) ## Problem [This select arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L414) runs when we want to attach a new reader to the current cursor. It checks the current position of the cursor and resets it if required. The current position of the cursor is updated in the [other select arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L336-L345). That runs when we get some WAL to send. Now, what happens if we want to attach two shards consecutively to the cursor? Let's say [this select arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L397) runs twice in a row. Let's assume cursor is currently at LSN X. 
First shard wants to attach at position V and the other one at W. Assume X > W > V. First shard resets the stream to position V. Second shard comes in, sees stale cursor position X and resets it to W. This means that the first shard doesn't get wal in the [V, W) range. ## Summary of changes Ultimately, this boils down to the current position not being kept in sync with the reset of the WAL stream. This patch fixes the race by updating it when resetting the WAL stream and adds a unit test repro. Closes https://github.com/neondatabase/cloud/issues/23750 --- safekeeper/src/send_interpreted_wal.rs | 156 ++++++++++++++++++++----- safekeeper/src/test_utils.rs | 6 +- safekeeper/src/wal_reader_stream.rs | 2 +- 3 files changed, 130 insertions(+), 34 deletions(-) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index ea09ce364d..b57cc8001d 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -120,6 +120,20 @@ pub enum InterpretedWalReaderError { WalStreamClosed, } +enum CurrentPositionUpdate { + Reset(Lsn), + NotReset(Lsn), +} + +impl CurrentPositionUpdate { + fn current_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset(lsn) => *lsn, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } +} + impl InterpretedWalReaderState { fn current_position(&self) -> Option { match self { @@ -129,6 +143,26 @@ impl InterpretedWalReaderState { InterpretedWalReaderState::Done => None, } } + + // Reset the current position of the WAL reader if the requested starting position + // of the new shard is smaller than the current value. + fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + if new_shard_start_pos < *current_position { + *current_position = new_shard_start_pos; + CurrentPositionUpdate::Reset(*current_position) + } else { + CurrentPositionUpdate::NotReset(*current_position) + } + } + InterpretedWalReaderState::Done => { + panic!("maybe_reset called on finished reader") + } + } + } } pub(crate) struct AttachShardNotification { @@ -410,15 +444,24 @@ impl InterpretedWalReader { }; senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos}); - let current_pos = self.state.read().unwrap().current_position().unwrap(); - if start_pos < current_pos { - self.wal_stream.reset(start_pos).await; - wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); - } + + // If the shard is subscribing below the current position the we need + // to update the cursor that tracks where we are at in the WAL + // ([`Self::state`]) and reset the WAL stream itself + // (`[Self::wal_stream`]). This must be done atomically from the POV of + // anything outside the select statement. 
+ let position_reset = self.state.write().unwrap().maybe_reset(start_pos); + match position_reset { + CurrentPositionUpdate::Reset(to) => { + self.wal_stream.reset(to).await; + wal_decoder = WalStreamDecoder::new(to, self.pg_version); + }, + CurrentPositionUpdate::NotReset(_) => {} + }; tracing::info!( "Added shard sender {} with start_pos={} current_pos={}", - ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos + ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position() ); } } @@ -584,7 +627,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) .await .unwrap(); let end_pos = end_watch.get(); @@ -715,7 +758,6 @@ mod tests { const MSG_COUNT: usize = 200; const PG_VERSION: u32 = 17; const SHARD_COUNT: u8 = 2; - const ATTACHED_SHARDS: u8 = 4; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); let env = Env::new(true).unwrap(); @@ -725,9 +767,11 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) - .await - .unwrap(); + let mut next_record_lsns = Vec::default(); + let end_watch = + Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) + .await + .unwrap(); let end_pos = end_watch.get(); let streaming_wal_reader = StreamingWalReader::new( @@ -746,38 +790,71 @@ mod tests { ) .unwrap(); - let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); - let mut batch_receivers = vec![rx]; + struct Sender { + tx: Option>, + rx: tokio::sync::mpsc::Receiver, + shard: ShardIdentity, + start_lsn: Lsn, + received_next_record_lsns: Vec, + } + impl Sender { + fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self { + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + Self { + tx: Some(tx), + rx, + shard, + start_lsn, + received_next_record_lsns: Vec::default(), + } + } + } + + assert!(next_record_lsns.len() > 7); + let start_lsns = vec![ + next_record_lsns[5], + next_record_lsns[1], + next_record_lsns[3], + ]; + let mut senders = start_lsns + .into_iter() + .map(|lsn| Sender::new(lsn, shard_0)) + .collect::>(); + + let first_sender = senders.first_mut().unwrap(); let handle = InterpretedWalReader::spawn( streaming_wal_reader, - start_lsn, - tx, - shard_0, + first_sender.start_lsn, + first_sender.tx.take().unwrap(), + first_sender.shard, PG_VERSION, &Some("pageserver".to_string()), ); - for _ in 0..(ATTACHED_SHARDS - 1) { - let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); - handle.fanout(shard_0, tx, start_lsn).unwrap(); - batch_receivers.push(rx); + for sender in senders.iter_mut().skip(1) { + handle + .fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn) + .unwrap(); } - loop { - let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap(); - for rx in batch_receivers.iter_mut().skip(1) { - let other_batch = rx.recv().await.unwrap(); - - assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn); - assert_eq!( - batch.available_wal_end_lsn, - other_batch.available_wal_end_lsn + for sender in senders.iter_mut() { + loop { + let batch = sender.rx.recv().await.unwrap(); + tracing::info!( + "Sender with start_lsn={} received batch ending at {} with {} records", + sender.start_lsn, + batch.wal_end_lsn, + batch.records.records.len() ); - } - if batch.wal_end_lsn == batch.available_wal_end_lsn { - break; + for rec in 
batch.records.records { + sender.received_next_record_lsns.push(rec.next_record_lsn); + } + + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } } } @@ -792,5 +869,20 @@ mod tests { } assert!(done); + + for sender in senders { + tracing::info!( + "Validating records received by sender with start_lsn={}", + sender.start_lsn + ); + + assert!(sender.received_next_record_lsns.is_sorted()); + let expected = next_record_lsns + .iter() + .filter(|lsn| **lsn > sender.start_lsn) + .copied() + .collect::>(); + assert_eq!(sender.received_next_record_lsns, expected); + } } } diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 4e851c5b3d..79ceddd366 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -122,6 +122,7 @@ impl Env { start_lsn: Lsn, msg_size: usize, msg_count: usize, + mut next_record_lsns: Option<&mut Vec>, ) -> anyhow::Result { let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); @@ -130,7 +131,7 @@ impl Env { WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); - let prefix = c"p"; + let prefix = c"neon-file:"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(msg_size >= prefixlen); let message = vec![0; msg_size - prefixlen]; @@ -139,6 +140,9 @@ impl Env { &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn); for _ in 0..msg_count { let (lsn, record) = walgen.next().unwrap(); + if let Some(ref mut lsns) = next_record_lsns { + lsns.push(lsn); + } let req = AppendRequest { h: AppendRequestHeader { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index adac6067da..a0dd571a34 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -246,7 +246,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) .await .unwrap(); let end_pos = end_watch.get(); From b66fbd6176152f0d67e8b230a78639fd3481adcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 6 Feb 2025 11:09:20 +0100 Subject: [PATCH 62/77] Warn on basebackups for archived timelines (#10688) We don't want any external requests for an archived timeline. This includes basebackup requests, i.e. when a compute is being started up. Therefore, we'd like to forbid such basebackup requests: any attempt to get a basebackup on an archived timeline (or any getpage request really) is a cplane bug. Make this a warning for now so that, if there is potentially a bug, we can detect cases in the wild before they cause stuck operations, but the intention is to return an error eventually. 
Related: #9548 --- pageserver/src/page_service.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 679cf3b2d5..d4898532d6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -2035,6 +2035,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + if timeline.is_archived() == Some(true) { + // TODO after a grace period, turn this log line into a hard error + tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); + //return Err(QueryError::NotFound("timeline is archived".into())) + } + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. From 05326cc247b0149ff27ada881249f4945dc6b541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 6 Feb 2025 11:10:11 +0100 Subject: [PATCH 63/77] Skip gc cutoff lsn check at timeline creation if lease exists (#10685) Right now, branch creation doesn't care if a lsn lease exists or not, it just fails if the passed lsn is older than either the last or the planned gc cutoff. However, if an lsn lease exists for a given lsn, we can actually create a branch at that point: nothing has been gc'd away. This prevents race conditions that #10678 still leaves around. Related: #10639 https://github.com/neondatabase/cloud/issues/23667 --- pageserver/src/page_service.rs | 2 +- pageserver/src/tenant.rs | 32 +++++++++++++++++-------------- pageserver/src/tenant/timeline.rs | 3 +++ 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d4898532d6..24a350399d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1690,7 +1690,7 @@ impl PageServerHandler { // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); - if !gc_info.leases.contains_key(&request_lsn) { + if !gc_info.lsn_covered_by_lease(request_lsn) { return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c1b408ed72..3c6996dd51 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4642,22 +4642,26 @@ impl Tenant { // check against last actual 'latest_gc_cutoff' first let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context(format!( - "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, - )) - .map_err(CreateTimelineError::AncestorLsn)?; - - // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = gc_info.min_cutoff(); - if start_lsn < cutoff { - return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( - "invalid branch start lsn: less than planned GC cutoff {cutoff}" - ))); + let planned_cutoff = gc_info.min_cutoff(); + if gc_info.lsn_covered_by_lease(start_lsn) { + tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn); + } else { + src_timeline + .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn, + )) + .map_err(CreateTimelineError::AncestorLsn)?; + + // and then the planned GC cutoff + if start_lsn < planned_cutoff { + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( + "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}" + ))); + } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b6a349a209..45ddd38f67 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -531,6 +531,9 @@ impl GcInfo { pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool { self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes) } + pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool { + self.leases.contains_key(&lsn) + } } /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this From f4cfa725b8ab67fafba2aef8761549c0cc010147 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Feb 2025 11:30:27 +0100 Subject: [PATCH 64/77] pageserver: add a few critical errors (#10657) ## Problem Following #10641, let's add a few critical errors. Resolves #10094. ## Summary of changes Adds the following critical errors: * WAL sender read/decode failure. * WAL record ingestion failure. * WAL redo failure. * Missing key during compaction. We don't add an error for missing keys during GetPage requests, since we've seen a handful of these in production recently, and the cause is still unclear (most likely a benign race). --- libs/utils/src/logging.rs | 9 ++++++--- pageserver/src/tenant/timeline.rs | 8 +++++--- pageserver/src/tenant/timeline/compaction.rs | 15 ++++++++++++--- .../walreceiver/walreceiver_connection.rs | 16 +++++++++++++++- safekeeper/src/send_interpreted_wal.rs | 19 +++++++++---------- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 753f05b6fd..4a6069294d 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -8,19 +8,22 @@ use strum_macros::{EnumString, VariantNames}; /// Logs a critical error, similarly to `tracing::error!`. This will: /// /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. 
+/// * Trigger a pageable alert (via the metric below). /// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". -/// * Trigger a pageable alert (via the metric above). /// * In debug builds, panic the process. +/// +/// When including errors in the message, please use {err:?} to include the error cause and original +/// backtrace. #[macro_export] macro_rules! critical { - ($($arg:tt)*) => { + ($($arg:tt)*) => {{ if cfg!(debug_assertions) { panic!($($arg)*); } $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); - }; + }}; } #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 45ddd38f67..908356c459 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -52,6 +52,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::critical; use utils::rate_limit::RateLimit; use utils::{ fs_ext, @@ -5807,10 +5808,11 @@ impl Timeline { let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), - Err(walredo::Error::Other(e)) => { + Err(walredo::Error::Other(err)) => { + critical!("walredo failure during page reconstruction: {err:?}"); return Err(PageReconstructError::WalRedo( - e.context("reconstruct a page image"), - )) + err.context("reconstruct a page image"), + )); } }; Ok(img) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index cfde070442..b9f4954453 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - LastImageLayerCreationStatus, RecordedDuration, Timeline, + CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, + ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::critical; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; @@ -748,7 +749,15 @@ impl Timeline { .as_ref() .clone(), ) - .await?; + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; self.last_image_layer_creation_status .store(Arc::new(outcome.clone())); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index de917377cb..23db4f88d2 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -39,7 +39,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{id::NodeId, lsn::Lsn, 
postgres_client::PostgresClientProtocol}; +use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. @@ -393,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; uncommitted_records += 1; @@ -520,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; if !ingested { tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index b57cc8001d..5916675c3f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -15,7 +15,8 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{info_span, Instrument}; +use tracing::{error, info, info_span, Instrument}; +use utils::critical; use utils::lsn::Lsn; use utils::postgres_client::Compression; use utils::postgres_client::InterpretedFormat; @@ -213,11 +214,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = reader.run_impl(start_pos).await; - if let Err(ref err) = res { - tracing::error!("Task finished with error: {err}"); - } - res + reader + .run_impl(start_pos) + .await + .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) } .instrument(info_span!("interpreted wal reader")), ); @@ -273,11 +273,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = self.run_impl(start_pos).await; - if let Err(err) = res { - tracing::error!("Interpreted wal reader encountered error: {err}"); + if let Err(err) = self.run_impl(start_pos).await { + critical!("failed to read WAL record: {err:?}"); } else { - tracing::info!("Interpreted wal reader exiting"); + info!("interpreted wal reader exiting"); } Err(CopyStreamHandlerEnd::Other(anyhow!( From 67b71538d0a7775043614122916b5ec4e5f61f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 6 Feb 2025 12:17:08 +0100 Subject: [PATCH 65/77] Limit returned lsn for timestamp by the planned gc cutoff (#10678) Often the output of the timestamp->lsn API is used as input for branch creation, and branch creation takes the planned lsn into account, i.e. rejects lsn's as branch lsns that are before the planned lsn. This patch doesn't fix all race conditions, it's still racy. But at least it is a step into the right direction. 
For #10639 --- pageserver/src/pgdatadir_mapping.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index dcbf62b56c..00f332d797 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -612,11 +612,18 @@ impl Timeline { pausable_failpoint!("find-lsn-for-timestamp-pausable"); let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); + let gc_cutoff_planned = { + let gc_info = self.gc_info.read().unwrap(); + gc_info.min_cutoff() + }; + // Usually the planned cutoff is newer than the cutoff of the last gc run, + // but let's be defensive. + let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be // on the safe side. - let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn()); + let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn()); let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the From 977781e423ad614c4fe8c55735fb35707170a402 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Thu, 6 Feb 2025 14:53:43 +0200 Subject: [PATCH 66/77] Enable sanitizers for postgres v17 (#10401) Add a build with sanitizers (asan, ubsan) to the CI pipeline and run tests on it. See https://github.com/neondatabase/neon/issues/6053 --------- Co-authored-by: Alexander Bayandin --- .../actions/run-python-test-set/action.yml | 2 + .github/workflows/_build-and-test-locally.yml | 35 +++-- .../build_and_test_with_sanitizers.yml | 133 ++++++++++++++++++ Makefile | 17 ++- compute_tools/src/bin/fast_import.rs | 8 ++ control_plane/src/background_process.rs | 8 +- control_plane/src/storage_controller.rs | 18 ++- libs/postgres_ffi/wal_craft/src/lib.rs | 10 +- libs/postgres_initdb/src/lib.rs | 8 ++ libs/utils/scripts/restore_from_wal.sh | 2 +- pageserver/src/walredo/process.rs | 8 ++ .../ingest_regress_test_result-new-format.py | 6 +- test_runner/fixtures/parametrize.py | 3 + test_runner/regress/test_compatibility.py | 7 +- test_runner/regress/test_pg_regress.py | 8 +- .../regress/test_subscriber_restart.py | 2 + 16 files changed, 253 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/build_and_test_with_sanitizers.yml diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 9a0261d430..0eddfe5da6 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -121,6 +121,8 @@ runs: export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} + export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1 + export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1 if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 1dec8106b4..3a6fbf4234 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -20,7 +20,7 @@ on: required: true type: string test-cfg: - description: 'a json object of 
postgres versions and lfc states to run regression tests on' + description: 'a json object of postgres versions and lfc/sanitizers states to build and run regression tests on' required: true type: string @@ -48,6 +48,8 @@ jobs: # io_uring will account the memory of the CQ and SQ as locked. # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + strategy: + matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} env: BUILD_TYPE: ${{ inputs.build-type }} GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} @@ -87,6 +89,7 @@ jobs: - name: Set env variables env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ matrix.sanitizers }} run: | CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then @@ -99,8 +102,14 @@ jobs: cov_prefix="" CARGO_FLAGS="--locked --release" fi + if [[ $SANITIZERS == 'enabled' ]]; then + make_vars="WITH_SANITIZERS=yes" + else + make_vars="" + fi { echo "cov_prefix=${cov_prefix}" + echo "make_vars=${make_vars}" echo "CARGO_FEATURES=${CARGO_FEATURES}" echo "CARGO_FLAGS=${CARGO_FLAGS}" echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" @@ -136,35 +145,39 @@ jobs: - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) + run: mold -run make ${make_vars} postgres-v14 -j$(nproc) - name: Build postgres v15 if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) + run: mold -run make ${make_vars} postgres-v15 -j$(nproc) - name: Build postgres v16 if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) + run: mold -run make ${make_vars} postgres-v16 -j$(nproc) - name: Build postgres v17 if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: mold -run make postgres-v17 -j$(nproc) + run: mold -run make ${make_vars} postgres-v17 -j$(nproc) - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) + run: mold -run make ${make_vars} neon-pg-ext -j$(nproc) - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) + run: mold -run make ${make_vars} walproposer-lib -j$(nproc) - name: Run cargo build + env: + WITH_TESTS: ${{ matrix.sanitizers != 'enabled' && '--tests' || '' }} run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + export ASAN_OPTIONS=detect_leaks=0 + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. 
- name: Install rust binaries env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ matrix.sanitizers }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ @@ -179,7 +192,7 @@ jobs: done # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then # Keep bloated coverage data files away from the rest of the artifact mkdir -p /tmp/coverage/ @@ -212,6 +225,7 @@ jobs: role-duration-seconds: 18000 # 5 hours - name: Run rust tests + if: ${{ matrix.sanitizers != 'enabled' }} env: NEXTEST_RETRIES: 3 run: | @@ -319,7 +333,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 + timeout-minutes: ${{ matrix.sanitizers != 'enabled' && 60 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -337,6 +351,7 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} + SANITIZERS: ${{ matrix.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml new file mode 100644 index 0000000000..cf0de3f8dc --- /dev/null +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -0,0 +1,133 @@ +name: Build and Test with Sanitizers + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 1 * * *' # run once a day, timezone is utc + workflow_dispatch: + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + tag: + runs-on: [ self-hosted, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get build tag + run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" + echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT + fi + shell: bash + id: build-tag + + build-build-tools-image: + uses: ./.github/workflows/build-build-tools-image.yml + secrets: inherit + + build-and-test-locally: + needs: [ tag, build-build-tools-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + build-type: [ release ] + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + test-cfg: '[{"pg_version":"v17", "sanitizers": "enabled"}]' + secrets: inherit + + + create-test-report: + needs: [ build-and-test-locally, build-build-tools-image ] + if: ${{ !cancelled() }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} + + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - uses: actions/github-script@v7 + if: ${{ !cancelled() }} + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const report = { + reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", + reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", + } + + const coverage = {} + + const script = require("./scripts/comment-test-report.js") + await script({ + github, + context, + fetch, + report, + coverage, + }) diff --git a/Makefile b/Makefile index d1238caebf..42ee643bb5 100644 --- a/Makefile +++ b/Makefile @@ -10,18 +10,29 @@ ICU_PREFIX_DIR := /usr/local/icu # 
environment variable. # BUILD_TYPE ?= debug +WITH_SANITIZERS ?= no ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +ifeq ($(WITH_SANITIZERS),yes) + PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover + COPT += -Wno-error # to avoid failing on warnings induced by sanitizers + PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS) + export CC := gcc + export ASAN_OPTIONS := detect_leaks=0 +endif + ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug @@ -33,7 +44,9 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux - PG_CONFIGURE_OPTS += --with-libseccomp + ifneq ($(WITH_SANITIZERS),yes) + PG_CONFIGURE_OPTS += --with-libseccomp + endif else ifeq ($(UNAME_S),Darwin) PG_CFLAGS += -DUSE_PREFETCH ifndef DISABLE_HOMEBREW @@ -106,7 +119,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ - CFLAGS='$(PG_CFLAGS)' \ + CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index c8440afb64..1398f443dd 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -231,6 +231,14 @@ pub(crate) async fn main() -> anyhow::Result<()> { ]) .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index af312d73a7..c668e68402 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -261,7 +261,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting); // Pass through these environment variables to the command - for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] { + for var in [ + "LLVM_PROFILE_FILE", + "FAILPOINTS", + "RUST_LOG", + "ASAN_OPTIONS", + "UBSAN_OPTIONS", + ] { if let Some(val) = std::env::var_os(var) { filled_cmd = filled_cmd.env(var, val); } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c41ff22d15..9a2d30c861 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -221,7 +221,17 @@ impl StorageController { "-p", &format!("{}", postgres_port), ]; - let exitcode = 
Command::new(bin_path).args(args).spawn()?.wait().await?; + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + let exitcode = Command::new(bin_path) + .args(args) + .envs(envs) + .spawn()? + .wait() + .await?; Ok(exitcode.success()) } @@ -242,6 +252,11 @@ impl StorageController { let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; let output = Command::new(&createdb_path) .args([ "-h", @@ -254,6 +269,7 @@ impl StorageController { &username(), DB_NAME, ]) + .envs(envs) .output() .await .expect("Failed to spawn createdb"); diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 9524a5149b..77dff4ac99 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -76,7 +76,15 @@ impl Conf { let mut cmd = Command::new(path); cmd.env_clear() .env("LD_LIBRARY_PATH", self.pg_lib_dir()?) - .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?); + .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ); Ok(cmd) } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index 2f072354fb..ed54696861 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -64,6 +64,14 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { .env_clear() .env("LD_LIBRARY_PATH", library_search_path) .env("DYLD_LIBRARY_PATH", library_search_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .stdin(std::process::Stdio::null()) // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index a8615c2337..f394d4c58d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -39,7 +39,7 @@ function initdb_with_args { ;; esac - eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}" } rm -fr "$DATA_DIR" diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 7e9477cfbc..bf30b92ea5 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -79,6 +79,14 @@ impl WalRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) // NB: The redo process is not trusted after we sent it the first // walredo work. Before that, it is trusted. 
Specifically, we trust // it to diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index ad2baf56bb..3a5cdf013a 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -32,6 +32,7 @@ CREATE TABLE IF NOT EXISTS results ( flaky BOOLEAN NOT NULL, arch arch DEFAULT 'X64', lfc BOOLEAN DEFAULT false NOT NULL, + sanitizers BOOLEAN DEFAULT false NOT NULL, build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -39,7 +40,7 @@ CREATE TABLE IF NOT EXISTS results ( reference TEXT NOT NULL, revision CHAR(40) NOT NULL, raw JSONB COMPRESSION lz4 NOT NULL, - UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id) + UNIQUE (parent_suite, suite, name, arch, lfc, sanitizers, build_type, pg_version, started_at, stopped_at, run_id) ); """ @@ -56,6 +57,7 @@ class Row: flaky: bool arch: str lfc: bool + sanitizers: bool build_type: str pg_version: int run_id: int @@ -135,6 +137,7 @@ def ingest_test_result( } arch = parameters.get("arch", "UNKNOWN").strip("'") lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc" + sanitizers = parameters.get("sanitizers", "disabled").strip("'") == "enabled" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} @@ -149,6 +152,7 @@ def ingest_test_result( flaky=test["flaky"] or test["retriesStatusChange"], arch=arch, lfc=lfc, + sanitizers=sanitizers, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 1acb1af23b..c33342c89e 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -124,5 +124,8 @@ def pytest_runtest_makereport(*args, **kwargs): allure.dynamic.parameter( "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc" ) + allure.dynamic.parameter( + "__sanitizers", "enabled" if os.getenv("SANITIZERS") == "enabled" else "disabled" + ) yield diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index cdc6c0053d..ba3078d493 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -314,7 +314,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): - ep = env.endpoints.create_start("main") + ep = env.endpoints.create("main") + ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")} + ep.start(env=ep_env) + connstr = ep.connstr() pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -363,7 +366,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r ) # Timeline exists again: restart the endpoint - ep.start() + ep.start(env=ep_env) pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 2877f14e0e..c5ae669dce 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -120,7 +120,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. 
# -@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -194,7 +194,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # -@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, @@ -222,6 +222,8 @@ def test_isolation( "max_prepared_transactions=100", # Enable the test mode, so that we don't need to patch the test cases. "neon.regress_test_mode = true", + # Stack size should be increased for tests to pass with asan. + "max_stack_depth = 4MB", ], ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") @@ -417,7 +419,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=5) + fut.result(timeout=15) except: exec.shutdown(wait=False, cancel_futures=True) raise diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 7d4f66d044..8ad7282ea2 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -3,12 +3,14 @@ from __future__ import annotations import threading import time +import pytest from fixtures.neon_fixtures import NeonEnv from fixtures.utils import wait_until # This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. # It requires tracking information about replication origins at page server side +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM def test_subscriber_restart(neon_simple_env: NeonEnv): env = neon_simple_env env.create_branch("publisher") From f22d41eaec1b6d96c71940f2d4e27bcc04d7617a Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 6 Feb 2025 09:39:37 -0500 Subject: [PATCH 67/77] feat(pageserver): num of background job metrics (#10690) ## Problem We need metrics to know what's going on in pageserver's background jobs. ## Summary of changes * Waiting tasks: tasks still waiting for the semaphore. * Running tasks: tasks doing their actual jobs (see the sketch below).
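As a simplified illustration of the intended gauge lifecycle (not the actual `BackgroundLoopSemaphoreMetricsRecorder`; plain atomics stand in for the Prometheus gauges here): a task counts as waiting from the moment it requests the semaphore, and as running from acquisition until it completes or is cancelled.

```rust
use std::sync::atomic::{AtomicI64, Ordering};

/// Hypothetical stand-ins for the `waiting_tasks` / `running_tasks` gauges.
static WAITING_TASKS: AtomicI64 = AtomicI64::new(0);
static RUNNING_TASKS: AtomicI64 = AtomicI64::new(0);

/// Tracks a single background task through the semaphore.
struct Recorder {
    acquired: bool,
}

impl Recorder {
    /// The task starts waiting for the background-loop semaphore.
    fn start() -> Self {
        WAITING_TASKS.fetch_add(1, Ordering::Relaxed);
        Recorder { acquired: false }
    }

    /// The semaphore permit was acquired: move the task from waiting to running.
    fn acquired(&mut self) {
        self.acquired = true;
        WAITING_TASKS.fetch_sub(1, Ordering::Relaxed);
        RUNNING_TASKS.fetch_add(1, Ordering::Relaxed);
    }
}

impl Drop for Recorder {
    /// The task finished or was cancelled: decrement whichever gauge it still occupies.
    fn drop(&mut self) {
        if self.acquired {
            RUNNING_TASKS.fetch_sub(1, Ordering::Relaxed);
        } else {
            WAITING_TASKS.fetch_sub(1, Ordering::Relaxed);
        }
    }
}
```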
--------- Signed-off-by: Alex Chi Z Co-authored-by: Erik Grinaker --- pageserver/src/metrics.rs | 88 +++++++++++++++---- pageserver/src/tenant/tasks.rs | 23 +++-- .../src/tenant/timeline/eviction_task.rs | 11 ++- 3 files changed, 96 insertions(+), 26 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6ab1178a7b..1cc18d83ce 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2214,6 +2214,8 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { pub struct BackgroundLoopSemaphoreMetrics { counters: EnumMap, durations: EnumMap, + waiting_tasks: EnumMap, + running_tasks: EnumMap, } pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( @@ -2234,6 +2236,20 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); @@ -2243,29 +2259,69 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); durations.with_label_values(&[kind.into()]) })), + waiting_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + waiting_tasks.with_label_values(&[kind.into()]) + })), + running_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + running_tasks.with_label_values(&[kind.into()]) + })), } }, ); impl BackgroundLoopSemaphoreMetrics { - pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { - struct Record<'a> { - metrics: &'a BackgroundLoopSemaphoreMetrics, - task: BackgroundLoopKind, - _counter_guard: metrics::IntCounterPairGuard, - start: Instant, - } - impl Drop for Record<'_> { - fn drop(&mut self) { - let elapsed = self.start.elapsed().as_secs_f64(); - self.metrics.durations[self.task].inc_by(elapsed); - } - } - Record { - metrics: self, + /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the + /// semaphore is acquired, and drop it when the task completes or is cancelled. + pub(crate) fn record( + &self, + task: BackgroundLoopKind, + ) -> BackgroundLoopSemaphoreMetricsRecorder { + BackgroundLoopSemaphoreMetricsRecorder::start(self, task) + } +} + +/// Records metrics for a background task. +pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> { + metrics: &'a BackgroundLoopSemaphoreMetrics, + task: BackgroundLoopKind, + start: Instant, + wait_counter_guard: Option, +} + +impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> { + /// Starts recording semaphore metrics, by recording wait time and incrementing + /// `wait_start_count` and `waiting_tasks`. + fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self { + metrics.waiting_tasks[task].inc(); + Self { + metrics, task, - _counter_guard: self.counters[task].guard(), start: Instant::now(), + wait_counter_guard: Some(metrics.counters[task].guard()), + } + } + + /// Signals that the semaphore has been acquired, and updates relevant metrics. + pub fn acquired(&mut self) { + self.wait_counter_guard.take().expect("already acquired"); + self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + self.metrics.running_tasks[self.task].inc(); + } +} + +impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> { + /// The task either completed or was cancelled. + fn drop(&mut self) { + if self.wait_counter_guard.take().is_some() { + // Waiting. + self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + } else { + // Running. 
+ self.metrics.running_tasks[self.task].dec(); } } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d65f099182..1c3237d0bd 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::TENANT_TASK_EVENTS; +use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::throttle::Stats; @@ -61,21 +61,32 @@ impl BackgroundLoopKind { } } +pub struct BackgroundLoopSemaphorePermit<'a> { + _permit: tokio::sync::SemaphorePermit<'static>, + _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>, +} + /// Cancellation safe. pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> tokio::sync::SemaphorePermit<'static> { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); +) -> BackgroundLoopSemaphorePermit<'static> { + let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { pausable_failpoint!("initial-size-calculation-permit-pause"); } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); - match CONCURRENT_BACKGROUND_TASKS.acquire().await { - Ok(permit) => permit, - Err(_closed) => unreachable!("we never close the semaphore"), + let permit = CONCURRENT_BACKGROUND_TASKS + .acquire() + .await + .expect("should never close"); + recorder.acquired(); + + BackgroundLoopSemaphorePermit { + _permit: permit, + _recorder: recorder, } } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 26c2861b93..9836aafecb 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,8 +30,11 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, - tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, + size::CalculateSyntheticSizeError, + storage_layer::LayerVisibilityHint, + tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit}, + timeline::EvictionError, + LogicalSizeCalculationCause, Tenant, }, }; @@ -330,7 +333,7 @@ impl Timeline { &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { + ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, @@ -374,7 +377,7 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, - permit: tokio::sync::SemaphorePermit<'static>, + permit: BackgroundLoopSemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_shard_zero() { From 839f41f5bb5ef072972eefded5e5ccc32429b6e3 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 6 Feb 2025 15:39:45 +0100 Subject: [PATCH 68/77] fix pgcopydb seg fault and -c idle_in_transaction_session_timeout=0 (#10692) ## Problem During ingest_benchmark which uses `pgcopydb` ([see](https://github.com/dimitri/pgcopydb))we sometimes had outages. 
- when PostgreSQL COPY step failed we got a segfault (reported [here](https://github.com/dimitri/pgcopydb/issues/899)) - the root cause was Neon idle_in_transaction_session_timeout is set to 5 minutes which is suboptimal for long-running tasks like project import (reported [here](https://github.com/dimitri/pgcopydb/issues/900)) ## Summary of changes Patch pgcopydb to avoid segfault. override idle_in_transaction_session_timeout and set it to "unlimited" --- .dockerignore | 1 + build-tools.Dockerfile | 3 ++ build_tools/patches/pgcopydbv017.patch | 37 +++++++++++++++++++ .../test_perf_ingest_using_pgcopydb.py | 2 +- 4 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 build_tools/patches/pgcopydbv017.patch diff --git a/.dockerignore b/.dockerignore index 9e2d2e7108..7ead48db7c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -24,3 +24,4 @@ !storage_controller/ !vendor/postgres-*/ !workspace_hack/ +!build_tools/patches diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 3ade57b175..52874d2ef6 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -12,6 +12,8 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc +COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch + RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ set -e && \ apt update && \ @@ -44,6 +46,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir /tmp/pgcopydb && \ tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \ cd /tmp/pgcopydb && \ + patch -p1 < /pgcopydbv017.patch && \ make -s clean && \ make -s -j12 install && \ libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \ diff --git a/build_tools/patches/pgcopydbv017.patch b/build_tools/patches/pgcopydbv017.patch new file mode 100644 index 0000000000..c309d8fe59 --- /dev/null +++ b/build_tools/patches/pgcopydbv017.patch @@ -0,0 +1,37 @@ +diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c +index d730b03..69a9be9 100644 +--- a/src/bin/pgcopydb/copydb.c ++++ b/src/bin/pgcopydb/copydb.c +@@ -44,6 +44,7 @@ GUC dstSettings[] = { + { "synchronous_commit", "'off'" }, + { "statement_timeout", "0" }, + { "lock_timeout", "0" }, ++ { "idle_in_transaction_session_timeout", "0" }, + { NULL, NULL }, + }; + +diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c +index 94f2f46..86b9448 100644 +--- a/src/bin/pgcopydb/pgsql.c ++++ b/src/bin/pgcopydb/pgsql.c +@@ -3174,11 +3174,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) + /* errors have already been logged */ + return; + } +- + if (res != NULL) + { + char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); +- strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ if (sqlstate == NULL) ++ { ++ // PQresultErrorField returned NULL! 
++ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault ++ } ++ else ++ { ++ strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ } + } + + char *endpoint = diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index f0a0c1f5a2..da62422fca 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", + "PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() From ddd7c363430de8126c69edb02903a6e2bf7a1919 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 6 Feb 2025 14:40:22 +0000 Subject: [PATCH 69/77] CI(approved-for-ci-run): Use internal CI_ACCESS_TOKEN for cloning repo (#10693) ## Problem The default `GITHUB_TOKEN` is used to push changes created with `approved-for-ci-run`, which doesn't work: ``` Run git push --force origin "${BRANCH}" remote: Permission to neondatabase/neon.git denied to github-actions[bot]. fatal: unable to access 'https://github.com/neondatabase/neon/': The requested URL returned error: 403 ``` Ref: https://github.com/neondatabase/neon/actions/runs/13166108303/job/36746518291?pr=10687 ## Summary of changes - Use `CI_ACCESS_TOKEN` to clone an external repo - Remove unneeded `actions/checkout` --- .github/workflows/approved-for-ci-run.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index fc2f36c74b..f4e1e2e96c 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -67,9 +67,9 @@ jobs: - uses: actions/checkout@v4 with: - ref: main + ref: ${{ github.event.pull_request.head.sha }} token: ${{ secrets.CI_ACCESS_TOKEN }} - + - name: Look for existing PR id: get-pr env: @@ -77,7 +77,7 @@ jobs: run: | ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT} - + - name: Get changed labels id: get-labels if: steps.get-pr.outputs.ALREADY_CREATED != '' @@ -94,10 +94,6 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - - run: git checkout -b "${BRANCH}" - run: git push --force origin "${BRANCH}" @@ -105,7 +101,7 @@ jobs: - name: Create a Pull Request for CI run (if required) if: steps.get-pr.outputs.ALREADY_CREATED == '' - env: + env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md @@ -142,7 +138,7 @@ jobs: - run: git push --force origin "${BRANCH}" if: steps.get-pr.outputs.ALREADY_CREATED != '' - + cleanup: # Close PRs and delete branchs if the original PR is closed. 
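Note on the libpq behaviour behind the pgcopydb segfault fixed in PATCH 68 above: `PQresultErrorField()` is documented to return NULL when the result carries no such field (for example when the server connection died before the error report was filled in), so copying its return value into `pgsql->sqlstate` without a check dereferences NULL. The sketch below is a minimal, self-contained illustration of that guard, not pgcopydb source; the helper name is hypothetical and `snprintf()` stands in for the `strlcpy()` used in the actual patch.

```C
/*
 * Hypothetical helper, not pgcopydb source: guard against
 * PQresultErrorField() returning NULL before copying the SQLSTATE.
 */
#include <stdio.h>
#include <libpq-fe.h>

static void
copy_sqlstate(const PGresult *res, char *sqlstate, size_t len)
{
	char *state = PQresultErrorField(res, PG_DIAG_SQLSTATE);

	if (state == NULL)
		sqlstate[0] = '\0';	/* empty string instead of a NULL dereference */
	else
		snprintf(sqlstate, len, "%s", state);	/* the real patch uses strlcpy() */
}
```

A caller would pass a small fixed buffer (a SQLSTATE is five characters plus the terminator), so falling back to an empty string keeps later reads of the field well-defined.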
From df06c410852e51e01528ba63942f2361b8f6f68b Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 6 Feb 2025 15:18:50 +0000 Subject: [PATCH 70/77] tests: don't detach from controller in test_issue_5878 (#10675) ## Problem This test called NeonPageserver.tenant_detach, which as well as detaching locally on the pageserver, also updates the storage controller to put the tenant into Detached mode. When the test runs slowly in debug mode, it sometimes takes long enough that the background_reconcile loop wakes up and drops the tenant from memory in response, such that the pageserver can't validate its deletions and the test does not behave as expected. Closes: https://github.com/neondatabase/neon/issues/10513 ## Summary of changes - Call the pageserver HTTP client directly rather than going via NeonPageserver.tenant_detach --- test_runner/regress/test_layers_from_future.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 5e06a1d47f..872d3dc4cf 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -172,7 +172,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): # force removal of layers from the future tenant_conf = ps_http.tenant_config(tenant_id) generation_before_detach = get_generation_number() - env.pageserver.tenant_detach(tenant_id) + env.pageserver.http_client().tenant_detach(tenant_id) failpoint_deletion_queue = "deletion-queue-before-execute-pause" ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) From 2943590694c57bde91eed71aa92bc96c18abb152 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Feb 2025 18:17:47 +0100 Subject: [PATCH 71/77] pageserver: use histogram for background job semaphore waits (#10697) ## Problem We don't have visibility into how long an individual background job is waiting for a semaphore permit. ## Summary of changes * Make `pageserver_background_loop_semaphore_wait_seconds` a histogram rather than a sum. * Add a paced warning when a task takes more than 10 minutes to get a permit (for now). * Drive-by cleanup of some `EnumMap` usage. --- pageserver/src/metrics.rs | 58 ++++++++++++++++++---------------- pageserver/src/tenant/tasks.rs | 38 ++++++++++++++++------ 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1cc18d83ce..3b8612a3fa 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -6,7 +6,7 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; -use enum_map::EnumMap; +use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, @@ -104,7 +104,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); -// Buckets for background operations like compaction, GC, size calculation +// Buckets for background operation duration in seconds, like compaction, GC, size calculation. 
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { @@ -236,7 +236,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| GetVectoredLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -259,7 +259,7 @@ pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { ScanLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -300,10 +300,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { - let task_kind = ::from_usize(task_kind); + let task_kind = TaskKind::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); EnumMap::from_array(std::array::from_fn(|content_kind| { - let content_kind = ::from_usize(content_kind); + let content_kind = PageContentKind::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { read_accesses_immutable: { @@ -1913,7 +1913,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy ComputeCommandCounters { map: EnumMap::from_array(std::array::from_fn(|i| { - let command = ::from_usize(i); + let command = ComputeCommandKind::from_usize(i); let command_str: &'static str = command.into(); inner.with_label_values(&[command_str]) })), @@ -2213,13 +2213,13 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { pub struct BackgroundLoopSemaphoreMetrics { counters: EnumMap, - durations: EnumMap, + durations: EnumMap, waiting_tasks: EnumMap, running_tasks: EnumMap, } -pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( - || { +pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = + Lazy::new(|| { let counters = register_int_counter_pair_vec!( "pageserver_background_loop_semaphore_wait_start_count", "Counter for background loop concurrency-limiting semaphore acquire calls started", @@ -2229,10 +2229,11 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); + counters: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); counters.with_label_values(&[kind.into()]) })), - durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + durations: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); durations.with_label_values(&[kind.into()]) })), - waiting_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); waiting_tasks.with_label_values(&[kind.into()]) })), - running_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + running_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); running_tasks.with_label_values(&[kind.into()]) })), } - }, -); + }); impl BackgroundLoopSemaphoreMetrics { /// Starts recording semaphore metrics. 
Call `acquired()` on the returned recorder when the @@ -2304,11 +2304,13 @@ impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> { } /// Signals that the semaphore has been acquired, and updates relevant metrics. - pub fn acquired(&mut self) { + pub fn acquired(&mut self) -> Duration { + let waited = self.start.elapsed(); self.wait_counter_guard.take().expect("already acquired"); - self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.durations[self.task].observe(waited.as_secs_f64()); self.metrics.waiting_tasks[self.task].dec(); self.metrics.running_tasks[self.task].inc(); + waited } } @@ -2317,7 +2319,7 @@ impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> { fn drop(&mut self) { if self.wait_counter_guard.take().is_some() { // Waiting. - self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64()); self.metrics.waiting_tasks[self.task].dec(); } else { // Running. @@ -2570,7 +2572,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = pub(crate) struct WalRedoProcessCounters { pub(crate) started: IntCounter, - pub(crate) killed_by_cause: enum_map::EnumMap, + pub(crate) killed_by_cause: EnumMap, pub(crate) active_stderr_logger_tasks_started: IntCounter, pub(crate) active_stderr_logger_tasks_finished: IntCounter, } @@ -2612,7 +2614,7 @@ impl Default for WalRedoProcessCounters { Self { started, killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { - let cause = ::from_usize(i); + let cause = WalRedoKillCause::from_usize(i); let cause_str: &'static str = cause.into(); killed.with_label_values(&[cause_str]) })), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1c3237d0bd..0f10dd7e10 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use std::str::FromStr; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; @@ -14,9 +14,11 @@ use crate::tenant::throttle::Stats; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use once_cell::sync::Lazy; use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::rate_limit::RateLimit; use utils::{backoff, completion, pausable_failpoint}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = @@ -41,7 +43,16 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { - self.into() - } -} - pub struct BackgroundLoopSemaphorePermit<'a> { _permit: tokio::sync::SemaphorePermit<'static>, _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>, @@ -71,6 +76,11 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, ) -> BackgroundLoopSemaphorePermit<'static> { + // TODO: use a lower threshold and remove the pacer once we resolve some blockage. 
+ const WARN_THRESHOLD: Duration = Duration::from_secs(600); + static WARN_PACER: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { @@ -82,7 +92,15 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( .acquire() .await .expect("should never close"); - recorder.acquired(); + + let waited = recorder.acquired(); + if waited >= WARN_THRESHOLD { + let waited = waited.as_secs_f64(); + WARN_PACER + .lock() + .unwrap() + .call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit")); + } BackgroundLoopSemaphorePermit { _permit: permit, @@ -628,7 +646,7 @@ pub(crate) fn warn_when_period_overrun( "task iteration took longer than the configured period" ); crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) + .with_label_values(&[task.into(), &format!("{}", period.as_secs())]) .inc(); } } From 82cbab75123d9e413c432ff8adec2f633056af77 Mon Sep 17 00:00:00 2001 From: OBBO67 <35974943+OBBO67@users.noreply.github.com> Date: Thu, 6 Feb 2025 17:26:26 +0000 Subject: [PATCH 72/77] Switch reqlsns[0].request_lsn to arrow operator in neon_read_at_lsnv() (#10620) (#10687) ## Problem Currently the following line below uses array subscript notation which is confusing since `reqlsns` is not an array but just a pointer to a struct. ``` XLogWaitForReplayOf(reqlsns[0].request_lsn); ``` ## Summary of changes Switch from array subscript notation to arrow operator to improve readability of code. Close #10620. --- pgxn/neon/pagestore_smgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 012bd479bc..8051970176 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3011,7 +3011,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block start_ts = GetCurrentTimestamp(); if (RecoveryInProgress() && MyBackendType != B_STARTUP) - XLogWaitForReplayOf(reqlsns[0].request_lsn); + XLogWaitForReplayOf(reqlsns->request_lsn); /* * Try to find prefetched page in the list of received pages. From 186199f406fbc0d6b46b9edf0ae79ca2f1a53cb2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 6 Feb 2025 20:28:27 +0300 Subject: [PATCH 73/77] Update aws sdk (#10699) ## Problem We have unclear issue with stuck s3 client, probably after partial aws sdk update without updating sdk-s3. https://github.com/neondatabase/neon/pull/10588 Let's try to update s3 as well. 
## Summary of changes Result of running cargo update -p aws-types -p aws-sigv4 -p aws-credential-types -p aws-smithy-types -p aws-smithy-async -p aws-sdk-kms -p aws-sdk-iam -p aws-sdk-s3 -p aws-config ref https://github.com/neondatabase/neon/issues/10695 --- Cargo.lock | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de1b1218ca..2c5b0a113f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -368,9 +368,9 @@ dependencies = [ [[package]] name = "aws-sdk-iam" -version = "1.53.0" +version = "1.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a" +checksum = "a43daa438f8e7e4ebbbcb5c712b3b85db50d62e637a7da4ba9da51095d327460" dependencies = [ "aws-credential-types", "aws-runtime", @@ -391,9 +391,9 @@ dependencies = [ [[package]] name = "aws-sdk-kms" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96" +checksum = "40b7a24700ac548025a47a5c579886f5198895bb1eccd8964dfd71cd66c16912" dependencies = [ "aws-credential-types", "aws-runtime", @@ -413,9 +413,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.65.0" +version = "1.68.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e" +checksum = "bc5ddf1dc70287dc9a2f953766a1fe15e3e74aef02fd1335f2afa475c9b4f4fc" dependencies = [ "aws-credential-types", "aws-runtime", @@ -514,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.7" +version = "1.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" +checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -670,9 +670,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.12" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" +checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" dependencies = [ "base64-simd", "bytes", @@ -705,9 +705,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.4" +version = "1.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" +checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" dependencies = [ "aws-credential-types", "aws-smithy-async", From 44b905d14b03d141fa94e0dfb0e1eae06c437ae6 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 6 Feb 2025 19:21:38 +0000 Subject: [PATCH 74/77] Fix remote extension lookup (#10708) when library name doesn't match extension name. 
The bug was introduced by recent commit ebc55e6a --- libs/compute_api/src/spec.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2fc95c47c6..767a34bcbc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -207,11 +207,11 @@ impl RemoteExtSpec { if !self .public_extensions .as_ref() - .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) && !self .custom_extensions .as_ref() - .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) { return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } @@ -414,7 +414,7 @@ mod tests { "public_extensions": ["ext"], "custom_extensions": [], "library_index": { - "ext": "ext" + "extlib": "ext", }, "extension_data": { "ext": { @@ -430,6 +430,12 @@ mod tests { rspec .get_ext("ext", false, "latest", "v17") .expect("Extension should be found"); + + // test library index for the case when library name + // doesn't match the extension name + rspec + .get_ext("extlib", true, "latest", "v17") + .expect("Library should be found"); } #[test] From e73d681a0e05d055de93e8d72762996bb746f440 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 6 Feb 2025 21:21:18 +0100 Subject: [PATCH 75/77] Patch pgcopydb and fix another segfault (#10706) ## Problem Found another pgcopydb segfault in error handling ```bash 2025-02-06 15:30:40.112 51299 ERROR pgsql.c:2330 [TARGET -738302813] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.112 51298 ERROR pgsql.c:2330 [TARGET -1407749748] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.112 51297 ERROR pgsql.c:2330 [TARGET -2073308066] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.112 51300 ERROR pgsql.c:2330 [TARGET 1220908650] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.432 51300 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.513 51290 ERROR copydb.c:773 Sub-process 51300 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:40.578 51299 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.613 51290 ERROR copydb.c:773 Sub-process 51299 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:41.253 51298 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:41.314 51290 ERROR copydb.c:773 Sub-process 51298 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:43.133 51297 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:43.215 51290 ERROR copydb.c:773 Sub-process 51297 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:43.215 51290 ERROR indexes.c:123 Some INDEX worker process(es) have exited with error, see above for details 2025-02-06 15:30:43.215 51290 ERROR indexes.c:59 Failed to create indexes, see above for details 2025-02-06 15:30:43.232 51271 ERROR copydb.c:768 Sub-process 51290 exited with code 12 ``` ```bashadmin@ip-172-31-38-164:~/pgcopydb$ gdb /usr/local/pgsql/bin/pgcopydb core GNU gdb (Debian 13.1-3) 13.1 Copyright (C) 2023 Free Software Foundation, Inc. License GPLv3+: GNU GPL version 3 or later This is free software: you are free to change and redistribute it. 
There is NO WARRANTY, to the extent permitted by law. Type "show copying" and "show warranty" for details. This GDB was configured as "aarch64-linux-gnu". Type "show configuration" for configuration details. For bug reporting instructions, please see: . Find the GDB manual and other documentation resources online at: . For help, type "help". Type "apropos word" to search for commands related to "word"... Reading symbols from /usr/local/pgsql/bin/pgcopydb... [New LWP 51297] [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1". Core was generated by `pgcopydb: create index ocr.ocr_pipeline_step_results_version_pkey '. Program terminated with signal SIGSEGV, Segmentation fault. #0 0x0000aaaac3a4b030 in splitLines (lbuf=lbuf@entry=0xffffd8b86930, buffer=) at string_utils.c:630 630 *newLinePtr = '\0'; (gdb) bt #0 0x0000aaaac3a4b030 in splitLines (lbuf=lbuf@entry=0xffffd8b86930, buffer=) at string_utils.c:630 #1 0x0000aaaac3a3a678 in pgsql_execute_log_error (pgsql=pgsql@entry=0xffffd8b87040, result=result@entry=0x0, sql=sql@entry=0xffff81fe9be0 "CREATE UNIQUE INDEX IF NOT EXISTS ocr_pipeline_step_results_version_pkey ON ocr.ocr_pipeline_step_results_version USING btree (id, transaction_id);", debugParameters=debugParameters@entry=0xaaaaec5f92f0, context=context@entry=0x0) at pgsql.c:2322 #2 0x0000aaaac3a3bbec in pgsql_execute_with_params (pgsql=pgsql@entry=0xffffd8b87040, sql=0xffff81fe9be0 "CREATE UNIQUE INDEX IF NOT EXISTS ocr_pipeline_step_results_version_pkey ON ocr.ocr_pipeline_step_results_version USING btree (id, transaction_id);", paramCount=paramCount@entry=0, paramTypes=paramTypes@entry=0x0, paramValues=paramValues@entry=0x0, context=context@entry=0x0, parseFun=parseFun@entry=0x0) at pgsql.c:1649 #3 0x0000aaaac3a3c468 in pgsql_execute (pgsql=pgsql@entry=0xffffd8b87040, sql=) at pgsql.c:1522 #4 0x0000aaaac3a245f4 in copydb_create_index (specs=specs@entry=0xffffd8b8ec98, dst=dst@entry=0xffffd8b87040, index=index@entry=0xffff81f71800, ifNotExists=) at indexes.c:846 #5 0x0000aaaac3a24ca8 in copydb_create_index_by_oid (specs=specs@entry=0xffffd8b8ec98, dst=dst@entry=0xffffd8b87040, indexOid=) at indexes.c:410 #6 0x0000aaaac3a25040 in copydb_index_worker (specs=specs@entry=0xffffd8b8ec98) at indexes.c:297 #7 0x0000aaaac3a25238 in copydb_start_index_workers (specs=specs@entry=0xffffd8b8ec98) at indexes.c:209 #8 0x0000aaaac3a252f4 in copydb_index_supervisor (specs=specs@entry=0xffffd8b8ec98) at indexes.c:112 #9 0x0000aaaac3a253f4 in copydb_start_index_supervisor (specs=0xffffd8b8ec98) at indexes.c:57 #10 copydb_start_index_supervisor (specs=specs@entry=0xffffd8b8ec98) at indexes.c:34 #11 0x0000aaaac3a51ff4 in copydb_process_table_data (specs=specs@entry=0xffffd8b8ec98) at table-data.c:146 #12 0x0000aaaac3a520dc in copydb_copy_all_table_data (specs=specs@entry=0xffffd8b8ec98) at table-data.c:69 #13 0x0000aaaac3a0ccd8 in cloneDB (copySpecs=copySpecs@entry=0xffffd8b8ec98) at cli_clone_follow.c:602 #14 0x0000aaaac3a0d2cc in start_clone_process (pid=0xffffd8b743d8, copySpecs=0xffffd8b8ec98) at cli_clone_follow.c:502 #15 start_clone_process (copySpecs=copySpecs@entry=0xffffd8b8ec98, pid=pid@entry=0xffffd8b89788) at cli_clone_follow.c:482 #16 0x0000aaaac3a0d52c in cli_clone (argc=, argv=) at cli_clone_follow.c:164 #17 0x0000aaaac3a53850 in commandline_run (command=command@entry=0xffffd8b9eb88, argc=0, argc@entry=22, argv=0xffffd8b9edf8, argv@entry=0xffffd8b9ed48) at 
/home/admin/pgcopydb/src/bin/pgcopydb/../lib/subcommands.c/commandline.c:71 #18 0x0000aaaac3a01464 in main (argc=22, argv=0xffffd8b9ed48) at main.c:140 (gdb) ``` The problem is most likely that the following call returned a message in a read-only memory segment where we cannot replace \n with \0 in string_utils.c splitLines() function ```C char *message = PQerrorMessage(pgsql->connection); ``` ## Summary of changes modified the patch to also address this problem --- build_tools/patches/pgcopydbv017.patch | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/build_tools/patches/pgcopydbv017.patch b/build_tools/patches/pgcopydbv017.patch index c309d8fe59..4e68793afc 100644 --- a/build_tools/patches/pgcopydbv017.patch +++ b/build_tools/patches/pgcopydbv017.patch @@ -11,10 +11,30 @@ index d730b03..69a9be9 100644 }; diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c -index 94f2f46..86b9448 100644 +index 94f2f46..e051ba8 100644 --- a/src/bin/pgcopydb/pgsql.c +++ b/src/bin/pgcopydb/pgsql.c -@@ -3174,11 +3174,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) +@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql, + + LinesBuffer lbuf = { 0 }; + ++ if (message != NULL){ ++ // make sure message is writable by splitLines ++ message = strdup(message); ++ } ++ + if (!splitLines(&lbuf, message)) + { + /* errors have already been logged */ +@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql, + PQbackendPID(pgsql->connection), + lbuf.lines[lineNumber]); + } ++ free(message); // free copy of message we created above + + if (pgsql->logSQL) + { +@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) /* errors have already been logged */ return; } From d255fa4b7e6e165facfd3c2af455e3f1e0782b1b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Feb 2025 06:02:18 +0000 Subject: [PATCH 76/77] Storage release 2025-02-07 From 69007f7ac8af50730fbf95cd3453dc890591cafb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 7 Feb 2025 18:37:53 +0100 Subject: [PATCH 77/77] Revert recent AWS SDK update (#10724) We've been seeing some regressions in staging since the AWS SDK updates: https://github.com/neondatabase/neon/issues/10695 . We aren't sure the regression was caused by the SDK update, but the issues do involve S3, so it's not unlikely. By reverting the SDK update we find out whether it was really the SDK update, or something else. 
Reverts the two PRs: * https://github.com/neondatabase/neon/pull/10588 * https://github.com/neondatabase/neon/pull/10699 https://neondb.slack.com/archives/C08C2G15M6U/p1738576986047179 --- Cargo.lock | 87 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2c5b0a113f..e73f1f9cdb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -300,9 +300,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.15" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -311,7 +311,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.4" +version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" +checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -368,15 +368,15 @@ dependencies = [ [[package]] name = "aws-sdk-iam" -version = "1.60.0" +version = "1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a43daa438f8e7e4ebbbcb5c712b3b85db50d62e637a7da4ba9da51095d327460" +checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -391,15 +391,15 @@ dependencies = [ [[package]] name = "aws-sdk-kms" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b7a24700ac548025a47a5c579886f5198895bb1eccd8964dfd71cd66c16912" +checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -413,9 +413,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.68.0" +version = "1.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5ddf1dc70287dc9a2f953766a1fe15e3e74aef02fd1335f2afa475c9b4f4fc" +checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e" dependencies = [ "aws-credential-types", "aws-runtime", @@ -424,7 +424,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -447,15 +447,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.57.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" +checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" dependencies = [ "aws-credential-types", "aws-runtime", 
"aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -469,15 +469,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" +checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -491,15 +491,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" +checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -514,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.8" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" +checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -543,9 +543,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -575,9 +575,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.6" +version = "0.60.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" dependencies = [ "aws-smithy-types", "bytes", @@ -586,9 +586,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.12" +version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -607,9 +607,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.2" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-json" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" dependencies = [ "aws-smithy-types", ] @@ -626,9 +635,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = 
"1.7.7" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" +checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -670,9 +679,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.13" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", @@ -705,9 +714,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.5" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ "aws-credential-types", "aws-smithy-async",