From 8bdb1828c8feea6f115cb63dd1c184ceec3ceffd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 14 Feb 2025 10:19:56 +0200 Subject: [PATCH 01/78] Perform seqscan to fill LFC chunks with data so that on-disk file size included size of table (#10775) ## Problem See https://github.com/neondatabase/neon/issues/10755 Random access pattern of pgbench leaves sparse chunks, which makes the on-disk size of file.cache unpredictable. ## Summary of changes Perform seqscan to fill LFC chunks with data so that on-disk file size included size of table. --------- Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_lfc_resize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 8762e6525b..ea7d38a3d9 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -72,6 +72,11 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): thread.join() + # Fill LFC: seqscan should fetch the whole table in cache. + # It is needed for further correct evaluation of LFC file size + # (a sparse chunk of LFC takes less than 1 MB on disk). + cur.execute("select sum(abalance) from pgbench_accounts") + # Before shrinking the cache, check that it really is large now (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert int(lfc_file_blocks) > 128 * 1024 From 996f0a3753fd7626d935cb327abe57056a60a06c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Feb 2025 09:57:19 +0000 Subject: [PATCH 02/78] storcon: fix eliding parameters from proxied URL labels (#10817) ## Problem We had code for stripping IDs out of proxied paths to reduce cardinality of metrics, but it was only stripping out tenant IDs, and leaving in timeline IDs and query parameters (e.g. LSN in lsn->timestamp lookups). ## Summary of changes - Use a more general regex approach. There is still some risk that a future pageserver API might include a parameter in `/the/path/`, but we control that API and it is not often extended. We will also alert on metrics cardinality in staging so that if we made that mistake we would notice. --- Cargo.lock | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/http.rs | 29 +++++++++++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 86d9603d36..74922d71c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6462,6 +6462,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "rand 0.8.5", + "regex", "reqwest", "routerify", "rustls 0.23.18", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 69276bfde4..a93bbdeaaf 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -34,6 +34,7 @@ reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true +regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 1a56116cad..e3e35a6303 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -516,6 +516,17 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +// For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters +// and tenant/timeline IDs. 
Since we are proxying to arbitrary paths, we don't have routing templates to +// compare to, so we can just filter out our well known ID format with regexes. +fn path_without_ids(path: &str) -> String { + static ID_REGEX: std::sync::OnceLock = std::sync::OnceLock::new(); + ID_REGEX + .get_or_init(|| regex::Regex::new(r"([0-9a-fA-F]{32}(-[0-9]{4})?|\?.*)").unwrap()) + .replace_all(path, "") + .to_string() +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -551,10 +562,7 @@ async fn handle_tenant_timeline_passthrough( .metrics_group .storage_controller_passthrough_request_latency; - // This is a bit awkward. We remove the param from the request - // and join the words by '_' to get a label for the request. - let just_path = path.replace(&tenant_shard_str, ""); - let path_label = just_path + let path_label = path_without_ids(&path) .split('/') .filter(|token| !token.is_empty()) .collect::>() @@ -2089,3 +2097,16 @@ pub fn make_router( ) }) } + +#[cfg(test)] +mod test { + + use super::path_without_ids; + + #[test] + fn test_path_without_ids() { + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + } +} From 878c1c7110348ef0352f4c0cd282746cd62f0fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 14 Feb 2025 11:21:50 +0100 Subject: [PATCH 03/78] offload_timeline: check if the timeline is archived on HasChildren error (#10776) PR #10305 makes sure that there is no *actual* race, i.e. we will never attempt to offload a timeline that has just been unarchived, or similar. However, if a timeline has been unarchived and has children that are unarchived too, we will get an error log line. Such races can occur as in compaction we check if the timeline can be offloaded way before we attempt to offload it: the result might change in the meantime. This patch checks if the delete guard can't be obtained because the timeline has unarchived children, and if yes, it does another check for whether the timeline has become unarchived or not. If it is unarchived, it just prints an info log msg and integrates itself into the error suppression logic of the compaction calling into it. If you squint at it really closely, there is still a possible race in which we print an error log, but this one is unlikely because the timeline and its children need to be archived right after the check for whether the timeline has any unarchived children, and right before the check whether the timeline is archived. Archival involves a network operation while nothing between these two checks does that, so it's very unlikely to happen in real life. 
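To make the decision flow described above easier to follow without reading the full diff, here is a minimal, self-contained sketch of the same logic. The names `DeleteGuardError` and `handle_guard_failure` are illustrative stand-ins (the real code uses `make_timeline_delete_guard`, `DeleteTimelineError` and `OffloadError` as shown in the diff below); only the control flow is meant to match.

```rust
// Illustrative stand-ins; the real pageserver types are DeleteTimelineError and OffloadError.
#[derive(Debug)]
enum DeleteGuardError {
    HasChildren(Vec<u64>),
    Other(String),
}

#[derive(Debug)]
enum OffloadError {
    NotArchived,
    Other(String),
}

// If the delete guard failed because of unarchived children, re-check whether the
// timeline itself is still archived: if it is, that is a real inconsistency (error);
// if not, it is the benign race described above (info). Either way, returning
// NotArchived lets the caller's error suppression treat the outcome as expected.
fn handle_guard_failure(
    err: DeleteGuardError,
    timeline_is_archived: Option<bool>,
) -> OffloadError {
    match err {
        DeleteGuardError::HasChildren(children) => {
            if timeline_is_archived == Some(true) {
                eprintln!("ERROR: archived timeline has non-archived children: {children:?}");
            } else {
                eprintln!("INFO: timeline is not archived and has unarchived children");
            }
            OffloadError::NotArchived
        }
        DeleteGuardError::Other(e) => OffloadError::Other(e),
    }
}

fn main() {
    // The benign race: the timeline was unarchived after compaction decided to offload it.
    let outcome = handle_guard_failure(DeleteGuardError::HasChildren(vec![42]), Some(false));
    println!("offload aborted with {outcome:?}");
}
```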
https://github.com/neondatabase/cloud/issues/23979#issuecomment-2651265729 --- pageserver/src/tenant/timeline/offload.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3b5bf8290c..93e5a1100d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -7,7 +7,9 @@ use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; -use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; +use crate::tenant::{ + DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, +}; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { @@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = make_timeline_delete_guard( + let delete_guard_res = make_timeline_delete_guard( tenant, timeline.timeline_id, TimelineDeleteGuardKind::Offload, - ) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + ); + if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); + return Err(OffloadError::NotArchived); + }; + let (timeline, guard) = + delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); From 646e011c4db9fee802386382fadb0060cbbf77d6 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:41:57 +0100 Subject: [PATCH 04/78] Tests the test-upgrade scripts themselves (#10664) ## Problem We run the compatibility tests only if we are upgrading the extension. An accidental code change may break the test itself, so we have to check this code as well. ## Summary of changes The test is scheduled once a day to save time and resources. 
--------- Co-authored-by: Alexander Bayandin --- .../force-test-extensions-upgrade.yml | 76 +++++++++++++++++++ docker-compose/test_extensions_upgrade.sh | 14 +++- 2 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/force-test-extensions-upgrade.yml diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml new file mode 100644 index 0000000000..71c5158ef6 --- /dev/null +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -0,0 +1,76 @@ +name: Force Test Upgrading of Extension +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '45 2 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow + group: ${{ github.workflow }} + cancel-in-progress: true + +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read + +jobs: + regress: + strategy: + fail-fast: false + matrix: + pg-version: [16, 17] + + runs-on: small + + steps: + - uses: actions/checkout@v4 + with: + submodules: false + + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + + - name: Test extension upgrade + timeout-minutes: 20 + env: + NEWTAG: latest + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + PG_VERSION: ${{ matrix.pg-version }} + FORCE_ALL_UPGRADE_TESTS: true + run: ./docker-compose/test_extensions_upgrade.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Post to the Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} + slack-message: | + Test upgrading of extensions: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 082b804a87..775acada1f 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -11,6 +11,7 @@ if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEW exit 1 fi export PG_VERSION=${PG_VERSION:-16} +export PG_TEST_VERSION=${PG_VERSION} function wait_for_ready { TIME=0 while ! 
docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do @@ -59,8 +60,12 @@ docker compose cp ext-src neon-test-extensions:/ docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" create_extensions "${EXTNAMES}" -query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion" -exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") +if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then + exts="${EXTNAMES}" +else + query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion" + exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") +fi if [ -z "${exts}" ]; then echo "No extensions were upgraded" else @@ -88,7 +93,10 @@ else exit 1 fi docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" - docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh + if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then + docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs + exit 1 + fi docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update" docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" done From da7496e1eef145253419ae699744353c79008047 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 14 Feb 2025 13:34:09 +0100 Subject: [PATCH 05/78] proxy: Post-refactor + future clippy lint cleanup (#10824) * Clean up deps and code after logging and binary refactor * Also include future clippy lint cleanup --- Cargo.lock | 10 -------- proxy/Cargo.toml | 4 ---- proxy/src/auth/backend/console_redirect.rs | 5 ++-- proxy/src/auth/backend/jwt.rs | 8 +++---- proxy/src/binary/local_proxy.rs | 28 +++++++++++----------- proxy/src/binary/pg_sni_router.rs | 13 +++++----- proxy/src/binary/proxy.rs | 24 ++++++++++--------- proxy/src/cache/endpoints.rs | 2 +- proxy/src/compute.rs | 4 ++-- proxy/src/console_redirect_proxy.rs | 2 +- proxy/src/control_plane/mod.rs | 3 +-- proxy/src/logging.rs | 3 +-- proxy/src/protocol2.rs | 4 ++-- proxy/src/proxy/connect_compute.rs | 2 +- proxy/src/proxy/mod.rs | 2 +- proxy/src/redis/notifications.rs | 4 ++-- proxy/src/serverless/backend.rs | 2 +- 17 files changed, 53 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74922d71c9..287201b4e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1029,12 +1029,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "boxcar" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" - [[package]] name = "bstr" version = "1.5.0" @@ -4929,7 +4923,6 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", - "boxcar", "bstr", "bytes", "camino", @@ -4981,7 +4974,6 @@ dependencies = [ "postgres-protocol2", "postgres_backend", "pq_proto", - "prometheus", "rand 0.8.5", "rand_distr", "rcgen", @@ -5006,7 +4998,6 @@ dependencies = [ "smallvec", "smol_str", "socket2", - "strum", "strum_macros", "subtle", "thiserror 1.0.69", @@ 
-5021,7 +5012,6 @@ dependencies = [ "tracing", "tracing-log", "tracing-opentelemetry", - "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3aa6ac3a76..6a381bf094 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,7 +19,6 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true -boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -63,7 +62,6 @@ postgres_backend.workspace = true postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true -prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } @@ -81,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true -strum.workspace = true strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true @@ -95,7 +92,6 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true tracing-log.workspace = true -tracing-serde.workspace = true tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 9be29c38c9..7503b4eac9 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -140,9 +140,8 @@ async fn authenticate( let (psql_session_id, waiter) = loop { let psql_session_id = new_psql_session_id(); - match control_plane::mgmt::get_waiter(&psql_session_id) { - Ok(waiter) => break (psql_session_id, waiter), - Err(_e) => continue, + if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) { + break (psql_session_id, waiter); } }; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e05a693cee..5d032c0deb 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -220,11 +220,11 @@ async fn fetch_jwks( } impl JwkCacheEntryLock { - async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + async fn acquire_permit(self: &Arc) -> JwkRenewalPermit<'_> { JwkRenewalPermit::acquire_permit(self).await } - fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + fn try_acquire_permit(self: &Arc) -> Option> { JwkRenewalPermit::try_acquire_permit(self) } @@ -393,7 +393,7 @@ impl JwkCacheEntryLock { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => return Err(JwtError::UnsupportedKeyType(key.into())), - }; + } tracing::debug!(?payload, "JWT signature valid with claims"); @@ -510,7 +510,7 @@ fn verify_rsa_signature( key.verify(data, &sig)?; } _ => return Err(JwtError::InvalidRsaSigningAlgorithm), - }; + } Ok(()) } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index e0d8515375..4ab11f828c 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -4,6 +4,20 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; +use thiserror::Error; +use tokio::net::TcpListener; +use 
tokio::sync::Notify; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use crate::auth::{self}; @@ -25,24 +39,10 @@ use crate::serverless::{self, GlobalConnPoolOptions}; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::types::RoleName; use crate::url::ApiUrl; -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use futures::future::Either; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - /// Neon proxy/router #[derive(Parser)] #[command(version = GIT_VERSION, about)] diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 235e9674c6..94e771a61c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -5,12 +5,6 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; -use crate::context::RequestContext; -use crate::metrics::{Metrics, ThreadPoolMetrics}; -use crate::protocol2::ConnectionInfo; -use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use crate::stream::{PqStream, Stream}; -use crate::tls::TlsServerEndPoint; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::future::Either; @@ -25,6 +19,13 @@ use tracing::{error, info, Instrument}; use utils::project_git_version; use utils::sentry_init::init_sentry; +use crate::context::RequestContext; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::protocol2::ConnectionInfo; +use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; + project_git_version!(GIT_VERSION); fn cli() -> clap::Command { diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e38c49ca10..b72799df54 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -3,6 +3,16 @@ use std::pin::pin; use std::sync::Arc; use std::time::Duration; +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; use crate::cancellation::{handle_cancel_messages, CancellationHandler}; @@ -24,15 +34,6 @@ use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::{auth, control_plane, http, serverless, usage_metrics}; -use anyhow::bail; -use futures::future::Either; -use remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::task::JoinSet; -use 
tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -303,7 +304,7 @@ pub async fn run() -> anyhow::Result<()> { match auth_backend { Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; + } info!("Using region: {}", args.aws_region); // TODO: untangle the config args @@ -803,9 +804,10 @@ fn build_auth_backend( mod tests { use std::time::Duration; - use crate::rate_limiter::RateBucketInfo; use clap::Parser; + use crate::rate_limiter::RateBucketInfo; + #[test] fn parse_endpoint_rps_limit() { let config = super::ProxyCliArgs::parse_from([ diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index b5c42cd23d..8ec1a4648b 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -242,7 +242,7 @@ impl EndpointsCache { }); tracing::error!("error parsing value {value:?}: {err:?}"); } - }; + } } if total.is_power_of_two() { tracing::debug!("endpoints read {}", total); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d71465765f..5447a4a4c0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -137,8 +137,8 @@ impl ConnCfg { match k { // Only set `user` if it's not present in the config. // Console redirect auth flow takes username from the console's response. - "user" if self.user_is_set() => continue, - "database" if self.db_is_set() => continue, + "user" if self.user_is_set() => {} + "database" if self.db_is_set() => {} "options" => { if let Some(options) = filtered_options(v) { self.set_param(k, &options); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index c4548a7ddd..1044f5f8e2 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -82,7 +82,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index f92e4f3f60..89ec4f9b33 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,8 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; -use crate::intern::AccountIdInt; -use crate::intern::ProjectIdInt; +use crate::intern::{AccountIdInt, ProjectIdInt}; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 97c9f5a59c..fbd4811b54 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -7,9 +7,8 @@ use chrono::{DateTime, Utc}; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; -use tracing::span; use tracing::subscriber::Interest; -use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 0dc97b7097..74a15d9bf4 100644 --- a/proxy/src/protocol2.rs +++ 
b/proxy/src/protocol2.rs @@ -119,7 +119,7 @@ pub(crate) async fn read_proxy_protocol( // if no more bytes available then exit if bytes_read == 0 { return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - }; + } // check if we have enough bytes to continue if let Some(header) = buf.try_get::() { @@ -169,7 +169,7 @@ fn process_proxy_payload( header.version_and_command ), )), - }; + } let size_err = "invalid proxy protocol length. payload not large enough to fit requested IP addresses"; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index dd145e6bb2..26fb1754bf 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -198,7 +198,7 @@ where warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT); } - }; + } let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 8a407c8119..2a406fcb34 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -118,7 +118,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 1a7024588a..5f9f2509e2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -169,7 +169,7 @@ impl MessageHandler { }); tracing::error!("broken message: {e}"); } - }; + } return Ok(()); } Ok(msg) => msg, @@ -180,7 +180,7 @@ impl MessageHandler { match serde_json::from_str::(&payload) { Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), Err(_) => tracing::error!("broken message: {e}"), - }; + } return Ok(()); } }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index edc2935618..6a59d413c4 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -651,7 +651,7 @@ async fn connect_http2( e, ))); } - }; + } }; let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) From a82a6631fdfb4471aeb090c8cee9e0e53b4f96ad Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Feb 2025 13:25:43 +0000 Subject: [PATCH 06/78] storage controller: prioritize reconciles for user-facing operations (#10822) ## Problem Some situations may produce a large number of pending reconciles. If we experience an issue where reconciles are processed more slowly than expected, that can prevent us responding promptly to user requests like tenant/timeline CRUD. This is a cleaner implementation of the hotfix in https://github.com/neondatabase/neon/pull/10815 ## Summary of changes - Introduce a second semaphore for high priority tasks, with configurable units (default 256). The intent is that in practical situations these user-facing requests should never have to wait. - Use the high priority semaphore for: tenant/timeline CRUD, and shard splitting operations. Use normal priority for everything else. 
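As a reading aid, here is a minimal, self-contained sketch of the two-semaphore acquisition pattern this change introduces: high-priority work tries its own pool first and may borrow a normal-priority unit when that pool is empty, so user-facing requests are not starved behind background reconciles. The `Priority` enum and `try_acquire` helper are illustrative names only; the real implementation is `Service::get_reconciler_units` in the diff below, which additionally wraps the permit in `ReconcileUnits`.

```rust
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};

#[derive(Clone, Copy)]
enum Priority {
    Normal,
    High,
}

// High-priority requests fall back to the normal-priority pool if their own
// pool is exhausted; normal-priority requests never touch the high pool.
fn try_acquire(
    normal: &Arc<Semaphore>,
    high: &Arc<Semaphore>,
    priority: Priority,
) -> Result<OwnedSemaphorePermit, TryAcquireError> {
    match priority {
        Priority::Normal => normal.clone().try_acquire_owned(),
        Priority::High => match high.clone().try_acquire_owned() {
            Ok(permit) => Ok(permit),
            Err(TryAcquireError::NoPermits) => normal.clone().try_acquire_owned(),
            Err(e) => Err(e),
        },
    }
}

fn main() {
    // Pool sizes mirror the defaults added in this patch (128 normal, 256 high).
    let normal = Arc::new(Semaphore::new(128));
    let high = Arc::new(Semaphore::new(256));

    let _background = try_acquire(&normal, &high, Priority::Normal).expect("normal unit");
    let _user_facing = try_acquire(&normal, &high, Priority::High).expect("high-priority unit");
}
```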
--- storage_controller/src/main.rs | 12 +- storage_controller/src/reconciler.rs | 33 ++++- storage_controller/src/service.rs | 124 ++++++++++++++---- .../src/service/chaos_injector.rs | 6 +- 4 files changed, 143 insertions(+), 32 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 07279a67ff..ea6bc38e89 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -12,7 +12,8 @@ use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, - MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -75,10 +76,14 @@ struct Cli { #[arg(long)] split_threshold: Option, - /// Maximum number of reconcilers that may run in parallel + /// Maximum number of normal-priority reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, + /// Maximum number of high-priority reconcilers that may run in parallel + #[arg(long)] + priority_reconciler_concurrency: Option, + /// How long to wait for the initial database connection to be available. #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -289,6 +294,9 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + priority_reconciler_concurrency: args + .priority_reconciler_concurrency + .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 8c7e9b1726..48f0804926 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -91,9 +91,10 @@ pub(crate) struct ReconcilerConfigBuilder { } impl ReconcilerConfigBuilder { - pub(crate) fn new() -> Self { + /// Priority is special: you must pick one thoughtfully, do not just use 'normal' as the default + pub(crate) fn new(priority: ReconcilerPriority) -> Self { Self { - config: ReconcilerConfig::default(), + config: ReconcilerConfig::new(priority), } } @@ -129,8 +130,18 @@ impl ReconcilerConfigBuilder { } } -#[derive(Default, Debug, Copy, Clone)] +// Higher priorities are used for user-facing tasks, so that a long backlog of housekeeping work (e.g. reconciling on startup, rescheduling +// things on node changes) does not starve user-facing tasks. +#[derive(Debug, Copy, Clone)] +pub(crate) enum ReconcilerPriority { + Normal, + High, +} + +#[derive(Debug, Copy, Clone)] pub(crate) struct ReconcilerConfig { + pub(crate) priority: ReconcilerPriority, + // During live migration give up on warming-up the secondary // after this timeout. secondary_warmup_timeout: Option, @@ -145,6 +156,18 @@ pub(crate) struct ReconcilerConfig { } impl ReconcilerConfig { + /// Configs are always constructed with an explicit priority, to force callers to think about whether + /// the operation they're scheduling is high-priority or not. 
Normal priority is not a safe default, because + /// scheduling something user-facing at normal priority can result in it getting starved out by background work. + pub(crate) fn new(priority: ReconcilerPriority) -> Self { + Self { + priority, + secondary_warmup_timeout: None, + secondary_download_request_timeout: None, + tenant_creation_hint: false, + } + } + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); self.secondary_warmup_timeout @@ -164,7 +187,9 @@ impl ReconcilerConfig { impl From<&MigrationConfig> for ReconcilerConfig { fn from(value: &MigrationConfig) -> Self { - let mut builder = ReconcilerConfigBuilder::new(); + // Run reconciler at high priority because MigrationConfig comes from human requests that should + // be presumed urgent. + let mut builder = ReconcilerConfigBuilder::new(ReconcilerPriority::High); if let Some(timeout) = value.secondary_warmup_timeout { builder = builder.secondary_warmup_timeout(timeout) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c1da9374e4..d5713d49ee 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -30,7 +30,10 @@ use crate::{ AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, ShardGenerationState, TenantFilter, }, - reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, + reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, + ReconcilerPriority, + }, safekeeper::Safekeeper, scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ @@ -79,7 +82,7 @@ use pageserver_api::{ }, }; use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::mpsc::error::TrySendError; +use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; use tokio_util::sync::CancellationToken; use utils::{ completion::Barrier, @@ -195,6 +198,7 @@ pub(crate) enum LeadershipStatus { } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; +pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -366,9 +370,12 @@ pub struct Config { /// and/or upon handling the re-attach request from a node. pub max_warming_up_interval: Duration, - /// How many Reconcilers may be spawned concurrently + /// How many normal-priority Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + /// How many high-priority Reconcilers may be spawned concurrently + pub priority_reconciler_concurrency: usize, + /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. pub split_threshold: Option, @@ -436,9 +443,14 @@ pub struct Service { // that transition it to/from Active. node_op_locks: IdLockMap, - // Limit how many Reconcilers we will spawn concurrently + // Limit how many Reconcilers we will spawn concurrently for normal-priority tasks such as background reconciliations + // and reconciliation on startup. reconciler_concurrency: Arc, + // Limit how many Reconcilers we will spawn concurrently for high-priority tasks such as tenant/timeline CRUD, which + // a human user might be waiting for. 
+ priority_reconciler_concurrency: Arc, + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile /// Send into this queue to promptly attempt to reconcile this shard next time units are available. /// @@ -1263,12 +1275,15 @@ impl Service { } // Maybe some other work can proceed now that this job finished. + // + // Only bother with this if we have some semaphore units available in the normal-priority semaphore (these + // reconciles are scheduled at `[ReconcilerPriority::Normal]`). if self.reconciler_concurrency.available_permits() > 0 { while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { let (nodes, tenants, _scheduler) = locked.parts_mut(); if let Some(shard) = tenants.get_mut(&tenant_shard_id) { shard.delayed_reconcile = false; - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } if self.reconciler_concurrency.available_permits() == 0 { @@ -1565,6 +1580,9 @@ impl Service { reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( config.reconciler_concurrency, )), + priority_reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.priority_reconciler_concurrency, + )), delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), @@ -2337,7 +2355,7 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); - let config = ReconcilerConfigBuilder::new() + let config = ReconcilerConfigBuilder::new(ReconcilerPriority::High) .tenant_creation_hint(true) .build(); tenants @@ -2812,7 +2830,8 @@ impl Service { shard.schedule(scheduler, &mut schedule_context)?; - let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + let maybe_waiter = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); if let Some(waiter) = maybe_waiter { waiters.push(waiter); } @@ -2933,7 +2952,9 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.config = config.clone(); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -3215,7 +3236,9 @@ impl Service { debug_assert!(shard.intent.get_attached().is_none()); debug_assert!(shard.intent.get_secondary().is_empty()); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { detach_waiters.push(waiter); } } @@ -3367,7 +3390,7 @@ impl Service { // In case scheduling is being switched back on, try it now. 
shard.schedule(scheduler, &mut schedule_context).ok(); - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } Ok(()) @@ -4416,7 +4439,7 @@ impl Service { tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } // We don't expect any new_shard_count shards to exist here, but drop them just in case @@ -4582,7 +4605,11 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + if let Some(waiter) = self.maybe_reconcile_shard( + &mut child_state, + nodes, + ReconcilerPriority::High, + ) { waiters.push(waiter); } @@ -4947,7 +4974,9 @@ impl Service { shard.intent.clear_secondary(scheduler); // Run Reconciler to execute detach fo secondary locations. - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -5215,7 +5244,7 @@ impl Service { let reconciler_config = match migrate_req.migration_config { Some(cfg) => (&cfg).into(), - None => ReconcilerConfig::default(), + None => ReconcilerConfig::new(ReconcilerPriority::High), }; self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) @@ -5281,7 +5310,7 @@ impl Service { ); } - self.maybe_reconcile_shard(shard, nodes) + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) }; if let Some(waiter) = waiter { @@ -5693,7 +5722,7 @@ impl Service { ) } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } // Here we remove an existing observed location for the node we're removing, and it will @@ -6062,7 +6091,14 @@ impl Service { tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + if self + .maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ) + .is_some() + { tenants_affected += 1; }; } @@ -6093,7 +6129,11 @@ impl Service { if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_shard, nodes); + self.maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ); } } } @@ -6457,8 +6497,36 @@ impl Service { &self, shard: &mut TenantShard, nodes: &Arc>, + priority: ReconcilerPriority, ) -> Option { - self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::new(priority)) + } + + /// Before constructing a Reconciler, acquire semaphore units from the appropriate concurrency limit (depends on priority) + fn get_reconciler_units( + &self, + priority: ReconcilerPriority, + ) -> Result { + let units = match priority { + ReconcilerPriority::Normal => self.reconciler_concurrency.clone().try_acquire_owned(), + ReconcilerPriority::High => { + match self + .priority_reconciler_concurrency + .clone() + .try_acquire_owned() + { + Ok(u) => Ok(u), + Err(TryAcquireError::NoPermits) => { + // If the high priority semaphore is exhausted, then high priority tasks may steal units from + // the normal priority semaphore. 
+ self.reconciler_concurrency.clone().try_acquire_owned() + } + Err(e) => Err(e), + } + } + }; + + units.map(ReconcileUnits::new) } /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], @@ -6478,8 +6546,8 @@ impl Service { } }; - let units = match self.reconciler_concurrency.clone().try_acquire_owned() { - Ok(u) => ReconcileUnits::new(u), + let units = match self.get_reconciler_units(reconciler_config.priority) { + Ok(u) => u, Err(_) => { tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), "Concurrency limited: enqueued for reconcile later"); @@ -6572,7 +6640,10 @@ impl Service { // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + if self + .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } else if shard.delayed_reconcile { // Shard wanted to reconcile but for some reason couldn't. @@ -6658,7 +6729,10 @@ impl Service { tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; - if self.maybe_reconcile_shard(shard, nodes).is_some() { + if self + .maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } } @@ -7208,7 +7282,7 @@ impl Service { // to not stall the operation when a cold secondary is encountered. const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -7541,7 +7615,7 @@ impl Service { ) -> Result<(), OperationError> { const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 91d7183fde..aa0ee0df5a 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -88,7 +88,11 @@ impl ChaosInjector { shard.intent.demote_attached(scheduler, old_location); shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + self.service.maybe_reconcile_shard( + shard, + nodes, + crate::reconciler::ReconcilerPriority::Normal, + ); } async fn inject_chaos(&mut self) { From fac5db3c8de25b6f44b267365926fd122c901a44 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 15:37:03 +0100 Subject: [PATCH 07/78] page_service: emit periodic log message while response flush is slow (#10813) The logic might seem a bit intricate / over-optimized, but I recently spent time benchmarking this code path in the context of a nightly pagebench regression (https://github.com/neondatabase/cloud/issues/21759) and I 
want to avoid regressing it any further. Ideally would also log the socket send & recv queue length like we do on the compute side in - https://github.com/neondatabase/neon/pull/10673 But that is proving difficult due to the Rust abstractions that wrap the socket fd. Work in progress on that is happening in - https://github.com/neondatabase/neon/pull/10823 Regarding production impact, I am worried at a theoretical level that the additional logging may cause a downward spiral in the case where a pageserver is slow to flush because there is not enough CPU. The logging would consume more CPU and thereby slow down flushes even more. However, I don't think this matters practically speaking. # Refs - context: https://neondb.slack.com/archives/C08DE6Q9C3B/p1739464533762049?thread_ts=1739462628.361019&cid=C08DE6Q9C3B - fixes https://github.com/neondatabase/neon/issues/10668 - part of https://github.com/neondatabase/cloud/issues/23515 # Testing Tested locally by running ``` ./target/debug/pagebench get-page-latest-lsn --num-clients=1000 --queue-depth=1000 ``` in one terminal, waiting a bit, then ``` pkill -STOP pagebench ``` then wait for slow logs to show up in `pageserver.log`. To see that the completion log message is logged, run ``` pkill -CONT pagebench ``` --- pageserver/src/metrics.rs | 42 +++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 983a3079e4..6a5dc3e749 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1439,27 +1439,43 @@ impl Drop for SmgrOpTimer { } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(self, mut started_at: Instant, mut fut: Fut) -> O + pub(crate) async fn measure(self, started_at: Instant, mut fut: Fut) -> O where Fut: std::future::Future, { let mut fut = std::pin::pin!(fut); - // Whenever observe_guard gets called, or dropped, - // it adds the time elapsed since its last call to metrics. - // Last call is tracked in `now`. + let mut logged = false; + let mut last_counter_increment_at = started_at; let mut observe_guard = scopeguard::guard( - || { + |is_timeout| { let now = Instant::now(); - let elapsed = now - started_at; - self.global_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.per_timeline_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - started_at = now; + + // Increment counter + { + let elapsed_since_last_observe = now - last_counter_increment_at; + self.global_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + last_counter_increment_at = now; + } + + // Log something on every timeout, and on completion but only if we hit a timeout. 
+ if is_timeout || logged { + logged = true; + let elapsed_total = now - started_at; + let msg = if is_timeout { + "slow flush ongoing" + } else { + "slow flush completed or cancelled" + }; + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); + tracing::info!(elapsed_total_secs, msg); + } }, |mut observe| { - observe(); + observe(false); }, ); @@ -1467,7 +1483,7 @@ impl SmgrOpFlushInProgress { match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { Ok(v) => return v, Err(_timeout) => { - (*observe_guard)(); + (*observe_guard)(true); } } } From 3d7a32f6196e87b00491fcdc4887ec9ed1bd1640 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Fri, 14 Feb 2025 16:10:06 +0000 Subject: [PATCH 08/78] fast import: allow restore to provided connection string (#10407) Within https://github.com/neondatabase/cloud/issues/22089 we decided that would be nice to start with import that runs dump-restore into a running compute (more on this [here](https://www.notion.so/neondatabase/2024-Jan-13-Migration-Assistant-Next-Steps-Proposal-Revised-17af189e004780228bdbcad13eeda93f?pvs=4#17af189e004780de816ccd9c13afd953)) We could do it by writing another tool or by extending existing `fast_import.rs`, we chose the latter. In this PR, I have added optional `restore_connection_string` as a cli arg and as a part of the json spec. If specified, the script will not run postgres and will just perform restore into provided connection string. TODO: - [x] fast_import.rs: - [x] cli arg in the fast_import.rs - [x] encoded connstring in json spec - [x] simplify `fn main` a little, take out too verbose stuff to some functions - [ ] ~~allow streaming from dump stdout to restore stdin~~ will do in a separate PR - [ ] ~~address https://github.com/neondatabase/neon/pull/10251#pullrequestreview-2551877845~~ will do in a separate PR - [x] tests: - [x] restore with cli arg in the fast_import.rs - [x] restore with encoded connstring in json spec in s3 - [ ] ~~test with custom dbname~~ will do in a separate PR - [ ] ~~test with s3 + pageserver + fast import binary~~ https://github.com/neondatabase/neon/pull/10487 - [ ] ~~https://github.com/neondatabase/neon/pull/10271#discussion_r1923715493~~ will do in a separate PR neondatabase/cloud#22775 --------- Co-authored-by: Eduard Dykman --- compute_tools/src/bin/fast_import.rs | 656 ++++++++++++++-------- poetry.lock | 15 +- pyproject.toml | 2 +- test_runner/fixtures/fast_import.py | 62 +- test_runner/fixtures/neon_fixtures.py | 27 + test_runner/regress/test_import_pgdata.py | 346 +++++++++++- 6 files changed, 866 insertions(+), 242 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 27cf1c2317..dad15d67b7 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,10 +25,10 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` -use anyhow::Context; +use anyhow::{bail, Context}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; -use clap::Parser; +use clap::{Parser, Subcommand}; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; use tracing::{error, info, info_span, warn, Instrument}; @@ -44,32 +44,59 @@ mod s3_uri; const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); +#[derive(Subcommand, Debug)] +enum Command { + /// Runs local postgres (neon binary), restores into it, + /// uploads pgdata to s3 to be consumed by pageservers + Pgdata { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// If specified, will not shut down the local postgres after the import. Used in local testing + #[clap(short, long)] + interactive: bool, + /// Port to run postgres on. Default is 5432. + #[clap(long, default_value_t = 5432)] + pg_port: u16, // port to run postgres on, 5432 is default + + /// Number of CPUs in the system. This is used to configure # of + /// parallel worker processes, for index creation. + #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] + num_cpus: Option, + + /// Amount of RAM in the system. This is used to configure shared_buffers + /// and maintenance_work_mem. + #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] + memory_mb: Option, + }, + + /// Runs pg_dump-pg_restore from source to destination without running local postgres. + DumpRestore { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// Raw connection string to the destination database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + destination_connection_string: Option, + }, +} + #[derive(clap::Parser)] struct Args { - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_WORKDIR")] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] s3_prefix: Option, - #[clap(long)] - source_connection_string: Option, - #[clap(short, long)] - interactive: bool, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")] pg_bin_dir: Utf8PathBuf, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")] pg_lib_dir: Utf8PathBuf, - #[clap(long)] - pg_port: Option, // port to run postgres on, 5432 is default - /// Number of CPUs in the system. This is used to configure # of - /// parallel worker processes, for index creation. - #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] - num_cpus: Option, - - /// Amount of RAM in the system. This is used to configure shared_buffers - /// and maintenance_work_mem. 
- #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] - memory_mb: Option, + #[clap(subcommand)] + command: Command, } #[serde_with::serde_as] @@ -78,6 +105,8 @@ struct Spec { encryption_secret: EncryptionSecret, #[serde_as(as = "serde_with::base64::Base64")] source_connstring_ciphertext_base64: Vec, + #[serde_as(as = "Option")] + destination_connstring_ciphertext_base64: Option>, } #[derive(serde::Deserialize)] @@ -93,192 +122,151 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { "C.UTF-8" }; -#[tokio::main] -pub(crate) async fn main() -> anyhow::Result<()> { - utils::logging::init( - utils::logging::LogFormat::Plain, - utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, - utils::logging::Output::Stdout, - )?; - - info!("starting"); - - let args = Args::parse(); - - // Validate arguments - if args.s3_prefix.is_none() && args.source_connection_string.is_none() { - anyhow::bail!("either s3_prefix or source_connection_string must be specified"); - } - if args.s3_prefix.is_some() && args.source_connection_string.is_some() { - anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); - } - - let working_directory = args.working_directory; - let pg_bin_dir = args.pg_bin_dir; - let pg_lib_dir = args.pg_lib_dir; - let pg_port = args.pg_port.unwrap_or_else(|| { - info!("pg_port not specified, using default 5432"); - 5432 - }); - - // Initialize AWS clients only if s3_prefix is specified - let (aws_config, kms_client) = if args.s3_prefix.is_some() { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms = aws_sdk_kms::Client::new(&config); - (Some(config), Some(kms)) - } else { - (None, None) - }; - - // Get source connection string either from S3 spec or direct argument - let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? - }; - - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .unwrap() - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - .await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? - } - } - } else { - args.source_connection_string.unwrap() - }; - - match tokio::fs::create_dir(&working_directory).await { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - if !is_directory_empty(&working_directory) - .await - .context("check if working directory is empty")? 
- { - anyhow::bail!("working directory is not empty"); - } else { - // ok - } - } - Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), - } - - let pgdata_dir = working_directory.join("pgdata"); - tokio::fs::create_dir(&pgdata_dir) +async fn decode_connstring( + kms_client: &aws_sdk_kms::Client, + key_id: &String, + connstring_ciphertext_base64: Vec, +) -> Result { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + connstring_ciphertext_base64, + )) + .send() .await - .context("create pgdata directory")?; + .context("decrypt connection string")?; - let pgbin = pg_bin_dir.join("postgres"); - let pg_version = match get_pg_version(pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; - let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded - postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { - superuser, - locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, - pg_version, - initdb_bin: pg_bin_dir.join("initdb").as_ref(), - library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. - pgdata: &pgdata_dir, - }) - .await - .context("initdb")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext connection string")?; - // If the caller didn't specify CPU / RAM to use for sizing, default to - // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM. - let nproc = args.num_cpus.unwrap_or_else(num_cpus::get); - let memory_mb = args.memory_mb.unwrap_or(256); + String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8") +} - // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for - // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest - // available for misc other stuff that PostgreSQL uses memory for. 
- let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; - let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; +struct PostgresProcess { + pgdata_dir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pgbin: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + postgres_proc: Option, +} - // - // Launch postgres process - // - let mut postgres_proc = tokio::process::Command::new(pgbin) - .arg("-D") - .arg(&pgdata_dir) - .args(["-p", &format!("{pg_port}")]) - .args(["-c", "wal_level=minimal"]) - .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) - .args(["-c", "max_wal_senders=0"]) - .args(["-c", "fsync=off"]) - .args(["-c", "full_page_writes=off"]) - .args(["-c", "synchronous_commit=off"]) - .args([ - "-c", - &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), - ]) - .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) - .args(["-c", &format!("max_worker_processes={nproc}")]) - .args([ - "-c", - &format!( - "effective_io_concurrency={}", - if cfg!(target_os = "macos") { 0 } else { 100 } - ), - ]) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir) - .env( - "ASAN_OPTIONS", - std::env::var("ASAN_OPTIONS").unwrap_or_default(), +impl PostgresProcess { + fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self { + Self { + pgdata_dir, + pgbin: pg_bin_dir.join("postgres"), + pg_bin_dir, + pg_lib_dir, + postgres_proc: None, + } + } + + async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> { + tokio::fs::create_dir(&self.pgdata_dir) + .await + .context("create pgdata directory")?; + + let pg_version = match get_pg_version(self.pgbin.as_ref()) { + PostgresMajorVersion::V14 => 14, + PostgresMajorVersion::V15 => 15, + PostgresMajorVersion::V16 => 16, + PostgresMajorVersion::V17 => 17, + }; + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: initdb_user, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, + pg_version, + initdb_bin: self.pg_bin_dir.join("initdb").as_ref(), + library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. + pgdata: &self.pgdata_dir, + }) + .await + .context("initdb") + } + + async fn start( + &mut self, + initdb_user: &str, + port: u16, + nproc: usize, + memory_mb: usize, + ) -> Result<&tokio::process::Child, anyhow::Error> { + self.prepare(initdb_user).await?; + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. 
+ let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; + + // + // Launch postgres process + // + let mut proc = tokio::process::Command::new(&self.pgbin) + .arg("-D") + .arg(&self.pgdata_dir) + .args(["-p", &format!("{port}")]) + .args(["-c", "wal_level=minimal"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) + .args(["-c", "shared_buffers=10GB"]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .env("LD_LIBRARY_PATH", &self.pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take()) + .instrument(info_span!("postgres")), + ); + + self.postgres_proc = Some(proc); + Ok(self.postgres_proc.as_ref().unwrap()) + } + + async fn shutdown(&mut self) -> Result<(), anyhow::Error> { + let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap(); + info!("shutdown postgres"); + nix::sys::signal::kill( + Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")), + nix::sys::signal::SIGTERM, ) - .env( - "UBSAN_OPTIONS", - std::env::var("UBSAN_OPTIONS").unwrap_or_default(), - ) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .context("spawn postgres")?; - - info!("spawned postgres, waiting for it to become ready"); - tokio::spawn( - child_stdio_to_log::relay_process_output( - postgres_proc.stdout.take(), - postgres_proc.stderr.take(), - ) - .instrument(info_span!("postgres")), - ); + .context("signal postgres to shut down")?; + proc.wait() + .await + .context("wait for postgres to shut down") + .map(|_| ()) + } +} +async fn wait_until_ready(connstring: String, create_dbname: String) { // Create neondb database in the running postgres - let restore_pg_connstring = - format!("host=localhost port={pg_port} user={superuser} dbname=postgres"); - let start_time = std::time::Instant::now(); loop { @@ -289,7 +277,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { std::process::exit(1); } - match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + match tokio_postgres::connect( + &connstring.replace("dbname=neondb", "dbname=postgres"), + tokio_postgres::NoTls, + ) + .await + { Ok((client, connection)) => { // Spawn the connection handling task to maintain the connection tokio::spawn(async move { @@ -298,9 +291,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } }); - match client.simple_query("CREATE DATABASE neondb;").await { + match client + .simple_query(format!("CREATE DATABASE {create_dbname};").as_str()) + .await + { Ok(_) => { - info!("created neondb database"); + info!("created {} 
database", create_dbname); break; } Err(e) => { @@ -324,10 +320,16 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } } +} - let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); - - let dumpdir = working_directory.join("dumpdir"); +async fn run_dump_restore( + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + source_connstring: String, + destination_connstring: String, +) -> Result<(), anyhow::Error> { + let dumpdir = workdir.join("dumpdir"); let common_args = [ // schema mapping (prob suffices to specify them on one side) @@ -356,7 +358,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg("--no-sync") // POSITIONAL args // source db (db name included in connection string) - .arg(&source_connection_string) + .arg(&source_connstring) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) @@ -376,19 +378,18 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_dump.wait().await.context("wait for pg_dump")?; info!(status=?st, "pg_dump exited"); if !st.success() { - warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + error!(status=%st, "pg_dump failed, restore will likely fail as well"); + bail!("pg_dump failed"); } } - // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: maybe do it in a streaming way, plenty of internal research done on this already // TODO: do the unlogged table trick - - info!("restore from working directory into vanilla postgres"); { let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) .args(&common_args) .arg("-d") - .arg(&restore_pg_connstring) + .arg(&destination_connstring) // POSITIONAL args .arg(&dumpdir) // how we run it @@ -411,33 +412,82 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_restore.wait().await.context("wait for pg_restore")?; info!(status=?st, "pg_restore exited"); if !st.success() { - warn!(status=%st, "pg_restore failed, restore will likely fail as well"); + error!(status=%st, "pg_restore failed, restore will likely fail as well"); + bail!("pg_restore failed"); } } + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +async fn cmd_pgdata( + kms_client: Option, + maybe_s3_prefix: Option, + maybe_spec: Option, + source_connection_string: Option, + interactive: bool, + pg_port: u16, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + num_cpus: Option, + memory_mb: Option, +) -> Result<(), anyhow::Error> { + if maybe_spec.is_none() && source_connection_string.is_none() { + bail!("spec must be provided for pgdata command"); + } + if maybe_spec.is_some() && source_connection_string.is_some() { + bail!("only one of spec or source_connection_string can be provided"); + } + + let source_connection_string = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await? 
+ } + } + } else { + source_connection_string.unwrap() + }; + + let superuser = "cloud_admin"; + let destination_connstring = format!( + "host=localhost port={} user={} dbname=neondb", + pg_port, superuser + ); + + let pgdata_dir = workdir.join("pgdata"); + let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); + let nproc = num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = memory_mb.unwrap_or(256); + proc.start(superuser, pg_port, nproc, memory_mb).await?; + wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await; + + run_dump_restore( + workdir.clone(), + pg_bin_dir, + pg_lib_dir, + source_connection_string, + destination_connstring, + ) + .await?; + // If interactive mode, wait for Ctrl+C - if args.interactive { + if interactive { info!("Running in interactive mode. Press Ctrl+C to shut down."); tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; } - info!("shutdown postgres"); - { - nix::sys::signal::kill( - Pid::from_raw( - i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), - ), - nix::sys::signal::SIGTERM, - ) - .context("signal postgres to shut down")?; - postgres_proc - .wait() - .await - .context("wait for postgres to shut down")?; - } + proc.shutdown().await?; // Only sync if s3_prefix was specified - if let Some(s3_prefix) = args.s3_prefix { + if let Some(s3_prefix) = maybe_s3_prefix { info!("upload pgdata"); aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await @@ -445,7 +495,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { info!("write status"); { - let status_dir = working_directory.join("status"); + let status_dir = workdir.join("status"); std::fs::create_dir(&status_dir).context("create status directory")?; let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) @@ -458,3 +508,153 @@ pub(crate) async fn main() -> anyhow::Result<()> { Ok(()) } + +async fn cmd_dumprestore( + kms_client: Option, + maybe_spec: Option, + source_connection_string: Option, + destination_connection_string: Option, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, +) -> Result<(), anyhow::Error> { + let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let source = decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await?; + + let dest = if let Some(dest_ciphertext) = + spec.destination_connstring_ciphertext_base64 + { + decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) + .await? 
+ } else { + bail!("destination connection string must be provided in spec for dump_restore command"); + }; + + (source, dest) + } + } + } else { + ( + source_connection_string.unwrap(), + if let Some(val) = destination_connection_string { + val + } else { + bail!("destination connection string must be provided for dump_restore command"); + }, + ) + }; + + run_dump_restore( + workdir, + pg_bin_dir, + pg_lib_dir, + source_connstring, + destination_connstring, + ) + .await +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Json, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let args = Args::parse(); + + // Initialize AWS clients only if s3_prefix is specified + let (aws_config, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms = aws_sdk_kms::Client::new(&config); + (Some(config), Some(kms)) + } else { + (None, None) + }; + + let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + } else { + None + }; + + match tokio::fs::create_dir(&args.working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&args.working_directory) + .await + .context("check if working directory is empty")? 
+ { + bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + match args.command { + Command::Pgdata { + source_connection_string, + interactive, + pg_port, + num_cpus, + memory_mb, + } => { + cmd_pgdata( + kms_client, + args.s3_prefix, + spec, + source_connection_string, + interactive, + pg_port, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + num_cpus, + memory_mb, + ) + .await?; + } + Command::DumpRestore { + source_connection_string, + destination_connection_string, + } => { + cmd_dumprestore( + kms_client, + spec, + source_connection_string, + destination_connection_string, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + ) + .await?; + } + } + + Ok(()) +} diff --git a/poetry.lock b/poetry.lock index fd200159b9..e2c71ca012 100644 --- a/poetry.lock +++ b/poetry.lock @@ -412,6 +412,7 @@ files = [ [package.dependencies] botocore-stubs = "*" +mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""} mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""} types-s3transfer = "*" typing-extensions = ">=4.1.0" @@ -2022,6 +2023,18 @@ install-types = ["pip"] mypyc = ["setuptools (>=50)"] reports = ["lxml"] +[[package]] +name = "mypy-boto3-kms" +version = "1.26.147" +description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mypy-boto3-kms-1.26.147.tar.gz", hash = "sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"}, + {file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"}, +] + [[package]] name = "mypy-boto3-s3" version = "1.26.0.post1" @@ -3807,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" +content-hash = "03697c0a4d438ef088b0d397b8f0570aa3998ccf833fe612400824792498878b" diff --git a/pyproject.toml b/pyproject.toml index e299c421e9..51cd68e002 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" -boto3-stubs = {extras = ["s3"], version = "^1.26.16"} +boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py index 33248132ab..d674be99de 100644 --- a/test_runner/fixtures/fast_import.py +++ b/test_runner/fixtures/fast_import.py @@ -4,8 +4,10 @@ import subprocess import tempfile from collections.abc import Iterator from pathlib import Path +from typing import cast import pytest +from _pytest.config import Config from fixtures.log_helper import log from fixtures.neon_cli import AbstractNeonCli @@ -23,6 +25,7 @@ class FastImport(AbstractNeonCli): pg_distrib_dir: Path, pg_version: PgVersion, workdir: Path, + cleanup: bool = True, ): if extra_env is None: env_vars = {} @@ -47,12 +50,43 @@ class FastImport(AbstractNeonCli): if not workdir.exists(): raise Exception(f"Working directory '{workdir}' does not exist") self.workdir = workdir + self.cleanup = cleanup + + def run_pgdata( + self, + s3prefix: str | None = None, + pg_port: int | 
None = None, + source_connection_string: str | None = None, + interactive: bool = False, + ): + return self.run( + "pgdata", + s3prefix=s3prefix, + pg_port=pg_port, + source_connection_string=source_connection_string, + interactive=interactive, + ) + + def run_dump_restore( + self, + s3prefix: str | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, + ): + return self.run( + "dump-restore", + s3prefix=s3prefix, + source_connection_string=source_connection_string, + destination_connection_string=destination_connection_string, + ) def run( self, - pg_port: int, - source_connection_string: str | None = None, + command: str, s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, interactive: bool = False, ) -> subprocess.CompletedProcess[str]: if self.cmd is not None: @@ -60,13 +94,17 @@ class FastImport(AbstractNeonCli): args = [ f"--pg-bin-dir={self.pg_bin}", f"--pg-lib-dir={self.pg_lib}", - f"--pg-port={pg_port}", f"--working-directory={self.workdir}", ] - if source_connection_string is not None: - args.append(f"--source-connection-string={source_connection_string}") if s3prefix is not None: args.append(f"--s3-prefix={s3prefix}") + args.append(command) + if pg_port is not None: + args.append(f"--pg-port={pg_port}") + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if destination_connection_string is not None: + args.append(f"--destination-connection-string={destination_connection_string}") if interactive: args.append("--interactive") @@ -77,7 +115,7 @@ class FastImport(AbstractNeonCli): return self def __exit__(self, *args): - if self.workdir.exists(): + if self.workdir.exists() and self.cleanup: shutil.rmtree(self.workdir) @@ -87,9 +125,17 @@ def fast_import( test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, + pytestconfig: Config, ) -> Iterator[FastImport]: - workdir = Path(tempfile.mkdtemp()) - with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi: + workdir = Path(tempfile.mkdtemp(dir=test_output_dir, prefix="fast_import_")) + with FastImport( + None, + neon_binpath, + pg_distrib_dir, + pg_version, + workdir, + cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")), + ) as fi: yield fi if fi.cmd is None: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 469bc8a1e5..73607db7d8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -27,6 +27,7 @@ from urllib.parse import quote, urlparse import asyncpg import backoff +import boto3 import httpx import psycopg2 import psycopg2.sql @@ -37,6 +38,8 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from jwcrypto import jwk +from mypy_boto3_kms import KMSClient +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -199,6 +202,30 @@ def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: mock_s3_server.kill() +@pytest.fixture(scope="session") +def mock_kms(mock_s3_server: MockS3Server) -> Iterator[KMSClient]: + yield boto3.client( + "kms", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + 
aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + +@pytest.fixture(scope="session") +def mock_s3_client(mock_s3_server: MockS3Server) -> Iterator[S3Client]: + yield boto3.client( + "s3", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + class PgProtocol: """Reusable connection logic""" diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index ea86eb62eb..71e0d16edd 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -1,7 +1,9 @@ +import base64 import json import re import time from enum import Enum +from pathlib import Path import psycopg2 import psycopg2.errors @@ -14,8 +16,12 @@ from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import MockS3Server, RemoteStorageKind +from mypy_boto3_kms import KMSClient +from mypy_boto3_kms.type_defs import EncryptResponseTypeDef +from mypy_boto3_s3 import S3Client from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -103,13 +109,15 @@ def test_pgdata_import_smoke( while True: relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") log.info( - f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192}) pages" ) if relblock_size >= target_relblock_size: break addrows = int((target_relblock_size - relblock_size) // 8192) assert addrows >= 1, "forward progress" - vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + vanilla_pg.safe_psql( + f"insert into t select generate_series({nrows + 1}, {nrows + addrows})" + ) nrows += addrows expect_nrows = nrows expect_sum = ( @@ -332,6 +340,224 @@ def test_pgdata_import_smoke( br_initdb_endpoint.safe_psql("select * from othertable") +def test_fast_import_with_pageserver_ingest( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, + neon_env_builder: NeonEnvBuilder, + make_httpserver: HTTPServer, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Setup pageserver and fake cplane for import progress + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + 
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api", + # because import_pgdata code uses this endpoint, not the one in common remote storage config + # TODO: maybe use common remote_storage config in pageserver? + "import_pgdata_aws_endpoint_url": env.s3_mock_server.endpoint(), + } + ) + env.pageserver.stop() + env.pageserver.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "project_id": "someproject", + "branch_id": "somebranch", + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Create timeline with import_pgdata + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": { + "AwsS3": { + "region": env.s3_mock_server.region(), + "bucket": bucket, + "key": key_prefix, + } + }, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + pg_port = port_distributor.get_port() + fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") + vanilla_pg.stop() + + def validate_vanilla_equivalence(ep): + res = ep.safe_psql("SELECT count(*), sum(a) FROM foo;", dbname="neondb") + assert res[0] == (10, 55), f"got result: {res}" + + # Sanity check that data in pgdata is expected: + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + validate_vanilla_equivalence(conn) + + # Poll pageserver statuses in s3 + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = 
env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + log.info(f"timeline {tenant_id}/{timeline_id} detail: {detail}") + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + log.info("import is in progress") + continue + else: + raise + + if state == "Active": + key = f"{key_prefix}/status/shard-{shard_id.shard_index}" + shard_status_file_contents = ( + mock_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8") + ) + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(0.5) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + ep = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id) + + # check that data is there + validate_vanilla_equivalence(ep) + + # check that we can do basic ops + + ep.safe_psql("create table othertable(values text)", dbname="neondb") + rw_lsn = Lsn(ep.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + ep.stop() + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_tip_endpoint.stop() + + # ... 
at the initdb lsn + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero["shard_id"], timeline_id + ) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_initdb_endpoint.stop() + + env.pageserver.stop(immediate=True) + + def test_fast_import_binary( test_output_dir, vanilla_pg: VanillaPostgres, @@ -342,7 +568,7 @@ def test_fast_import_binary( vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") pg_port = port_distributor.get_port() - fast_import.run(pg_port, vanilla_pg.connstr()) + fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) vanilla_pg.stop() pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) @@ -358,6 +584,118 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_restore_to_connstring( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # create another database & role and try to restore there + destination_vanilla_pg.safe_psql(""" + CREATE ROLE testrole WITH + LOGIN + PASSWORD 'testpassword' + NOSUPERUSER + NOCREATEDB + NOCREATEROLE; + """) + destination_vanilla_pg.safe_psql("CREATE DATABASE testdb OWNER testrole;") + + destination_connstring = destination_vanilla_pg.connstr( + dbname="testdb", user="testrole", password="testpassword" + ) + fast_import.run_dump_restore( + source_connection_string=vanilla_pg.connstr(), + destination_connection_string=destination_connstring, + ) + vanilla_pg.stop() + conn = PgProtocol(dsn=destination_connstring) + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +def test_fast_import_restore_to_connstring_from_s3_spec( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source 
postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Start target postgres + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + destination_connstring_encrypted = encrypt(destination_vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "destination_connstring_ciphertext_base64": base64.b64encode( + destination_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + } + + mock_s3_client.create_bucket(Bucket="test-bucket") + mock_s3_client.put_object( + Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec) + ) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix") + vanilla_pg.stop() + + res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + # TODO: Maybe test with pageserver? # 1. run whole neon env # 2. create timeline with some s3 path??? From b992a1a62a2d4029de0a8b0cd343b1909d8bb311 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 17:20:07 +0100 Subject: [PATCH 09/78] page_service: include socket send & recv queue length in slow flush log mesage (#10823) # Summary In - https://github.com/neondatabase/neon/pull/10813 we added slow flush logging but it didn't log the TCP send & recv queue length. This PR adds that data to the log message. I believe the implementation to be safe & correct right now, but it's brittle and thus this PR should be reverted or improved upon once the investigation is over. Refs: - stacked atop https://github.com/neondatabase/neon/pull/10813 - context: https://neondb.slack.com/archives/C08DE6Q9C3B/p1739464533762049?thread_ts=1739462628.361019&cid=C08DE6Q9C3B - improves https://github.com/neondatabase/neon/issues/10668 - part of https://github.com/neondatabase/cloud/issues/23515 # How It Works The trouble is two-fold: 1. getting to the raw socket file descriptor through the many Rust types that wrap it and 2. integrating with the `measure()` function Rust wraps it in types to model file descriptor lifetimes and ownership, and usually one can get access using `as_raw_fd()`. However, we `split()` the stream and the resulting [`tokio::io::WriteHalf`](https://docs.rs/tokio/latest/tokio/io/struct.WriteHalf.html) . Check the PR commit history for my attempts to do it. 
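
To make the first obstacle concrete, here is a minimal standalone sketch (not the pageserver code itself, and using only plain tokio types): the raw fd has to be captured while the concrete `TcpStream` is still in hand, because the `WriteHalf` returned by `split()` does not implement `AsRawFd`. The `demo` function name is illustrative.

```rust
use std::os::fd::{AsRawFd, RawFd};
use tokio::net::TcpStream;

async fn demo(socket: TcpStream) -> std::io::Result<()> {
    // Grab the raw fd while we still hold the concrete TcpStream ...
    let fd: RawFd = socket.as_raw_fd();

    // ... because after split() the WriteHalf no longer exposes it.
    let (_read_half, mut write_half) = tokio::io::split(socket);

    // The fd stays valid for as long as the halves (and thus the TcpStream) are alive,
    // so it can be carried alongside the write half and handed to ioctl-based helpers
    // whenever a flush is measured.
    tokio::io::AsyncWriteExt::flush(&mut write_half).await?;
    let _ = fd;
    Ok(())
}
```
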
My solution is to get the socket fd before we wrap it in our protocol types, and to store that fd in the new `PostgresBackend::socket_fd` field. I believe it's safe because the lifetime of `PostgresBackend::socket_fd` value == the lifetime of the `TcpStream` that wrap and store in `PostgresBackend::framed`. Specifically, the only place that close()s the socket is the `impl Drop for TcpStream`. I think the protocol stack calls `TcpStream::shutdown()`, but, that doesn't `close()` the file descriptor underneath. Regarding integration with the `measure()` function, the trouble is that `flush_fut` is currently a generic `Future` type. So, we just pass in the `socket_fd` as a separate argument. A clean implementation would convert the `pgb_writer.flush()` to a named future that provides an accessor for the socket fd while not being polled. I tried (see PR history), but failed to break through the `WriteHalf`. # Testing Tested locally by running ``` ./target/debug/pagebench get-page-latest-lsn --num-clients=1000 --queue-depth=1000 ``` in one terminal, waiting a bit, then ``` pkill -STOP pagebench ``` then wait for slow logs to show up in `pageserver.log`. Pick one of the slow log message's port pairs, e.g., `127.0.0.1:39500`, and then checking sockstat output ``` ss -ntp | grep '127.0.0.1:39500' ``` to ensure that send & recv queue size match those in the log message. --- libs/postgres_backend/src/lib.rs | 7 ++++++ libs/utils/Cargo.toml | 2 +- libs/utils/src/lib.rs | 3 +++ libs/utils/src/linux_socket_ioctl.rs | 35 ++++++++++++++++++++++++++++ pageserver/src/metrics.rs | 27 +++++++++++++++++++-- pageserver/src/page_service.rs | 14 +++++++---- safekeeper/src/wal_service.rs | 5 +++- 7 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 libs/utils/src/linux_socket_ioctl.rs diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 8c024375c1..f74b229ac4 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -9,6 +9,8 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; +use std::os::fd::AsRawFd; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Poll}; @@ -268,6 +270,7 @@ impl MaybeWriteOnly { } pub struct PostgresBackend { + pub socket_fd: RawFd, framed: MaybeWriteOnly, pub state: ProtoState, @@ -293,9 +296,11 @@ impl PostgresBackend { tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; + let socket_fd = socket.as_raw_fd(); let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, @@ -307,6 +312,7 @@ impl PostgresBackend { impl PostgresBackend { pub fn new_from_io( + socket_fd: RawFd, socket: IO, peer_addr: SocketAddr, auth_type: AuthType, @@ -315,6 +321,7 @@ impl PostgresBackend { let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 0f10300959..e9611a0f12 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -28,7 +28,7 @@ inferno.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix.workspace = true +nix = {workspace = true, features = [ "ioctl" ] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true diff --git 
a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 820ff2d5ea..9389a27bf3 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -93,6 +93,9 @@ pub mod try_rcu; pub mod guard_arc_swap; +#[cfg(target_os = "linux")] +pub mod linux_socket_ioctl; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs new file mode 100644 index 0000000000..5ae0e86af8 --- /dev/null +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -0,0 +1,35 @@ +//! Linux-specific socket ioctls. +//! +//! + +use std::{ + io, + mem::MaybeUninit, + os::{fd::RawFd, raw::c_int}, +}; + +use nix::libc::{FIONREAD, TIOCOUTQ}; + +unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result { + let mut inq: MaybeUninit = MaybeUninit::uninit(); + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn inq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, FIONREAD) +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn outq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, TIOCOUTQ) +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6a5dc3e749..0ffd4e851a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::num::NonZeroUsize; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -1439,7 +1440,13 @@ impl Drop for SmgrOpTimer { } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(self, started_at: Instant, mut fut: Fut) -> O + /// The caller must guarantee that `socket_fd`` outlives this function. + pub(crate) async fn measure( + self, + started_at: Instant, + mut fut: Fut, + socket_fd: RawFd, + ) -> O where Fut: std::future::Future, { @@ -1470,8 +1477,24 @@ impl SmgrOpFlushInProgress { } else { "slow flush completed or cancelled" }; + + let (inq, outq) = { + // SAFETY: caller guarantees that `socket_fd` outlives this function. + #[cfg(target_os = "linux")] + unsafe { + ( + utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2), + utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2), + ) + } + #[cfg(not(target_os = "linux"))] + { + (-1, -1) + } + }; + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); - tracing::info!(elapsed_total_secs, msg); + tracing::info!(elapsed_total_secs, inq, outq, msg); } }, |mut observe| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bc0ed4198b..e9d87dec71 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -73,6 +73,7 @@ use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. 
@@ -257,6 +258,8 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let socket_fd = socket.as_raw_fd(); + let peer_addr = socket.peer_addr().context("get peer address")?; tracing::Span::current().record("peer_addr", field::display(peer_addr)); @@ -305,7 +308,7 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -1286,12 +1289,15 @@ impl PageServerHandler { ))?; // what we want to do + let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { - Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut)) - } + Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( + Instant::now(), + flush_fut, + socket_fd, + )), None => futures::future::Either::Right(flush_fut), }; // do it while respecting cancellation diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ebcb060e7..e5ccbb3230 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -13,6 +13,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; +use std::os::fd::AsRawFd; + use crate::metrics::TrafficMetrics; use crate::SafeKeeperConf; use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; @@ -62,6 +64,7 @@ async fn handle_socket( global_timelines: Arc, ) -> Result<(), QueryError> { socket.set_nodelay(true)?; + let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr()?; // Set timeout on reading from the socket. It prevents hanged up connection @@ -107,7 +110,7 @@ async fn handle_socket( auth_pair, global_timelines, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend From 9177312ba6bd1b8ba85e77d4490517a7f4c01ec5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 18:57:18 +0100 Subject: [PATCH 10/78] basebackup: use `Timeline::get` for `get_rel` instead of `get_rel_page_at_lsn` (#10476) I noticed the opportunity to simplify here while working on https://github.com/neondatabase/neon/pull/9353 . The only difference is the zero-fill behavior: if one reads past rel size, `get_rel_page_at_lsn` returns a zeroed page whereas `Timeline::get` returns an error. However, the `endblk` is at most rel size large, because `nblocks` is eq `get_rel_size`, see a few lines above this change. We're using the same LSN (`self.lsn`) for everything, so there is no chance of non-determinism. 
Refs: - Slack discussion debating correctness: https://neondb.slack.com/archives/C033RQ5SPDH/p1737457010607119 --- pageserver/src/basebackup.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 25078b57c8..e03b1bbe96 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::Key; +use pageserver_api::key::{rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; @@ -501,13 +501,9 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn( - src, - blknum, - Version::Lsn(self.lsn), - self.ctx, - self.io_concurrency.clone(), - ) + // TODO: investigate using get_vectored for the entire startblk..endblk range. + // But this code path is not on the critical path for most basebackups (?). + .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); From a32e8871acc1922f8bfd8057c08a97e504b1dacc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 21:11:42 +0100 Subject: [PATCH 11/78] compute/pageserver: correlation of logs through backend PID (via `application_name`) (#10810) This PR makes compute set the `application_name` field to the PG backend process PID which is also included in each compute log line. This allows correlation of Pageserver connection logs with compute logs in a way that was guesswork before this PR. In future, we can switch for a more unique identifier for a page_service session. Refs - discussion in https://neondb.slack.com/archives/C08DE6Q9C3B/p1739465208296169?thread_ts=1739462628.361019&cid=C08DE6Q9C3B - fixes https://github.com/neondatabase/neon/issues/10808 --- pageserver/src/page_service.rs | 11 +++++++++-- pgxn/neon/libpagestore.c | 31 ++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e9d87dec71..53a6a7124d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -237,7 +237,7 @@ pub async fn libpq_listener_main( type ConnectionHandlerResult = anyhow::Result<()>; -#[instrument(skip_all, fields(peer_addr))] +#[instrument(skip_all, fields(peer_addr, application_name))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( conf: &'static PageServerConf, @@ -2463,9 +2463,16 @@ where fn startup( &mut self, _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, + sm: &FeStartupPacket, ) -> Result<(), QueryError> { fail::fail_point!("ps::connection-start::startup-packet"); + + if let FeStartupPacket::StartupMessage { params, .. 
} = sm { + if let Some(app_name) = params.get("application_name") { + Span::current().record("application_name", field::display(app_name)); + } + }; + Ok(()) } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 22aeb2e2d6..fc1aecd340 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -378,8 +378,9 @@ pageserver_connect(shardno_t shard_no, int elevel) { case PS_Disconnected: { - const char *keywords[3]; - const char *values[3]; + const char *keywords[4]; + const char *values[4]; + char pid_str[16]; int n_pgsql_params; TimestampTz now; int64 us_since_last_attempt; @@ -424,14 +425,30 @@ pageserver_connect(shardno_t shard_no, int elevel) * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ - keywords[0] = "dbname"; - values[0] = connstr; - n_pgsql_params = 1; + n_pgsql_params = 0; + + /* + * Pageserver logs include this in the connection's tracing span. + * This allows for reasier log correlation between compute and pageserver. + */ + keywords[n_pgsql_params] = "application_name"; + { + int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid); + if (ret < 0 || ret >= (int)(sizeof(pid_str))) + elog(FATAL, "stack-allocated buffer too small to hold pid"); + } + /* lifetime: PQconnectStartParams strdups internally */ + values[n_pgsql_params] = (const char*) pid_str; + n_pgsql_params++; + + keywords[n_pgsql_params] = "dbname"; + values[n_pgsql_params] = connstr; + n_pgsql_params++; if (neon_auth_token) { - keywords[1] = "password"; - values[1] = neon_auth_token; + keywords[n_pgsql_params] = "password"; + values[n_pgsql_params] = neon_auth_token; n_pgsql_params++; } From ae091c6913066ad6f5ad9ef5a3115fe2ff7d7597 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 14 Feb 2025 15:31:54 -0500 Subject: [PATCH 12/78] feat(pageserver): store reldir in sparse keyspace (#10593) ## Problem Part of https://github.com/neondatabase/neon/issues/9516 ## Summary of changes This patch adds the support for storing reldir in the sparse keyspace. All logic are guarded with the `rel_size_v2_enabled` flag, so if it's set to false, the code path is exactly the same as what's currently in prod. Note that we did not persist the `rel_size_v2_enabled` flag and the logic around it will be implemented in the next patch. (i.e., what if we enabled it, restart the pageserver, and then it gets set to false? we should still read from v2 using the rel_size_v2_migration_status in the index_part). The persistence logic I'll implement in the next patch will disallow switching from v2->v1 via config item. I also refactored the metrics so that it can work with the new reldir store. However, this metric is not correctly computed for reldirs (see the comments) before. With the refactor, the value will be computed only when we have an initial value for the reldir size. The refactor keeps the incorrectness of the computation when there are more than 1 database. For the tests, we currently run all the tests with v2, and I'll set it to false and add some v2-specific tests before merging, probably also v1->v2 migration tests. 
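
As a reviewer aid, a rough hand-written sketch of the v2 representation (not code from this patch, but built only on the helpers the patch adds to `pageserver_api::key`): each relation's existence becomes one sparse key whose value is a one-byte marker, with the empty tombstone marking removal, so enumerating a database's relations turns into a range scan instead of decoding a single serialized `RelDirectory` blob.

```rust
use pageserver_api::key::{rel_tag_sparse_key, rel_tag_sparse_key_range, RelDirExists};

// Illustrative only; spcnode/dbnode/relnode are Oids (u32), forknum is the relation fork.
fn rel_exists_sketch(spcnode: u32, dbnode: u32, relnode: u32, forknum: u8) {
    // Creating a relation writes a tiny marker value under a per-relation sparse key ...
    let key = rel_tag_sparse_key(spcnode, dbnode, relnode, forknum);
    let marker = RelDirExists::Exists.encode(); // b"r"; dropping writes the empty tombstone instead

    // ... and "which relations exist in this database?" becomes a sparse range scan
    // over all such keys, rather than decoding one ever-growing RelDirectory value.
    let scan_range = rel_tag_sparse_key_range(spcnode, dbnode);

    // A real caller would pass these to the timeline's modification / scan APIs.
    let _ = (key, marker, scan_range);
}
```
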
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 4 +- libs/pageserver_api/src/key.rs | 112 ++++++++- pageserver/src/pgdatadir_mapping.rs | 322 +++++++++++++++++++++----- pageserver/src/tenant.rs | 9 +- pageserver/src/tenant/config.rs | 4 +- pageserver/src/tenant/timeline.rs | 54 ++++- test_runner/regress/test_relations.py | 68 ++++++ test_runner/regress/test_tenants.py | 3 +- 8 files changed, 507 insertions(+), 69 deletions(-) create mode 100644 test_runner/regress/test_relations.py diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 79f068a47b..e64052c73d 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -351,7 +351,7 @@ pub struct TenantConfigToml { /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. - pub rel_size_v2_enabled: Option, + pub rel_size_v2_enabled: bool, // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. @@ -633,7 +633,7 @@ impl Default for TenantConfigToml { lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: true, wal_receiver_protocol_override: None, - rel_size_v2_enabled: None, + rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index dbd45da314..b88a2e46a1 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,10 +1,12 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::Oid; use postgres_ffi::RepOriginId; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; +use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62; /// The key prefix of ReplOrigin keys. pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; +/// The key prefix of db directory keys. +pub const DB_DIR_KEY_PREFIX: u8 = 0x64; + +/// The key prefix of rel directory keys. +pub const REL_DIR_KEY_PREFIX: u8 = 0x65; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub enum RelDirExists { + Exists, + Removed, +} + +#[derive(Debug)] +pub struct DecodeError; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid marker") + } +} + +impl std::error::Error for DecodeError {} + +impl RelDirExists { + /// The value of the rel directory keys that indicates the existence of a relation. 
+ const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r"); + + pub fn encode(&self) -> Bytes { + match self { + Self::Exists => Self::REL_EXISTS_MARKER.clone(), + Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(), + } + } + + pub fn decode_option(data: Option>) -> Result { + match data { + Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists), + // Any other marker is invalid + Some(_) => Err(DecodeError), + None => Ok(Self::Removed), + } + } + + pub fn decode(data: impl AsRef<[u8]>) -> Result { + let data = data.as_ref(); + if data == Self::REL_EXISTS_MARKER { + Ok(Self::Exists) + } else if data == SPARSE_TOMBSTONE_MARKER { + Ok(Self::Removed) + } else { + Err(DecodeError) + } + } +} + +/// A tombstone in the sparse keyspace, which is an empty buffer. +pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b""); + /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -110,6 +170,24 @@ impl Key { } } + pub fn rel_dir_sparse_key_range() -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// This function checks more extensively what keys we can take on the write path. /// If a key beginning with 00 does not have a global/default tablespace OID, it /// will be rejected on the write path. @@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } +#[inline(always)] +pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: relnode, + field5: forknum, + field6: 1, + } +} + +pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } // it's fine to exclude the last key b/c we only use field6 == 1 +} + #[inline(always)] pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { @@ -734,9 +842,9 @@ impl Key { self.field1 == RELATION_SIZE_PREFIX } - pub fn sparse_non_inherited_keyspace() -> Range { + pub const fn sparse_non_inherited_keyspace() -> Range { // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace - debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX); Key { field1: AUX_KEY_PREFIX, field2: 0, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f2dca8befa..ae2762bd1e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,13 +23,14 @@ use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; -use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, - slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, 
DBDIR_KEY, TWOPHASEDIR_KEY, + rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, + slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, + twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, + CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::key::{rel_tag_sparse_key, Key}; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; @@ -490,12 +491,33 @@ impl Timeline { if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } - // fetch directory listing + + // Read path: first read the new reldir keyspace. Early return if the relation exists. + // Otherwise, read the old reldir keyspace. + // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. + + if self.get_rel_size_v2_enabled() { + // fetch directory listing (new) + let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); + let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + let exists_v2 = buf == RelDirExists::Exists; + // Fast path: if the relation exists in the new format, return true. + // TODO: we should have a verification mode that checks both keyspaces + // to ensure the relation only exists in one of them. + if exists_v2 { + return Ok(true); + } + } + + // fetch directory listing (old) + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) + let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum)); + Ok(exists_v1) } /// Get a list of all existing relations in given tablespace and database. @@ -513,12 +535,12 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { - // fetch directory listing + // fetch directory listing (old) let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - let rels: HashSet = + let rels_v1: HashSet = HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { spcnode, dbnode, @@ -526,6 +548,46 @@ impl Timeline { forknum: *forknum, })); + if !self.get_rel_size_v2_enabled() { + return Ok(rels_v1); + } + + // scan directory listing (new), merge with the old results + let key_range = rel_tag_sparse_key_range(spcnode, dbnode); + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + let results = self + .scan( + KeySpace::single(key_range), + version.get_lsn(), + ctx, + io_concurrency, + ) + .await?; + let mut rels = rels_v1; + for (key, val) in results { + let val = RelDirExists::decode(&val?) 
+ .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + assert_eq!(key.field6, 1); + assert_eq!(key.field2, spcnode); + assert_eq!(key.field3, dbnode); + let tag = RelTag { + spcnode, + dbnode, + relnode: key.field4, + forknum: key.field5, + }; + if val == RelDirExists::Removed { + debug_assert!(!rels.contains(&tag), "removed reltag in v2"); + continue; + } + let did_not_contain = rels.insert(tag); + debug_assert!(did_not_contain, "duplicate reltag in v2"); + } Ok(rels) } @@ -1144,7 +1206,11 @@ impl Timeline { let dense_keyspace = result.to_keyspace(); let sparse_keyspace = SparseKeySpace(KeySpace { - ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + ranges: vec![ + Key::metadata_aux_key_range(), + repl_origin_key_range(), + Key::rel_dir_sparse_key_range(), + ], }); if cfg!(debug_assertions) { @@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> { /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. - pending_directory_entries: Vec<(DirectoryKind, usize)>, + pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>, /// An **approximation** of how many metadata bytes will be written to the EphemeralFile. pending_metadata_bytes: usize, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MetricsUpdate { + /// Set the metrics to this value + Set(u64), + /// Increment the metrics by this value + Add(u64), + /// Decrement the metrics by this value + Sub(u64), +} + impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we @@ -1359,7 +1435,8 @@ impl DatadirModification<'_> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Db, 0)); + self.pending_directory_entries + .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); let buf = if self.tline.pg_version >= 17 { @@ -1372,7 +1449,7 @@ impl DatadirModification<'_> { }) }?; self.pending_directory_entries - .push((DirectoryKind::TwoPhase, 0)); + .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0))); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); @@ -1382,17 +1459,23 @@ impl DatadirModification<'_> { // harmless but they'd just be dropped on later compaction. 
if self.tline.tenant_shard_id.is_shard_zero() { self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), + MetricsUpdate::Set(0), + )); } Ok(()) @@ -1658,10 +1741,16 @@ impl DatadirModification<'_> { } if r.is_none() { // Create RelDirectory + // TODO: if we have fully migrated to v2, no need to create this directory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Rel, 0)); + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + if self.tline.get_rel_size_v2_enabled() { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + } self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1685,8 +1774,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid = xid as u32; @@ -1694,8 +1785,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -1744,8 +1837,10 @@ impl DatadirModification<'_> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; - self.pending_directory_entries - .push((DirectoryKind::Db, dir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1778,39 +1873,85 @@ impl DatadirModification<'_> { // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; - let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = + + let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. 
Update dbdir e.insert(false); let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dbdir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); - - // and create the RelDirectory - RelDirectory::default() + false } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? + true }; + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir = if !dbdir_exists { + // Create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; + // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } - self.pending_directory_entries - .push((DirectoryKind::Rel, rel_dir.rels.len())); - - self.put( - rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), - ); - + if self.tline.get_rel_size_v2_enabled() { + let sparse_rel_dir_key = + rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); + // check if the rel_dir_key exists in v2 + let val = self + .sparse_get(sparse_rel_dir_key, ctx) + .await + .map_err(|e| RelationError::Other(e.into()))?; + let val = RelDirExists::decode_option(val) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + return Err(RelationError::AlreadyExists); + } + self.put( + sparse_rel_dir_key, + Value::Image(RelDirExists::Exists.encode()), + ); + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation. + // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there + // will be key not found errors if we don't create an empty one for rel_size_v2. + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&RelDirectory::default()).context("serialize")?, + )), + ); + } + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); + } else { + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) + } + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&rel_dir).context("serialize")?, + )), + ); + } // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -1896,9 +2037,34 @@ impl DatadirModification<'_> { let mut dirty = false; for rel_tag in rel_tags { - if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; + true + } else if self.tline.get_rel_size_v2_enabled() { + // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. 
+ // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion + // logic). + let key = + rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); + let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); + // put tombstone + self.put(key, Value::Image(RelDirExists::Removed.encode())); + // no need to set dirty to true + true + } else { + false + } + } else { + false + }; + if found { // update logical size let size_key = rel_size_to_key(rel_tag); let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1914,8 +2080,6 @@ impl DatadirModification<'_> { if dirty { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); - self.pending_directory_entries - .push((DirectoryKind::Rel, dir.rels.len())); } } @@ -1939,8 +2103,10 @@ impl DatadirModification<'_> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1987,8 +2153,10 @@ impl DatadirModification<'_> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -2020,8 +2188,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid: u32 = u32::try_from(xid)?; @@ -2030,8 +2200,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -2147,7 +2319,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } Ok(()) @@ -2233,7 +2405,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } self.pending_metadata_bytes = 0; @@ -2297,6 +2469,22 @@ impl DatadirModification<'_> { self.tline.get(key, lsn, ctx).await } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. 
+ async fn sparse_get( + &self, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2379,6 +2567,23 @@ impl Version<'_> { } } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. + async fn sparse_get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(timeline, key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn get_lsn(&self) -> Lsn { match self { Version::Lsn(lsn) => *lsn, @@ -2438,6 +2643,7 @@ pub(crate) enum DirectoryKind { Rel, AuxFiles, SlruSegment(SlruKind), + RelV2, } impl DirectoryKind { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dec585ff65..5a2c5c0c46 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3924,6 +3924,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + pub fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5640,7 +5647,7 @@ pub(crate) mod harness { lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, - rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, + rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled), gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), gc_compaction_initial_threshold_kb: Some( tenant_conf.gc_compaction_initial_threshold_kb, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7fdfd736ad..c6bcfdf2fb 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -485,7 +485,9 @@ impl TenantConfOpt { wal_receiver_protocol_override: self .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), - rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), + rel_size_v2_enabled: self + .rel_size_v2_enabled + .unwrap_or(global_conf.rel_size_v2_enabled), gc_compaction_enabled: self .gc_compaction_enabled .unwrap_or(global_conf.gc_compaction_enabled), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 782b7d88b0..277dce7761 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -117,7 +117,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; use crate::tenant::config::TenantConfOpt; use 
pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -327,6 +327,7 @@ pub struct Timeline { // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM], directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], /// Ensures layers aren't frozen by checkpointer between @@ -2355,6 +2356,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2664,6 +2673,7 @@ impl Timeline { ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), @@ -3430,8 +3440,42 @@ impl Timeline { } } - pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { - self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) { + // TODO: this directory metrics is not correct -- we could have multiple reldirs in the system + // for each of the database, but we only store one value, and therefore each pgdirmodification + // would overwrite the previous value if they modify different databases. + + match count { + MetricsUpdate::Set(count) => { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed); + } + MetricsUpdate::Add(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub + // the value reliably. + self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + MetricsUpdate::Sub(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metrics has been initialized with `MetricsUpdate::Set` before. + // The operation could overflow so we need to normalize the value. + let prev_val = + self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed); + let res = prev_val.saturating_sub(count); + self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + }; + + // TODO: remove this, there's no place in the code that updates this aux metrics. let aux_metric = self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); @@ -3649,7 +3693,9 @@ impl Timeline { // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); - // Do not fire missing key error for sparse keys. + // Do not fire missing key error and end early for sparse keys. 
Note that we hava already removed + // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of + // figuring out what is the inherited key range and do a fine-grained pruning. removed.remove_overlapping_with(&KeySpace { ranges: vec![SPARSE_RANGE], }); diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py new file mode 100644 index 0000000000..3e29c92a96 --- /dev/null +++ b/test_runner/regress/test_relations.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) + + +def test_pageserver_reldir_v2( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "false", + } + ) + + endpoint = env.endpoints.create_start("main") + # Create a relation in v1 + endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + + # Switch to v2 + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": True, + }, + ) + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Restart the endpoint + endpoint.stop() + endpoint.start() + + # Check if both relations are still accessible again after restart + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Create a relation in v2 + endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + # Delete a relation in v1 + endpoint.safe_psql("DROP TABLE foo1") + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + # Restart the endpoint + endpoint.stop() + # This will acquire a basebackup, which lists all relations. + endpoint.start() + + # Check if both relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + endpoint.safe_psql("DROP TABLE foo3") + endpoint.stop() + endpoint.start() + + # Check if relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("DROP TABLE IF EXISTS foo3") diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index b4c968b217..afe444f227 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -481,7 +481,8 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): counts = timeline_detail["directory_entries_counts"] assert counts log.info(f"directory counts: {counts}") - assert counts[2] > COUNT_AT_LEAST_EXPECTED + # We need to add up reldir v1 + v2 counts + assert counts[2] + counts[7] > COUNT_AT_LEAST_EXPECTED def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): From 2ec8dff6f77c605c34b6a6ed9b6e4e1b56229f26 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 15 Feb 2025 10:34:11 +0000 Subject: [PATCH 13/78] CI(build-and-test-locally): set `session-timeout` for pytest (#10831) ## Problem Sometimes, a regression test run gets stuck (taking more than 60 minutes) and is killed by GitHub's `timeout-minutes` without leaving any traces in the test results database. I find no correlation between this and either the build type, the architecture, or the Postgres version. 
See: https://neonprod.grafana.net/goto/nM7ih7cHR?orgId=1 ## Summary of changes - Bump `pytest-timeout` to the version that supports `--session-timeout` - Set `--session-timeout` to (timeout-minutes - 10 minutes) * 60 seconds in Attempt to stop tests gracefully to generate test reports until they are forcibly stopped by the stricter `timeout-minutes` limit. --- .github/workflows/_build-and-test-locally.yml | 4 ++++ poetry.lock | 12 ++++++------ pyproject.toml | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 86a791497c..3740e6dc9c 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -348,6 +348,10 @@ jobs: rerun_failed: true pg_version: ${{ matrix.pg_version }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. + # Attempt to stop tests gracefully to generate test reports + # until they are forcibly stopped by the stricter `timeout-minutes` limit. + extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/poetry.lock b/poetry.lock index e2c71ca012..d66c3aae7a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2771,18 +2771,18 @@ pytest = ">=5,<8" [[package]] name = "pytest-timeout" -version = "2.1.0" +version = "2.3.1" description = "pytest plugin to abort hanging tests" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" groups = ["main"] files = [ - {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, - {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, + {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"}, + {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"}, ] [package.dependencies] -pytest = ">=5.0.0" +pytest = ">=7.0.0" [[package]] name = "pytest-xdist" @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "03697c0a4d438ef088b0d397b8f0570aa3998ccf833fe612400824792498878b" +content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" diff --git a/pyproject.toml b/pyproject.toml index 51cd68e002..92a660c233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" -pytest-timeout = "^2.1.0" +pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" From 2dae0612dd429ea293fa273350c725e702528f6d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 16 Feb 2025 02:01:19 +0200 Subject: [PATCH 14/78] fast_import: Fix shared_buffers setting (#10837) In commit 9537829ccd I made shared_buffers be derived from the system's available RAM. However, I failed to remove the old hard-coded shared_buffers=10GB settings, shared_buffers was set twice. Oopsie. 
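For clarity, a toy sketch of why the derived value was being ignored: the hard-coded entry came after the derived one in the argument list, and as far as I understand, Postgres resolves repeated command-line `-c` settings by taking the later occurrence. The `shared_buffers_mb` value below is a made-up stand-in for the RAM-derived number:

```rust
// Toy reproduction of the bug, not the real fast_import code: two `-c shared_buffers=...`
// entries were passed, and the later, hard-coded 10GB one effectively won.
fn main() {
    let shared_buffers_mb = 2048; // pretend this was derived from system RAM
    let args = vec![
        "-c".to_string(),
        format!("shared_buffers={shared_buffers_mb}MB"),
        "-c".to_string(),
        "shared_buffers=10GB".to_string(), // the line this patch removes
    ];
    // Last duplicate wins, mirroring how repeated -c options are resolved.
    let effective = args
        .chunks(2)
        .filter(|c| c[1].starts_with("shared_buffers="))
        .last()
        .map(|c| c[1].clone());
    assert_eq!(effective.as_deref(), Some("shared_buffers=10GB"));
    println!("effective setting before the fix: {effective:?}");
}
```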
--- compute_tools/src/bin/fast_import.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index dad15d67b7..4c8d031532 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -211,7 +211,6 @@ impl PostgresProcess { .args(["-p", &format!("{port}")]) .args(["-c", "wal_level=minimal"]) .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) - .args(["-c", "shared_buffers=10GB"]) .args(["-c", "max_wal_senders=0"]) .args(["-c", "fsync=off"]) .args(["-c", "full_page_writes=off"]) From f739773eddc2bb94f7eca7b10046e77115c7d3f9 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Sun, 16 Feb 2025 06:59:52 +0200 Subject: [PATCH 15/78] Fix format of milliseconds in pytest output (#10836) ## Problem The timestamp prefix of pytest log lines contains milliseconds without leading zeros, so values of milliseconds less than 100 printed incorrectly. For example: ``` 2025-02-15 12:02:51.997 INFO [_internal.py:97] 127.0.0.1 - - ... 2025-02-15 12:02:52.4 INFO [_internal.py:97] 127.0.0.1 - - ... 2025-02-15 12:02:52.9 INFO [_internal.py:97] 127.0.0.1 - - ... 2025-02-15 12:02:52.23 INFO [_internal.py:97] 127.0.0.1 - - ... ``` ## Summary of changes Fix log_format for pytest so that milliseconds are printed with leading zeros. --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 7197b078c6..237066b1f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,7 +11,7 @@ markers = testpaths = test_runner minversion = 6.0 -log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s +log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true timeout = 300 From d566d604cfc7e598741a2342330013c43ad3cbb6 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 17 Feb 2025 11:43:16 +0100 Subject: [PATCH 16/78] feat(compute) add pg_duckdb extension v0.3.1 (#10829) We want to host pg_duckdb (starting with v0.3.1) on Neon. This PR replaces https://github.com/neondatabase/neon/pull/10350 which was for older pg_duckdb v0.2.0 Use cases - faster OLAP queries - access to datelake files (e.g. 
parquet) on S3 buckets from Neon PostgreSQL Because neon does not provide superuser role to neon customers we need to grant some additional permissions to neon_superuser: Note: some grants that we require are already granted to `PUBLIC` in new release of pg_duckdb [here](https://github.com/duckdb/pg_duckdb/blob/3789e4c50961c03c92b7b16776804252974f8c62/sql/pg_duckdb--0.2.0--0.3.0.sql#L1054) ```sql GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; ``` --- compute/compute-node.Dockerfile | 28 +++++++++++++++++++++++++++- compute/patches/pg_duckdb_v031.patch | 11 +++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 compute/patches/pg_duckdb_v031.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 30348c2b90..1236372d27 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -148,7 +148,7 @@ RUN case $DEBIAN_VERSION in \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \ + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* @@ -1464,6 +1464,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg-duckdb-pg-build" +# compile pg_duckdb extension +# +######################################################################################### +FROM build-deps AS pg_duckdb-src +WORKDIR /ext-src +COPY compute/patches/pg_duckdb_v031.patch . 
+# pg_duckdb build requires source dir to be a git repo to get submodules +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# - extension management function duckdb.install_extension() +# - access to duckdb.extensions table and its sequence +RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ + cd pg_duckdb-src && \ + git submodule update --init --recursive && \ + patch -p1 < /ext-src/pg_duckdb_v031.patch + +FROM pg-build AS pg_duckdb-build +ARG PG_VERSION +COPY --from=pg_duckdb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_duckdb-src +RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1577,6 +1602,7 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch new file mode 100644 index 0000000000..a7e188d69e --- /dev/null +++ b/compute/patches/pg_duckdb_v031.patch @@ -0,0 +1,11 @@ +diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql +index d777d76..af60106 100644 +--- a/sql/pg_duckdb--0.2.0--0.3.0.sql ++++ b/sql/pg_duckdb--0.2.0--0.3.0.sql +@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC; + GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC; ++GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; ++GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; ++GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; From 81f08d304afab319556c969712531e4af813132e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 17 Feb 2025 11:44:44 +0100 Subject: [PATCH 17/78] Rebase Azure SDK and apply newest patch (#10825) The [upstream PR](https://github.com/Azure/azure-sdk-for-rust/pull/1997) has been merged with some changes to use threads with async, so apply them to the neon specific fork to be nice to the executor (before, we had the state as of filing of that PR). Also, rebase onto the latest version of upstream's `legacy` branch. 
current SDK commits: [link](https://github.com/neondatabase/azure-sdk-for-rust/commits/neon-2025-02-14) now: [link](https://github.com/neondatabase/azure-sdk-for-rust/commits/arpad/neon-refresh) Prior update was in #10790 --- Cargo.lock | 10 +++++----- Cargo.toml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 287201b4e0..64eb53ff00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..0ca5ae4f5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -222,10 +222,10 @@ postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", br tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ## Azure SDK crates -azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } -azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } +azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", 
default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } From 8c6d133d31ced1dc9bba9fc79a9ca2d50c636b66 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 17 Feb 2025 14:54:17 +0200 Subject: [PATCH 18/78] Fix out-of-boundaries access in addSHLL function (#10840) ## Problem See https://github.com/neondatabase/neon/issues/10839 rho(x,b) functions returns values in range [1,b+1] and addSHLL tries to store it in array of size b+1. ## Summary of changes Subtract 1 fro value returned by rho --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/hll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 1f53c8fd36..bbaad09f5f 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash) index = hash >> HLL_C_BITS; /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ - count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1; + Assert(count <= HLL_C_BITS); cState->regs[index][count] = now; } @@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since) { if (reg[i] >= since) { - max = i; + max = i + 1; } } From 8a2d95b4b5d513996fda52b5029fedd0d0ebd47d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 17 Feb 2025 15:41:22 +0100 Subject: [PATCH 19/78] pageserver: appease unused lint on macOS (#10846) ## Problem `SmgrOpFlushInProgress::measure()` takes a `socket_fd` argument which is only used on Linux. This causes linter warnings on macOS. Touches #10823. ## Summary of changes Add a noop use of `socket_fd` on non-Linux branch. --- pageserver/src/metrics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 0ffd4e851a..16ca4683ad 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1489,6 +1489,7 @@ impl SmgrOpFlushInProgress { } #[cfg(not(target_os = "linux"))] { + _ = socket_fd; // appease unused lint on macOS (-1, -1) } }; From 0330b617291f6ad6459a406a5b1a6217fcc587ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 17 Feb 2025 15:59:01 +0100 Subject: [PATCH 20/78] Azure SDK: use neon branch again (#10844) Originally I wanted to switch back to the `neon` branch before merging #10825, but I forgot to do it. Do it in a separate PR now. No actual change of the source code, only changes the branch name (so that maybe in a few weeks we can delete the temporary branch `arpad/neon-rebase`). 
--- Cargo.lock | 10 +++++----- Cargo.toml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64eb53ff00..4f75fa5733 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 0ca5ae4f5a..7228623c6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -222,10 +222,10 @@ postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", br tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ## Azure SDK crates -azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } -azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } +azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", 
default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } From 39d42d846ae387c1ba8f5ab2432b48bd412360b6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 17 Feb 2025 15:04:47 +0000 Subject: [PATCH 21/78] pageserver_api: fix decoding old-version TimelineInfo (#10845) ## Problem In #10707 some new fields were introduced in TimelineInfo. I forgot that we do not only use TimelineInfo for encoding, but also decoding when the storage controller calls into a pageserver, so this broke some calls from controller to pageserver while in a mixed-version state. ## Summary of changes - Make new fields have default behavior so that they are optional --- libs/pageserver_api/src/models.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 426222a531..3d40cfe121 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1144,6 +1144,7 @@ pub struct TimelineInfo { /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, /// as it is easier to reason about. + #[serde(default)] pub applied_gc_cutoff_lsn: Lsn, /// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval. @@ -1152,6 +1153,7 @@ pub struct TimelineInfo { /// /// Note that holders of valid LSN leases may be able to create branches and read pages earlier /// than this LSN, but new leases may not be taken out earlier than this LSN. + #[serde(default)] pub min_readable_lsn: Lsn, pub disk_consistent_lsn: Lsn, From da79cc5eeee225986f1a12cb1a9dbeb6315d88ad Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 17 Feb 2025 09:40:43 -0600 Subject: [PATCH 22/78] Add neon.extension_server_{connect,request}_timeout (#10801) Instead of hardcoding the request timeout, let's make it configurable as a PGC_SUSET GUC. Additionally, add a connect timeout GUC. Although the extension server runs on the compute, it is always best to keep operations from hanging. Better to present a timeout error to the user than a stuck backend. 
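To illustrate the difference between the two knobs (they map onto libcurl's CURLOPT_CONNECTTIMEOUT and CURLOPT_TIMEOUT below), here is a small Rust analogy using reqwest, not the compute's actual C/libcurl code path; the port, extension name and durations are made up, and only the `/extension_server/<filename>` path shape comes from the code:

```rust
// Analogy only: a connect timeout bounds how long we wait to establish the connection,
// while the request timeout bounds the whole fetch. Requires the `tokio` and `reqwest` crates.
use std::time::Duration;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::builder()
        .connect_timeout(Duration::from_secs(60)) // ~ neon.extension_server_connect_timeout
        .timeout(Duration::from_secs(60))         // ~ neon.extension_server_request_timeout
        .build()?;

    // A stuck server now yields a timeout error instead of hanging the caller forever.
    match client
        .post("http://localhost:10432/extension_server/pg_duckdb?is_library=true")
        .send()
        .await
    {
        Ok(resp) => println!("status: {}", resp.status()),
        Err(e) if e.is_timeout() => eprintln!("request timed out: {e}"),
        Err(e) => eprintln!("request failed: {e}"),
    }
    Ok(())
}
```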
Signed-off-by: Tristan Partin --- pgxn/neon/extension_server.c | 39 +++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 6e558c433a..0331f961b4 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -18,6 +18,8 @@ #include "neon_utils.h" static int extension_server_port = 0; +static int extension_server_request_timeout = 60; +static int extension_server_connect_timeout = 60; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - static CURL *handle = NULL; - CURLcode res; - char *compute_ctl_url; bool ret = false; + CURL *handle = NULL; + char *compute_ctl_url; - if (handle == NULL) - { - handle = alloc_curl_handle(); + handle = alloc_curl_handle(); - curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(handle, CURLOPT_TIMEOUT, 60L /* seconds */ ); - } + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + if (extension_server_request_timeout > 0) + curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ ); + if (extension_server_connect_timeout > 0) + curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ ); compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", extension_server_port, filename, is_library ? "?is_library=true" : ""); @@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library) /* Perform the request, res will get the return code */ res = curl_easy_perform(handle); + curl_easy_cleanup(handle); + /* Check for errors */ if (res == CURLE_OK) { @@ -88,6 +91,24 @@ pg_init_extension_server() 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.extension_server_request_timeout", + "timeout for fetching extensions in seconds", + NULL, + &extension_server_request_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.extension_server_connect_timeout", + "timeout for connecting to the extension server in seconds", + NULL, + &extension_server_connect_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + /* set download_extension_file_hook */ prev_download_extension_file_hook = download_extension_file_hook; download_extension_file_hook = neon_download_extension_file_http; From 3204efc860bcd6e849733cc7759b6742e6df8d8e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 17 Feb 2025 16:19:57 +0000 Subject: [PATCH 23/78] chore(proxy): use specially named prepared statements for type-checking (#10843) I was looking into https://github.com/neondatabase/serverless/issues/144, I recall previous cases where proxy would trigger these prepared statements which would conflict with other statements prepared by our client downstream. 
Because of that, and also to aid in debugging, I've made sure all prepared statements that proxy needs to make have specific names that likely won't conflict and makes it clear in a error log if it's our statements that are causing issues --- libs/proxy/tokio-postgres2/src/client.rs | 98 +++---------------- .../tokio-postgres2/src/generic_client.rs | 9 +- libs/proxy/tokio-postgres2/src/lib.rs | 2 - libs/proxy/tokio-postgres2/src/prepare.rs | 48 ++------- libs/proxy/tokio-postgres2/src/query.rs | 43 -------- libs/proxy/tokio-postgres2/src/statement.rs | 10 +- .../proxy/tokio-postgres2/src/to_statement.rs | 57 ----------- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/local_conn_pool.rs | 11 +-- 9 files changed, 36 insertions(+), 244 deletions(-) delete mode 100644 libs/proxy/tokio-postgres2/src/to_statement.rs diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 9bbbd4c260..46151ab924 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, ToSql, Type}; use crate::{ - prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, + query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, Transaction, TransactionBuilder, }; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -54,18 +54,18 @@ impl Responses { } /// A cache of type info and prepared statements for fetching type info -/// (corresponding to the queries in the [prepare] module). +/// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] struct CachedTypeInfo { /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). typeinfo: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). typeinfo_composite: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or /// its fallback). typeinfo_enum: Option, @@ -190,26 +190,6 @@ impl Client { &self.inner } - /// Creates a new prepared statement. - /// - /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), - /// which are set when executed. Prepared statements can only be used with the connection that created them. - pub async fn prepare(&self, query: &str) -> Result { - self.prepare_typed(query, &[]).await - } - - /// Like `prepare`, but allows the types of query parameters to be explicitly specified. - /// - /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be - /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. 
- pub async fn prepare_typed( - &self, - query: &str, - parameter_types: &[Type], - ) -> Result { - prepare::prepare(&self.inner, query, parameter_types).await - } - /// Executes a statement, returning a vector of the resulting rows. /// /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list @@ -222,14 +202,11 @@ impl Client { /// # Panics /// /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( + pub async fn query( &self, - statement: &T, + statement: Statement, params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> - where - T: ?Sized + ToStatement, - { + ) -> Result, Error> { self.query_raw(statement, slice_iter(params)) .await? .try_collect() @@ -250,13 +227,15 @@ impl Client { /// Panics if the number of parameters provided does not match the number expected. /// /// [`query`]: #method.query - pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + pub async fn query_raw<'a, I>( + &self, + statement: Statement, + params: I, + ) -> Result where - T: ?Sized + ToStatement, I: IntoIterator, I::IntoIter: ExactSizeIterator, { - let statement = statement.__convert().into_statement(self).await?; query::query(&self.inner, statement, params).await } @@ -271,55 +250,6 @@ impl Client { query::query_txt(&self.inner, statement, params).await } - /// Executes a statement, returning the number of rows modified. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn execute( - &self, - statement: &T, - params: &[&(dyn ToSql + Sync)], - ) -> Result - where - T: ?Sized + ToStatement, - { - self.execute_raw(statement, slice_iter(params)).await - } - - /// The maximally flexible version of [`execute`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - /// - /// [`execute`]: #method.execute - pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result - where - T: ?Sized + ToStatement, - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - let statement = statement.__convert().into_statement(self).await?; - query::execute(self.inner(), statement, params).await - } - /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. /// /// Statements should be separated by semicolons. 
If an error occurs, execution of the sequence will stop at that diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 768213f8ed..042b5a675e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,7 +1,8 @@ +#![allow(async_fn_in_trait)] + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use async_trait::async_trait; use postgres_protocol2::Oid; mod private { @@ -11,7 +12,6 @@ mod private { /// A trait allowing abstraction over connections and transactions. /// /// This trait is "sealed", and cannot be implemented outside of this crate. -#[async_trait] pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed { impl private::Sealed for Client {} -#[async_trait] impl GenericClient for Client { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where @@ -39,14 +38,12 @@ impl GenericClient for Client { /// Query for type information async fn get_type(&self, oid: Oid) -> Result { - self.get_type(oid).await + crate::prepare::get_type(self.inner(), oid).await } } impl private::Sealed for Transaction<'_> {} -#[async_trait] -#[allow(clippy::needless_lifetimes)] impl GenericClient for Transaction<'_> { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 9155dd8279..7426279167 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; -pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; @@ -65,7 +64,6 @@ pub mod row; mod simple_query; mod statement; pub mod tls; -mod to_statement; mod transaction; mod transaction_builder; pub mod types; diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index da0c755c5b..58bbb26cbc 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,7 +1,6 @@ use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::error::SqlState; use crate::types::{Field, Kind, Oid, Type}; use crate::{query, slice_iter}; use crate::{Column, Error, Statement}; @@ -13,7 +12,6 @@ use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use std::future::Future; use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -24,14 +22,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -// Range types weren't added until Postgres 9.2, so pg_range may not exist -const TYPEINFO_FALLBACK_QUERY: &str = "\ -SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid -FROM pg_catalog.pg_type t -INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid -WHERE t.oid = $1 -"; - const TYPEINFO_ENUM_QUERY: &str = "\ SELECT enumlabel FROM pg_catalog.pg_enum @@ -39,14 +29,6 @@ WHERE 
enumtypid = $1 ORDER BY enumsortorder "; -// Postgres 9.0 didn't have enumsortorder -const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY oid -"; - pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ SELECT attname, atttypid FROM pg_catalog.pg_attribute @@ -56,15 +38,13 @@ AND attnum > 0 ORDER BY attnum "; -static NEXT_ID: AtomicUsize = AtomicUsize::new(0); - pub async fn prepare( client: &Arc, + name: &'static str, query: &str, types: &[Type], ) -> Result { - let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); - let buf = encode(client, &name, query, types)?; + let buf = encode(client, name, query, types)?; let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; match responses.next().await? { @@ -105,10 +85,11 @@ pub async fn prepare( fn prepare_rec<'a>( client: &'a Arc, + name: &'static str, query: &'a str, types: &'a [Type], ) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, query, types)) + Box::pin(prepare(client, name, query, types)) } fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { @@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { - prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; client.set_typeinfo(&stmt); Ok(stmt) @@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { - prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo_enum"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; client.set_typeinfo_enum(&stmt); Ok(stmt) @@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc) -> Result( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if log_enabled!(Level::Debug) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let mut responses = start(client, buf).await?; - - let mut rows = 0; - loop { - match responses.next().await? { - Message::DataRow(_) => {} - Message::CommandComplete(body) => { - rows = body - .tag() - .map_err(Error::parse)? 
- .rsplit(' ') - .next() - .unwrap() - .parse() - .unwrap_or(0); - } - Message::EmptyQueryResponse => rows = 0, - Message::ReadyForQuery(_) => return Ok(rows), - _ => return Err(Error::unexpected_message()), - } - } -} - async fn start(client: &InnerClient, buf: Bytes) -> Result { let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 22e160fc05..591872fbc5 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -13,7 +13,7 @@ use std::{ struct StatementInner { client: Weak, - name: String, + name: &'static str, params: Vec, columns: Vec, } @@ -22,7 +22,7 @@ impl Drop for StatementInner { fn drop(&mut self) { if let Some(client) = self.client.upgrade() { let buf = client.with_buf(|buf| { - frontend::close(b'S', &self.name, buf).unwrap(); + frontend::close(b'S', self.name, buf).unwrap(); frontend::sync(buf); buf.split().freeze() }); @@ -40,7 +40,7 @@ pub struct Statement(Arc); impl Statement { pub(crate) fn new( inner: &Arc, - name: String, + name: &'static str, params: Vec, columns: Vec, ) -> Statement { @@ -55,14 +55,14 @@ impl Statement { pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { Statement(Arc::new(StatementInner { client: Weak::new(), - name: String::new(), + name: "", params, columns, })) } pub(crate) fn name(&self) -> &str { - &self.0.name + self.0.name } /// Returns the expected types of the statement's parameters. diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs deleted file mode 100644 index 7e12992728..0000000000 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ /dev/null @@ -1,57 +0,0 @@ -use crate::to_statement::private::{Sealed, ToStatementType}; -use crate::Statement; - -mod private { - use crate::{Client, Error, Statement}; - - pub trait Sealed {} - - pub enum ToStatementType<'a> { - Statement(&'a Statement), - Query(&'a str), - } - - impl ToStatementType<'_> { - pub async fn into_statement(self, client: &Client) -> Result { - match self { - ToStatementType::Statement(s) => Ok(s.clone()), - ToStatementType::Query(s) => client.prepare(s).await, - } - } - } -} - -/// A trait abstracting over prepared and unprepared statements. -/// -/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which -/// was prepared previously. -/// -/// This trait is "sealed" and cannot be implemented by anything outside this crate. 
-pub trait ToStatement: Sealed { - #[doc(hidden)] - fn __convert(&self) -> ToStatementType<'_>; -} - -impl ToStatement for Statement { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Statement(self) - } -} - -impl Sealed for Statement {} - -impl ToStatement for str { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for str {} - -impl ToStatement for String { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for String {} diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6a59d413c4..f35c375ba2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -372,7 +372,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.execute("select auth.init()", &[]).await { + if let Err(e) = client.batch_execute("select auth.init();").await { discard.discard(); return Err(e.into()); } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index fe33f0ff65..7ed514ff65 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -23,7 +23,6 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; -use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; use tokio::net::TcpStream; @@ -281,13 +280,9 @@ impl ClientInnerCommon { let token = resign_jwt(&local_data.key, payload, local_data.jti)?; // initiates the auth session - self.inner.batch_execute("discard all").await?; - self.inner - .execute( - "select auth.jwt_session_init($1)", - &[&&*token as &(dyn ToSql + Sync)], - ) - .await?; + // this is safe from query injections as the jwt format free of any escape characters. + let query = format!("discard all; select auth.jwt_session_init('{token}')"); + self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); From b10890b81c5121735480f17dee244917bb575096 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 17 Feb 2025 16:32:24 +0000 Subject: [PATCH 24/78] tests: compare digests in test_peer_recovery (#10853) ## Problem Test fails when comparing the first WAL segment because the system id in the segment header is different. The system id is not consistently set correctly since segments are usually inited on the safekeeper sync step with sysid 0. ## Summary of Chnages Compare timeline digests instead. This skips the header. 
Closes https://github.com/neondatabase/neon/issues/10596 --- test_runner/regress/test_wal_acceptor.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 21b2ad479c..c5045fe4a4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1445,6 +1445,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # roughly fills one segment endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'") + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) endpoint.stop() # stop compute @@ -1473,7 +1474,15 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): "flush_lsn to get aligned", ) - cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) + sk1_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk1.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + sk2_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk2.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + assert sk1_digest == sk2_digest # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() From 84bbe87d605fdd9daf0b2aff1fac7da40b43f725 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 17 Feb 2025 18:24:17 +0100 Subject: [PATCH 25/78] pageserver: tweak `pageserver_layers_per_read` histogram resolution (#10847) ## Problem The current `pageserver_layers_per_read` histogram buckets don't represent the current reality very well. For the percentiles we care about (e.g. p50 and p99), we often see fairly high read amp, especially during ingestion, and anything below 4 can be considered very good. ## Summary of changes Change the per-timeline read amp histogram buckets to `[4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0]`. --- pageserver/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 16ca4683ad..e1c26b0684 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -130,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", &["tenant_id", "shard_id", "timeline_id"], // Low resolution to reduce cardinality. - vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); From b34598516f25857969679c10ec6ebdbe0e523d55 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 17 Feb 2025 13:02:16 -0600 Subject: [PATCH 26/78] Warn when PR may require regenerating cloud PG settings (#10229) These generated Postgres settings JSON files can get out of sync causing the control plane to reject updated to an endpoint or project's Postgres settings. 
Signed-off-by: Tristan Partin --- .github/workflows/regenerate-pg-setting.yml | 41 +++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/regenerate-pg-setting.yml diff --git a/.github/workflows/regenerate-pg-setting.yml b/.github/workflows/regenerate-pg-setting.yml new file mode 100644 index 0000000000..1e9d2ec5e2 --- /dev/null +++ b/.github/workflows/regenerate-pg-setting.yml @@ -0,0 +1,41 @@ +name: Regenerate Postgres Settings + +on: + pull_request: + types: + - opened + - synchronize + - reopened + paths: + - pgxn/neon/**.c + - vendor/postgres-v* + - vendor/revisions.json + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +permissions: + pull-requests: write + +jobs: + regenerate-pg-settings: + runs-on: ubuntu-22.04 + + steps: + - name: Add comment + uses: thollander/actions-comment-pull-request@v3 + with: + comment-tag: ${{ github.job }} + pr-number: ${{ github.event.number }} + message: | + If this PR added a GUC in the Postgres fork or `neon` extension, + please regenerate the Postgres settings in the `cloud` repo: + + ``` + make NEON_WORKDIR=path/to/neon/checkout \ + -C goapp/internal/shareddomain/postgres generate + ``` + + If you're an external contributor, a Neon employee will assist in + making sure this step is done. From 2884917bd429a1b01e1d1f1a99cffd046a789578 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Feb 2025 21:42:57 +0200 Subject: [PATCH 27/78] compute: Allow postgres user to power off the VM also on <= v16 (#10860) I did this for debian bookworm variant in PR #10710, but forgot to update the "bullseye" dockerfile that is used to build older PostgreSQL versions. --- compute/vm-image-spec-bullseye.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 124c40cf5d..6617c98599 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes From 811506aaa2b4f35de3415b6ba98c90200a0b1741 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Feb 2025 22:07:31 +0200 Subject: [PATCH 28/78] fast_import: Use rust s3 client for uploading (#10777) This replaces the use of the awscli utility. awscli binary is massive, it added about 200 MB to the docker image size, while the s3 client was already a dependency so using that is essentially free, as far as binary size is concerned. I implemented a simple upload function that tries to keep 10 uploads going in parallel. I believe that's the default behavior of the "aws s3 sync" command too. 
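For illustration, the upload loop follows roughly the pattern sketched below. This is a minimal, self-contained sketch and not the actual `aws_s3_sync` code in the diff that follows; `upload_one`, `upload_all` and the `String` work items are placeholders for the real per-file `put_object` calls against the AWS SDK client.

```rust
use tokio::task::JoinSet;

const MAX_PARALLEL_UPLOADS: usize = 10;

// Hypothetical stand-in for one file upload (the real code streams the file
// to S3 with `put_object`).
async fn upload_one(item: String) -> anyhow::Result<()> {
    println!("uploaded {item}");
    Ok(())
}

// Keep at most MAX_PARALLEL_UPLOADS uploads in flight at any time.
async fn upload_all(items: Vec<String>) -> anyhow::Result<()> {
    let mut pending = items.into_iter();
    let mut joinset = JoinSet::new();
    loop {
        // Top up the in-flight set.
        while joinset.len() < MAX_PARALLEL_UPLOADS {
            match pending.next() {
                Some(item) => {
                    joinset.spawn(upload_one(item));
                }
                None => break,
            }
        }
        // Wait for one task to finish; `??` surfaces both join and upload errors.
        match joinset.join_next().await {
            Some(res) => res??,
            None => break, // nothing in flight and nothing left to spawn
        }
    }
    Ok(())
}
```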
--- Cargo.lock | 2 + compute/compute-node.Dockerfile | 26 ---- compute_tools/Cargo.toml | 2 + compute_tools/src/bin/fast_import.rs | 30 +++-- .../src/bin/fast_import/aws_s3_sync.rs | 116 +++++++++++++++--- 5 files changed, 122 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f75fa5733..12c12bc771 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1303,6 +1303,7 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "aws-smithy-types", "axum", "base64 0.13.1", "bytes", @@ -1351,6 +1352,7 @@ dependencies = [ "utils", "uuid", "vm_monitor", + "walkdir", "workspace_hack", "zstd", ] diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1236372d27..082dea6f1b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1695,29 +1695,6 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\ && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - -######################################################################################### -# -# Layer "awscli" -# -######################################################################################### -FROM build-deps AS awscli -ARG TARGETARCH -RUN set -ex; \ - if [ "${TARGETARCH}" = "amd64" ]; then \ - TARGETARCH_ALT="x86_64"; \ - CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - TARGETARCH_ALT="aarch64"; \ - CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ - echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ - unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ - /tmp/awscliv2/aws/install; \ - rm -rf /tmp/awscliv2.zip /tmp/awscliv2 - ######################################################################################### # # Clean up postgres folder before inclusion @@ -1887,9 +1864,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ mkdir /usr/local/download_extensions && \ chown -R postgres:postgres /usr/local/download_extensions -# aws cli is used by fast_import -COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli - # pgbouncer and its config COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b8828fa49f..81dcf99560 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -14,6 +14,7 @@ base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true +aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } camino.workspace = true @@ -54,6 +55,7 @@ thiserror.workspace = true url.workspace = true uuid.workspace = true prometheus.workspace = true +walkdir.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 4c8d031532..614a93f48b 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -421,6 +421,7 @@ async fn run_dump_restore( #[allow(clippy::too_many_arguments)] async fn cmd_pgdata( + s3_client: Option, kms_client: Option, 
maybe_s3_prefix: Option, maybe_spec: Option, @@ -488,9 +489,13 @@ async fn cmd_pgdata( // Only sync if s3_prefix was specified if let Some(s3_prefix) = maybe_s3_prefix { info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + Utf8Path::new(&pgdata_dir), + &s3_prefix.append("/pgdata/"), + ) + .await + .context("sync dump directory to destination")?; info!("write status"); { @@ -499,9 +504,13 @@ async fn cmd_pgdata( let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) - .await - .context("sync status directory to destination")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + &status_dir, + &s3_prefix.append("/status/"), + ) + .await + .context("sync status directory to destination")?; } } @@ -573,18 +582,20 @@ pub(crate) async fn main() -> anyhow::Result<()> { let args = Args::parse(); // Initialize AWS clients only if s3_prefix is specified - let (aws_config, kms_client) = if args.s3_prefix.is_some() { + let (s3_client, kms_client) = if args.s3_prefix.is_some() { let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let s3_client = aws_sdk_s3::Client::new(&config); let kms = aws_sdk_kms::Client::new(&config); - (Some(config), Some(kms)) + (Some(s3_client), Some(kms)) } else { (None, None) }; let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); let object = s3_client + .as_ref() + .unwrap() .get_object() .bucket(&spec_key.bucket) .key(spec_key.key) @@ -624,6 +635,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { memory_mb, } => { cmd_pgdata( + s3_client, kms_client, args.s3_prefix, spec, diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 5fa58c8f87..1be10b36d6 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,24 +1,102 @@ -use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; +use tokio::task::JoinSet; +use walkdir::WalkDir; use super::s3_uri::S3Uri; -pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("aws"); - builder - .arg("s3") - .arg("sync") - .arg(local.as_str()) - .arg(remote.to_string()); - let st = builder - .spawn() - .context("spawn aws s3 sync")? 
- .wait() - .await - .context("wait for aws s3 sync")?; - if st.success() { - Ok(()) - } else { - Err(anyhow::anyhow!("aws s3 sync failed")) +use tracing::{info, warn}; + +const MAX_PARALLEL_UPLOADS: usize = 10; + +/// Upload all files from 'local' to 'remote' +pub(crate) async fn upload_dir_recursive( + s3_client: &aws_sdk_s3::Client, + local: &Utf8Path, + remote: &S3Uri, +) -> anyhow::Result<()> { + // Recursively scan directory + let mut dirwalker = WalkDir::new(local) + .into_iter() + .map(|entry| { + let entry = entry?; + let file_type = entry.file_type(); + let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf(); + Ok((file_type, path)) + }) + .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| { + match e { + Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)), + Ok((file_type, _path)) if file_type.is_dir() => { + // The WalkDir iterator will recurse into directories, but we don't want + // to do anything with directories as such. There's no concept of uploading + // an empty directory to S3. + None + } + Ok((file_type, path)) if file_type.is_symlink() => { + // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip. + warn!("cannot upload symlink ({})", path); + None + } + Ok((_file_type, path)) => { + // should not happen + warn!("directory entry has unexpected type ({})", path); + None + } + Err(e) => Some(Err(e)), + } + }); + + // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in + // parallel. + let mut joinset = JoinSet::new(); + loop { + // Could we upload more? + while joinset.len() < MAX_PARALLEL_UPLOADS { + if let Some(full_local_path) = dirwalker.next() { + let full_local_path = full_local_path?; + let relative_local_path = full_local_path + .strip_prefix(local) + .expect("all paths start from the walkdir root"); + let remote_path = remote.append(relative_local_path.as_str()); + info!( + "starting upload of {} to {}", + &full_local_path, &remote_path + ); + let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path); + joinset.spawn(upload_task); + } else { + info!("draining upload tasks"); + break; + } + } + + // Wait for an upload to complete + if let Some(res) = joinset.join_next().await { + let _ = res?; + } else { + // all done! + break; + } } + Ok(()) +} + +pub(crate) async fn upload_file( + s3_client: aws_sdk_s3::Client, + local_path: Utf8PathBuf, + remote: S3Uri, +) -> anyhow::Result<()> { + use aws_smithy_types::byte_stream::ByteStream; + let stream = ByteStream::from_path(&local_path).await?; + + let _result = s3_client + .put_object() + .bucket(remote.bucket) + .key(&remote.key) + .body(stream) + .send() + .await?; + info!("upload of {} to {} finished", &local_path, &remote.key); + + Ok(()) } From 27241f039c2411910c987466def4f72c912c982e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 17 Feb 2025 20:29:14 +0000 Subject: [PATCH 29/78] test_runner: fix `neon_local` usage for version mismatch tests (#10859) ## Problem Tests with mixed versions of binaries always pick up new versions if services are started using `neon_local`. 
## Summary of changes - Set `neon_local_binpath` along with `neon_binpath` and `pg_distrib_dir` for tests with mixed versions --- test_runner/fixtures/neon_fixtures.py | 9 ++++++++- test_runner/fixtures/utils.py | 10 +++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 73607db7d8..c4d4908568 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -491,6 +491,7 @@ class NeonEnvBuilder: self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: assert ( self.compatibility_neon_binpath is not None @@ -702,6 +703,11 @@ class NeonEnvBuilder: def _mix_versions(self): assert self.version_combination is not None, "version combination must be set" + + # Always use a newer version of `neon_local` + (self.mixdir / "neon_local").symlink_to(self.neon_binpath / "neon_local") + self.neon_local_binpath = self.mixdir + for component, paths in COMPONENT_BINARIES.items(): directory = ( self.neon_binpath @@ -711,9 +717,10 @@ class NeonEnvBuilder: for filename in paths: destination = self.mixdir / filename destination.symlink_to(directory / filename) + self.neon_binpath = self.mixdir + if self.version_combination["compute"] == "old": self.pg_distrib_dir = self.compatibility_pg_distrib_dir - self.neon_binpath = self.mixdir def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index e160c617cd..71b2de4f65 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -52,11 +52,11 @@ COMPONENT_BINARIES = { # Disable auto-formatting for better readability # fmt: off VERSIONS_COMBINATIONS = ( - {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnnn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, # combination: ooonn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, # combination: ononn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, # combination: onnnn + {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnoo ) # fmt: on From 719ec378cdf3b5454ed4b991b78bc1ad4de382ba Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 18 Feb 2025 08:54:20 +0000 Subject: [PATCH 30/78] fix(local_proxy): discard all in tx (#10864) ## Problem `discard all` cannot run in a transaction (even if implicit) ## Summary of changes Split up the query into two, we don't need transaction support. 
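Roughly, the session reset now has the shape sketched below. This is a simplified illustration of the change in the diff that follows: `reset_session` is a hypothetical helper, and upstream `tokio_postgres` stands in for the proxy's internal client wrapper.

```rust
use tokio_postgres::Client;

// Sketch only: mirrors the new two-round-trip control flow.
async fn reset_session(client: &Client, token: &str) -> Result<(), tokio_postgres::Error> {
    // DISCARD ALL must be sent on its own: a multi-statement simple query runs
    // in an implicit transaction block, which Postgres rejects for DISCARD ALL.
    client.batch_execute("discard all").await?;

    // Safe to inline: the JWT contains no quote/escape characters (see the
    // comment in the patched code below).
    let query = format!("select auth.jwt_session_init('{token}')");
    client.batch_execute(&query).await?;
    Ok(())
}
```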
--- proxy/src/serverless/local_conn_pool.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 7ed514ff65..137a2d6377 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -279,9 +279,12 @@ impl ClientInnerCommon { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; + // discard all cannot run in a transaction. must be executed alone. + self.inner.batch_execute("discard all").await?; + // initiates the auth session // this is safe from query injections as the jwt format free of any escape characters. - let query = format!("discard all; select auth.jwt_session_init('{token}')"); + let query = format!("select auth.jwt_session_init('{token}')"); self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); From f81259967dacf94810ad2e883285213ebca00969 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Tue, 18 Feb 2025 15:23:18 +0200 Subject: [PATCH 31/78] Add test to make sure sanitizers really work when expected (#10838) --- test_runner/fixtures/utils.py | 2 ++ test_runner/regress/test_endpoint_crash.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 71b2de4f65..2a59eab710 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -64,6 +64,8 @@ VERSIONS_COMBINATIONS = ( # If it is not set or set to a value not equal to "false", LFC is enabled by default. USE_LFC = os.environ.get("USE_LFC") != "false" +WITH_SANITIZERS = os.environ.get("SANITIZERS") == "enabled" + def subprocess_capture( capture_dir: Path, diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py index 0217cd0d03..03bfd1cb8d 100644 --- a/test_runner/regress/test_endpoint_crash.py +++ b/test_runner/regress/test_endpoint_crash.py @@ -2,6 +2,8 @@ from __future__ import annotations import pytest from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import PgVersion +from fixtures.utils import WITH_SANITIZERS, run_only_on_postgres @pytest.mark.parametrize( @@ -23,3 +25,20 @@ def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") with pytest.raises(Exception, match="This probably means the server terminated abnormally"): endpoint.safe_psql(f"SELECT {sql_func}();") + + +@run_only_on_postgres([PgVersion.V17], "Currently, build vith sanitizers is possible with v17 only") +def test_sanitizers(neon_env_builder: NeonEnvBuilder): + """ + Test that undefined behavior leads to endpoint abort with sanitizers enabled + """ + env = neon_env_builder.init_start() + env.create_branch("test_ubsan") + endpoint = env.endpoints.create_start("test_ubsan") + + # Test case based on https://www.postgresql.org/message-id/17167-028026e4ca333817@postgresql.org + if not WITH_SANITIZERS: + endpoint.safe_psql("SELECT 1::int4 << 128") + else: + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql("SELECT 1::int4 << 128") From d36baae7582a7fcebea08c7aa4f525a819f1023c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 18 Feb 2025 16:57:12 +0300 Subject: [PATCH 32/78] Add gc_blocking and restore latest_gc_cutoff in openapi spec (#10867) ## Problem gc_blocking is missing in the tenant info, but cplane wants to use it. 
Also, https://github.com/neondatabase/neon/pull/10707/ removed latest_gc_cutoff from the spec, renaming it to applied_gc_cutoff. Temporarily get it back until cplane migrates. ## Summary of changes Add them. ref https://neondb.slack.com/archives/C03438W3FLZ/p1739877734963979 --- libs/pageserver_api/src/models.rs | 3 +-- pageserver/src/http/openapi_spec.yml | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3d40cfe121..dd7bea2916 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1080,8 +1080,7 @@ pub struct TenantInfo { /// Opaque explanation if gc is being blocked. /// - /// Only looked up for the individual tenant detail, not the listing. This is purely for - /// debugging, not included in openapi. + /// Only looked up for the individual tenant detail, not the listing. #[serde(skip_serializing_if = "Option::is_none")] pub gc_blocking: Option, } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b8ed7aaf26..733115539a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -882,6 +882,8 @@ components: properties: reason: type: string + gc_blocking: + type: string TenantCreateRequest: allOf: @@ -1083,6 +1085,9 @@ components: min_readable_lsn: type: string format: hex + latest_gc_cutoff_lsn: + type: string + format: hex applied_gc_cutoff_lsn: type: string format: hex From caece02da7d50c31542379a50229b488dae4d463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 18 Feb 2025 15:02:22 +0100 Subject: [PATCH 33/78] move pull_timeline to safekeeper_api and add SafekeeperGeneration (#10863) Preparations for a successor of #10440: * move `pull_timeline` to `safekeeper_api` and add it to `SafekeeperClient`. we want to do `pull_timeline` on any creations that we couldn't do initially. * Add a `SafekeeperGeneration` type instead of relying on a type alias. we want to maintain a safekeeper specific generation number now in the storcon database. A separate type is important to make it impossible to mix it up with the tenant's pageserver specific generation number. We absolutely want to avoid that for correctness reasons. If someone mixes up a safekeeper and pageserver id (both use the `NodeId` type), that's bad but there is no wrong generations flying around. part of #9011 --- libs/safekeeper_api/src/membership.rs | 42 +++++++++++++++++--- libs/safekeeper_api/src/models.rs | 15 +++++++ libs/utils/src/bin_ser.rs | 43 +++++++++++++++++++++ safekeeper/client/src/mgmt_api.rs | 11 +++++- safekeeper/src/control_file.rs | 4 +- safekeeper/src/http/routes.rs | 3 +- safekeeper/src/pull_timeline.rs | 32 +++++---------- safekeeper/src/safekeeper.rs | 4 +- storage_controller/src/safekeeper_client.rs | 18 ++++++++- 9 files changed, 137 insertions(+), 35 deletions(-) diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index a39fda526f..8b14a4f290 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -9,13 +9,43 @@ use anyhow::bail; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -/// Number uniquely identifying safekeeper configuration. -/// Note: it is a part of sk control file. -pub type Generation = u32; /// 1 is the first valid generation, 0 is used as /// a placeholder before we fully migrate to generations. 
-pub const INVALID_GENERATION: Generation = 0; -pub const INITIAL_GENERATION: Generation = 1; +pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0); +pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1); + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +/// +/// Like tenant generations, but for safekeepers. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct SafekeeperGeneration(u32); + +impl SafekeeperGeneration { + pub const fn new(v: u32) -> Self { + Self(v) + } + + #[track_caller] + pub fn previous(&self) -> Option { + Some(Self(self.0.checked_sub(1)?)) + } + + #[track_caller] + pub fn next(&self) -> Self { + Self(self.0 + 1) + } + + pub fn into_inner(self) -> u32 { + self.0 + } +} + +impl Display for SafekeeperGeneration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} /// Membership is defined by ids so e.g. walproposer uses them to figure out /// quorums, but we also carry host and port to give wp idea where to connect. @@ -89,7 +119,7 @@ impl Display for MemberSet { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Configuration { /// Unique id. - pub generation: Generation, + pub generation: SafekeeperGeneration, /// Current members of the configuration. pub members: MemberSet, /// Some means it is a joint conf. diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 30418b0efd..41ccdaa428 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -282,3 +282,18 @@ pub struct TimelineTermBumpResponse { pub struct SafekeeperUtilization { pub timeline_count: u64, } + +/// pull_timeline request body. +#[derive(Debug, Deserialize, Serialize)] +pub struct PullTimelineRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub http_hosts: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PullTimelineResponse { + // Donor safekeeper host + pub safekeeper_host: String, + // TODO: add more fields? 
+} diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 42b45eeea0..4d173d0726 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -286,6 +286,11 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] + struct NewTypeStruct(u32); + const NT1: NewTypeStruct = NewTypeStruct(414243); + const NT1_INNER: u32 = 414243; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, @@ -408,4 +413,42 @@ mod tests { let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn be_nt() { + use super::BeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("0006 5223"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn le_nt() { + use super::LeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("2352 0600"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index d4f47fc96d..40e5afc4aa 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -5,7 +5,10 @@ use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use std::error::Error as _; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -88,6 +91,12 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { + let uri = format!("{}/v1/pull_timeline", self.mgmt_api_endpoint); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index e92ca881e1..35aebfd8ad 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -235,7 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; - use safekeeper_api::membership::{Configuration, MemberSet}; + use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; @@ -246,7 +246,7 @@ mod test { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); state.mconf = Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::empty(), new_members: None, }; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 41e30d838a..cd2ac5f44c 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,6 +2,7 @@ use 
http_utils::failpoints::failpoints_handler; use hyper::{Body, Request, Response, StatusCode}; use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::PullTimelineRequest; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; use safekeeper_api::models::TimelineStatus; @@ -230,7 +231,7 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result, ApiError> { check_permission(&request, None)?; - let data: pull_timeline::Request = json_request(&mut request).await?; + let data: PullTimelineRequest = json_request(&mut request).await?; let conf = get_conf(&request); let global_timelines = get_global_timelines(&request); diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f2d8e4c85f..4827b73074 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,10 +4,13 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_api::{ + models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, + Term, +}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use std::{ cmp::min, io::{self, ErrorKind}, @@ -33,7 +36,7 @@ use crate::{ }; use utils::{ crashsafe::fsync_async_opt, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantTimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -378,21 +381,6 @@ impl WalResidentTimeline { } } -/// pull_timeline request body. -#[derive(Debug, Deserialize)] -pub struct Request { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub http_hosts: Vec, -} - -#[derive(Debug, Serialize)] -pub struct Response { - // Donor safekeeper host - pub safekeeper_host: String, - // TODO: add more fields? -} - /// Response for debug dump request. #[derive(Debug, Deserialize)] pub struct DebugDumpResponse { @@ -405,10 +393,10 @@ pub struct DebugDumpResponse { /// Find the most advanced safekeeper and pull timeline from it. 
pub async fn handle_request( - request: Request, + request: PullTimelineRequest, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -460,7 +448,7 @@ async fn pull_timeline( host: String, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -535,7 +523,7 @@ async fn pull_timeline( .load_temp_timeline(ttid, &tli_dir_path, false) .await?; - Ok(Response { + Ok(PullTimelineResponse { safekeeper_host: host, }) } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 45e19c31b6..f816f8459a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1004,7 +1004,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperId}, + membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, ServerInfo, }; @@ -1303,7 +1303,7 @@ mod tests { tenant_id, timeline_id, mconf: Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::new(vec![SafekeeperId { id: NodeId(1), host: "hehe.org".to_owned(), diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index bb494f20fa..f234ab3429 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,5 +1,8 @@ use crate::metrics::PageserverRequestLabelGroup; -use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use safekeeper_client::mgmt_api::{Client, Result}; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -94,6 +97,19 @@ impl SafekeeperClient { ) } + #[allow(dead_code)] + pub(crate) async fn pull_timeline( + &self, + req: &PullTimelineRequest, + ) -> Result { + measured_request!( + "pull_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.pull_timeline(req).await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", From 29e4ca351ee12c97756123420e7ce4540fbee047 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Tue, 18 Feb 2025 17:41:20 +0200 Subject: [PATCH 34/78] Pass asan/ubsan options to pg_dump/pg_restore started by fast_import (#10866) --- compute_tools/src/bin/fast_import.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 614a93f48b..585f3e4e1d 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -361,6 +361,14 @@ async fn run_dump_restore( // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -394,6 +402,14 @@ async fn run_dump_restore( // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + 
.env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) From 290f007b8ea9ceb243ff536dfabfdcb847980743 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 10:43:33 -0500 Subject: [PATCH 35/78] Revert "feat(pageserver): repartition on L0-L1 boundary (#10548)" (#10870) This reverts commit 443c8d0b4bfead651ebbbade5dcb49c6cba00ee6. ## Problem We observe a massive amount of compaction errors. ## Summary of changes If the tenant did not write any L1 layers (i.e., they accumulate L0 layers where number of them is below L0 threshold), image creation will always fail. Therefore, it's not correct to simply use the disk_consistent_lsn or L0/L1 boundary for the image creation. --- pageserver/src/tenant.rs | 18 +- pageserver/src/tenant/timeline/compaction.rs | 156 ++++++++---------- .../regress/test_layers_from_future.py | 3 - 3 files changed, 69 insertions(+), 108 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5a2c5c0c46..bab1a02527 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7846,18 +7846,6 @@ mod tests { } tline.freeze_and_flush().await?; - // Force layers to L1 - tline - .compact( - &cancel, - { - let mut flags = EnumSet::new(); - flags.insert(CompactFlags::ForceL0Compaction); - flags - }, - &ctx, - ) - .await?; if iter % 5 == 0 { let (_, before_delta_file_accessed) = @@ -7870,7 +7858,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::ForceL0Compaction); flags }, &ctx, @@ -8317,8 +8304,6 @@ mod tests { let cancel = CancellationToken::new(); - // Image layer creation happens on the disk_consistent_lsn so we need to force set it now. - tline.force_set_disk_consistent_lsn(Lsn(0x40)); tline .compact( &cancel, @@ -8332,7 +8317,8 @@ mod tests { ) .await .unwrap(); - // Image layers are created at repartition LSN + + // Image layers are created at last_record_lsn let images = tline .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6931f360a4..e1e3eabb90 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -692,21 +692,6 @@ impl Timeline { // Define partitioning schema if needed - let l0_l1_boundary_lsn = { - // We do the repartition on the L0-L1 boundary. All data below the boundary - // are compacted by L0 with low read amplification, thus making the `repartition` - // function run fast. - let guard = self.layers.read().await; - let l0_min_lsn = guard - .layer_map()? - .level0_deltas() - .iter() - .map(|l| l.get_lsn_range().start) - .min() - .unwrap_or(self.get_disk_consistent_lsn()); - l0_min_lsn.max(self.get_ancestor_lsn()) - }; - // 1. L0 Compact let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); @@ -733,86 +718,79 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } - if l0_l1_boundary_lsn < self.partitioning.read().1 { - // We never go backwards when repartition and create image layers. - info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN."); - } else { - // 2. 
Repartition and create image layers if necessary - match self - .repartition( - l0_l1_boundary_lsn, - self.get_compaction_target_size(), - options.flags, - ctx, - ) - .await - { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); + // 2. Repartition and create image layers if necessary + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + options.flags, + ctx, + ) + .await + { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); - let mut partitioning = dense_partitioning; - partitioning - .parts - .extend(sparse_partitioning.into_dense().parts); + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); - // 3. Create new image layers for partitions that have been modified "enough". - let (image_layers, outcome) = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - self.last_image_layer_creation_status - .load() - .as_ref() - .clone(), - !options.flags.contains(CompactFlags::NoYield), - ) - .await - .inspect_err(|err| { - if let CreateImageLayersError::GetVectoredError( - GetVectoredError::MissingKey(_), - ) = err - { - critical!("missing key during compaction: {err:?}"); - } - })?; + // 3. Create new image layers for partitions that have been modified "enough". + let (image_layers, outcome) = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), + !options.flags.contains(CompactFlags::NoYield), + ) + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; - self.last_image_layer_creation_status - .store(Arc::new(outcome.clone())); + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); - self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { - // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(CompactionOutcome::YieldForL0); - } + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(CompactionOutcome::YieldForL0); } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. 
Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!( - "could not compact, repartitioning keyspace failed: {err:?}" - ); - } + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() && !err.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - }; - } + } + }; let partition_count = self.partitioning.read().0 .0.parts.len(); diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 3ac4ed1a3e..872d3dc4cf 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -20,9 +20,6 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until -@pytest.mark.skip( - reason="We won't create future layers any more after https://github.com/neondatabase/neon/pull/10548" -) @pytest.mark.parametrize( "attach_mode", ["default_generation", "same_generation"], From 274cb13293f20e7206a5a6a88022c67838cd759f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 18 Feb 2025 15:52:00 +0000 Subject: [PATCH 36/78] test_runner: fix mismatch versions tests on linux (#10869) ## Problem Tests with mixed-version binaries always use the latest binaries on CI ([an example](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10848/13378137061/index.html#suites/8fc5d1648d2225380766afde7c428d81/1ccefc4cfd4ef176/)): The versions of new `storage_broker` and old `pageserver` are the same: `b45254a5605f6fdafdf475cdd3e920fe00898543`. This affects only Linux, on macOS the version mixed correctly. ## Summary of changes - Use hardlinks instead of symlinks to create a directory with mixed-version binaries --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c4d4908568..db81e54c49 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -705,7 +705,7 @@ class NeonEnvBuilder: assert self.version_combination is not None, "version combination must be set" # Always use a newer version of `neon_local` - (self.mixdir / "neon_local").symlink_to(self.neon_binpath / "neon_local") + (self.mixdir / "neon_local").hardlink_to(self.neon_binpath / "neon_local") self.neon_local_binpath = self.mixdir for component, paths in COMPONENT_BINARIES.items(): @@ -716,7 +716,7 @@ class NeonEnvBuilder: ) for filename in paths: destination = self.mixdir / filename - destination.symlink_to(directory / filename) + destination.hardlink_to(directory / filename) self.neon_binpath = self.mixdir if self.version_combination["compute"] == "old": From f36ec5c84b06c2f930ce63e131e546df8a6c09cd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Feb 2025 17:56:43 +0200 Subject: [PATCH 37/78] chore(compute): Postgres 17.4, 16.8, 15.12 and 14.17 (#10868) Update all minor versions. No conflicts. 
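A quick way to sanity-check a bump like this (illustrative only, not part of the change) is to confirm that the submodule pins and `vendor/revisions.json` agree:

```bash
# Assumes the postgres submodules are checked out and jq is installed.
for v in 14 15 16 17; do
  pinned=$(git -C "vendor/postgres-v$v" rev-parse HEAD)
  listed=$(jq -r ".v$v[1]" vendor/revisions.json)
  [ "$pinned" = "$listed" ] || echo "v$v mismatch: $pinned != $listed"
done
```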
Postgres repository PRs: - https://github.com/neondatabase/postgres/pull/584 - https://github.com/neondatabase/postgres/pull/583 - https://github.com/neondatabase/postgres/pull/582 - https://github.com/neondatabase/postgres/pull/581 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 16 ++++++++-------- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 62a86dfc91..6254ab9b44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 62a86dfc91e0c35a72f2ea5e99e6969b830c0c26 +Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 80ed91ce25..81e2eef061 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 80ed91ce255c765d25be0bb4a02c942fe6311fbf +Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 999cf81b10..9422247c58 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 999cf81b101ead40e597d5cd729458d8200f4537 +Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 4d3a722312..a8fea8b4be 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 4d3a722312b496ff7378156caa6d41c2e70c30e4 +Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8 diff --git a/vendor/revisions.json b/vendor/revisions.json index 888f09124e..72d97d7f6a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.3", - "4d3a722312b496ff7378156caa6d41c2e70c30e4" + "17.4", + "a8fea8b4be43039f0782347c88a9b9b25f50c9d8" ], "v16": [ - "16.7", - "999cf81b101ead40e597d5cd729458d8200f4537" + "16.8", + "9422247c582e7c1a08a4855d04af0874f8df2f34" ], "v15": [ - "15.11", - "80ed91ce255c765d25be0bb4a02c942fe6311fbf" + "15.12", + "81e2eef0616c65c2233c75b06f25766ae4c080c4" ], "v14": [ - "14.16", - "62a86dfc91e0c35a72f2ea5e99e6969b830c0c26" + "14.17", + "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" ] } From f9a063e2e9b75a60bea9d3a523497ae6992f8b50 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:06:20 -0500 Subject: [PATCH 38/78] test(pageserver): fix test_pageserver_gc_compaction_idempotent (#10833) ## Problem ref https://github.com/neondatabase/neon/issues/10517 ## Summary of changes For some reasons the job split algorithm decides to have different image coverage range for two compactions before/after restart. So we remove the subcompaction key range and let it generate an image covering the full range, which should make the test more stable. Also slightly tuned the logging span. 
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 3 +++ pageserver/src/tenant/timeline/compaction.rs | 10 ++------- test_runner/regress/test_compaction.py | 22 +++++--------------- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bab1a02527..5d917da574 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3101,6 +3101,9 @@ impl Tenant { if let Some(queue) = queue { outcome = queue .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) .await?; } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e1e3eabb90..9e082d74b5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -301,18 +301,12 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.gc_guards.insert(id, gc_guard); } - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; } GcCompactionQueueItem::Notify(id) => { self.notify_and_unblock(id); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f10872590c..c091cd0869 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -236,9 +236,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) @@ -300,6 +298,8 @@ def test_pageserver_gc_compaction_idempotent( workload.churn_rows(row_count, env.pageserver.id) env.create_branch("child_branch") # so that we have a retain_lsn workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch_2") # so that we have another retain_lsn + workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 ps_http.timeline_compact( @@ -315,10 +315,6 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) @@ -336,19 +332,13 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually 
running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, # and the second one should have hit the duplicated layer key warning. @@ -466,9 +456,7 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) From ed98f6d57e9b1baab39f4ab25372193294d60bf7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:06:39 -0500 Subject: [PATCH 39/78] feat(pageserver): log lease request (#10832) ## Problem To investigate https://github.com/neondatabase/cloud/issues/23650 ## Summary of changes We log lease requests to see why there are clients accessing things below gc_cutoff. Signed-off-by: Alex Chi Z --- pageserver/src/page_service.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 53a6a7124d..0c8da6f2a8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1799,6 +1799,13 @@ impl PageServerHandler { .as_millis() .to_string() }); + + info!( + "acquired lease for {} until {}", + lsn, + valid_until_str.as_deref().unwrap_or("") + ); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( From 1a69a8cba71a1f0d8cfaabf9bd4daf880b10ee8f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 18 Feb 2025 16:09:06 +0000 Subject: [PATCH 40/78] storage: add APIs for warming up location after cold migrations (#10788) ## Problem We lack an API for warming up attached locations based on the heatmap contents. This is problematic in two places: 1. If we manually migrate and cut over while the secondary is still cold 2. When we re-attach a previously offloaded tenant ## Summary of changes https://github.com/neondatabase/neon/pull/10597 made heatmap generation additive across migrations, so we won't clobber it a after a cold migration. This allows us to implement: 1. An endpoint for downloading all missing heatmap layers on the pageserver: `/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers`. Only one such operation per timeline is allowed at any given time. The granularity is tenant shard. 2. An endpoint to the storage controller to trigger the downloads on the pageserver: `/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers`. This works both at tenant and tenant shard level. If an unsharded tenant id is provided, the operation is started on all shards, otherwise only the specified shard. 3. A storcon cli command. Again, tenant and tenant-shard level granularities are supported. Cplane will call into storcon and trigger the downloads for all shards. When we want to rescue a migration, we will use storcon cli targeting the specific tenant shard. 
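For illustration, the new routes can be exercised directly (host variables and IDs below are placeholders, and auth headers are omitted; the `storcon_cli` subcommand added here wraps the storage controller route with the same parameters):

```bash
# Warm up all shards of a tenant via the storage controller; pass a
# tenant-shard id instead to restrict the operation to a single shard.
curl -X POST \
  "$STORCON_URL/v1/tenant/$TENANT_ID/timeline/$TIMELINE_ID/download_heatmap_layers?concurrency=16"

# The same path exists on an individual pageserver; DELETE stops an
# in-progress download.
curl -X POST \
  "$PAGESERVER_URL/v1/tenant/$TENANT_SHARD_ID/timeline/$TIMELINE_ID/download_heatmap_layers"
```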
Related: https://github.com/neondatabase/neon/issues/10541 --- control_plane/storcon_cli/src/main.rs | 33 +++- libs/utils/src/shard.rs | 4 + pageserver/client/src/mgmt_api.rs | 20 +++ pageserver/src/http/openapi_spec.yml | 32 ++++ pageserver/src/http/routes.rs | 61 +++++++ pageserver/src/tenant/timeline.rs | 12 ++ .../timeline/heatmap_layers_downloader.rs | 162 ++++++++++++++++++ storage_controller/src/http.rs | 28 +++ storage_controller/src/pageserver_client.rs | 16 ++ storage_controller/src/service.rs | 56 ++++++ test_runner/fixtures/neon_fixtures.py | 8 + .../regress/test_pageserver_secondary.py | 20 ++- 12 files changed, 446 insertions(+), 6 deletions(-) create mode 100644 pageserver/src/tenant/timeline/heatmap_layers_downloader.rs diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 83faf6b4af..3c574efc63 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -22,7 +22,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -239,6 +239,19 @@ enum Command { #[arg(long)] scheduling_policy: SkSchedulingPolicyArg, }, + /// Downloads any missing heatmap layers for all shard for a given timeline + DownloadHeatmapLayers { + /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified, + /// the operation is performed on all shards. When a sharded tenant ID is + /// specified, the operation is only performed on the specified shard. + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + timeline_id: TimelineId, + /// Optional: Maximum download concurrency (default is 16) + #[arg(long)] + concurrency: Option, + }, } #[derive(Parser)] @@ -1247,6 +1260,24 @@ async fn main() -> anyhow::Result<()> { String::from(scheduling_policy) ); } + Command::DownloadHeatmapLayers { + tenant_shard_id, + timeline_id, + concurrency, + } => { + let mut path = format!( + "/v1/tenant/{}/timeline/{}/download_heatmap_layers", + tenant_shard_id, timeline_id, + ); + + if let Some(c) = concurrency { + path = format!("{path}?concurrency={c}"); + } + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 6352ea9f92..d98284f969 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -117,6 +117,10 @@ impl TenantShardId { ) } + pub fn range(&self) -> RangeInclusive { + RangeInclusive::new(*self, *self) + } + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index da7ec5abce..bb0f64ca32 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -477,6 +477,26 @@ impl Client { self.request(Method::POST, &uri, ()).await.map(|_| ()) } + pub async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + )) + .expect("Cannot build URL"); + + if let Some(concurrency) = concurrency { + path.query_pairs_mut() + .append_pair("concurrency", &format!("{}", 
concurrency)); + } + + self.request(Method::POST, path, ()).await.map(|_| ()) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 733115539a..12252739fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -824,6 +824,38 @@ paths: schema: $ref: "#/components/schemas/TenantConfigResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: concurrency + description: Maximum number of concurrent downloads (capped at remote storage concurrency) + in: query + required: false + schema: + type: integer + post: + description: | + Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter + may be used to target all shards of a tenant when the unsharded form is used, or a specific + tenant shard with the sharded form. + responses: + "200": + description: Success + delete: + description: Stop any on-going background downloads of heatmap layers for the specified timeline. + responses: + "200": + description: Success + /v1/utilization: get: description: | diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a0c639a16d..329bf82bde 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1463,6 +1463,59 @@ async fn timeline_layer_scan_disposable_keys( ) } +async fn timeline_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // Only used in the case where remote storage is not configured. + const DEFAULT_MAX_CONCURRENCY: usize = 100; + // A conservative default. 
+ const DEFAULT_CONCURRENCY: usize = 16; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let desired_concurrency = + parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let max_concurrency = get_config(&request) + .remote_storage_config + .as_ref() + .map(|c| c.concurrency_limit()) + .unwrap_or(DEFAULT_MAX_CONCURRENCY); + let concurrency = std::cmp::min(max_concurrency, desired_concurrency); + + timeline.start_heatmap_layers_download(concurrency).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn timeline_shutdown_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + timeline.stop_and_drain_heatmap_layers_download().await; + + json_response(StatusCode::OK, ()) +} + async fn layer_download_handler( request: Request, _cancel: CancellationToken, @@ -3626,6 +3679,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", |r| api_handler(r, layer_map_info_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_download_heatmap_layers_handler), + ) + .delete( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 277dce7761..94b4abb7e9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +mod heatmap_layers_downloader; pub(crate) mod import_pgdata; mod init; pub mod layer_manager; @@ -467,6 +468,10 @@ pub struct Timeline { pub(crate) page_trace: ArcSwapOption>, previous_heatmap: ArcSwapOption, + + /// May host a background Tokio task which downloads all the layers from the current + /// heatmap on demand. + heatmap_layers_downloader: Mutex>, } pub(crate) enum PreviousHeatmap { @@ -2039,6 +2044,11 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // If we have a background task downloading heatmap layers stop it. + // The background downloads are sensitive to timeline cancellation (done above), + // so the drain will be immediate. + self.stop_and_drain_heatmap_layers_download().await; + // Ensure Prevent new page service requests from starting. 
self.handles.shutdown(); @@ -2752,6 +2762,8 @@ impl Timeline { page_trace: Default::default(), previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), + + heatmap_layers_downloader: Mutex::new(None), }; result.repartition_threshold = diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs new file mode 100644 index 0000000000..0ba9753e85 --- /dev/null +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -0,0 +1,162 @@ +//! Timeline utility module to hydrate everything from the current heatmap. +//! +//! Provides utilities to spawn and abort a background task where the downloads happen. +//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. + +use futures::StreamExt; +use http_utils::error::ApiError; +use std::sync::{Arc, Mutex}; +use tokio_util::sync::CancellationToken; +use utils::sync::gate::Gate; + +use super::Timeline; + +// This status is not strictly necessary now, but gives us a nice place +// to store progress information if we ever wish to expose it. +pub(super) enum HeatmapLayersDownloadStatus { + InProgress, + Complete, +} + +pub(super) struct HeatmapLayersDownloader { + handle: tokio::task::JoinHandle<()>, + status: Arc>, + cancel: CancellationToken, + downloads_guard: Arc, +} + +impl HeatmapLayersDownloader { + fn new( + timeline: Arc, + concurrency: usize, + ) -> Result { + let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; + + let cancel = timeline.cancel.child_token(); + let downloads_guard = Arc::new(Gate::default()); + + let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress)); + + let handle = tokio::task::spawn({ + let status = status.clone(); + let downloads_guard = downloads_guard.clone(); + let cancel = cancel.clone(); + + async move { + let _guard = tl_guard; + + scopeguard::defer! { + *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete; + } + + let Some(heatmap) = timeline.generate_heatmap().await else { + tracing::info!("Heatmap layers download failed to generate heatmap"); + return; + }; + + tracing::info!( + resident_size=%timeline.resident_physical_size(), + heatmap_layers=%heatmap.layers.len(), + "Starting heatmap layers download" + ); + + let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + |layer| { + let tl = timeline.clone(); + let dl_guard = match downloads_guard.enter() { + Ok(g) => g, + Err(_) => { + // [`Self::shutdown`] was called. Don't spawn any more downloads. + return None; + } + }; + + Some(async move { + let _dl_guard = dl_guard; + + let res = tl.download_layer(&layer.name).await; + if let Err(err) = res { + if !err.is_cancelled() { + tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") + } + } + }) + } + )).buffered(concurrency); + + tokio::select! { + _ = stream.collect::<()>() => { + tracing::info!( + resident_size=%timeline.resident_physical_size(), + "Heatmap layers download completed" + ); + }, + _ = cancel.cancelled() => { + tracing::info!("Heatmap layers download cancelled"); + } + } + } + }); + + Ok(Self { + status, + handle, + cancel, + downloads_guard, + }) + } + + fn is_complete(&self) -> bool { + matches!( + *self.status.lock().unwrap(), + HeatmapLayersDownloadStatus::Complete + ) + } + + /// Drive any in-progress downloads to completion and stop spawning any new ones. + /// + /// This has two callers and they behave differently + /// 1. 
[`Timeline::shutdown`]: the drain will be immediate since downloads themselves + /// are sensitive to timeline cancellation. + /// + /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress + /// downloads to complete. + async fn stop_and_drain(self) { + // Counterintuitive: close the guard before cancelling. + // Something needs to poll the already created download futures to completion. + // If we cancel first, then the underlying task exits and we lost + // the poller. + self.downloads_guard.close().await; + self.cancel.cancel(); + if let Err(err) = self.handle.await { + tracing::warn!("Failed to join heatmap layer downloader task: {err}"); + } + } +} + +impl Timeline { + pub(crate) async fn start_heatmap_layers_download( + self: &Arc, + concurrency: usize, + ) -> Result<(), ApiError> { + let mut locked = self.heatmap_layers_downloader.lock().unwrap(); + if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { + let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + *locked = Some(dl); + Ok(()) + } else { + Err(ApiError::Conflict("Already running".to_string())) + } + } + + pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) { + // This can race with the start of a new downloader and lead to a situation + // where one donloader is shutting down and another one is in-flight. + // The only impact is that we'd end up using more remote storage semaphore + // units than expected. + let downloader = self.heatmap_layers_downloader.lock().unwrap().take(); + if let Some(dl) = downloader { + dl.stop_and_drain().await; + } + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e3e35a6303..8994721267 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -516,6 +516,24 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_download_heatmap_layers( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let concurrency: Option = parse_query_param(&req, "concurrency")?; + + service + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await?; + + json_response(StatusCode::OK, ()) +} + // For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters // and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to // compare to, so we can just filter out our well known ID format with regexes. 
@@ -2078,6 +2096,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_download_heatmap_layers, + RequestName("v1_tenant_timeline_download_heatmap_layers"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 141ff6f720..645cbdfce1 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -280,6 +280,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + measured_request!( + "download_heatmap_layers", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d5713d49ee..5aa744f076 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -162,6 +162,7 @@ enum TenantOperations { TimelineDetachAncestor, TimelineGcBlockUnblock, DropDetached, + DownloadHeatmapLayers, } #[derive(Clone, strum_macros::Display)] @@ -3757,6 +3758,61 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::DownloadHeatmapLayers, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. + let shards_range = if tenant_shard_id.is_unsharded() { + TenantShardId::tenant_range(tenant_shard_id.tenant_id) + } else { + tenant_shard_id.range() + }; + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + self.tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index db81e54c49..12b096a2a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2467,6 +2467,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + def __enter__(self) -> Self: return self diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8a91a255d8..aa375604f4 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -974,12 +974,22 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # The new layer map should contain all the layers in the pre-migration one # and a new in memory layer - assert len(heatmap_before_migration["timelines"][0]["layers"]) + 1 == len( - heatmap_after_migration["timelines"][0]["layers"] + after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) + assert ( + len(heatmap_before_migration["timelines"][0]["layers"]) + 1 + == after_migration_heatmap_layers_count ) - log.info( - f'Heatmap size after cold migration is {len(heatmap_after_migration["timelines"][0]["layers"])}' + log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - # TODO: Once we have an endpoint for rescuing the cold location, exercise it here. + def all_layers_downloaded(): + local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == after_migration_heatmap_layers_count + + wait_until(all_layers_downloaded) From 381115b68e8060e5601beeb300d723b9ad309fac Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 18 Feb 2025 16:32:32 +0000 Subject: [PATCH 41/78] Add pgaudit and pgauditlogtofile extensions (#10763) to compute image. This commit doesn't enable anything yet. It is a preparatory work for enabling audit logging in computes. 
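As a rough sketch of what the follow-up enablement could look like (nothing below is done by this commit, and the eventual wiring may differ — both libraries have to be preloaded before the extensions are usable):

```bash
# Assumed future steps, for illustration only.
# Compute config, requires a restart:
#   shared_preload_libraries = 'pgaudit, pgauditlogtofile'
psql -c "CREATE EXTENSION pgaudit;"
psql -c "CREATE EXTENSION pgauditlogtofile;"
# Pick the statement classes to audit, e.g. DDL and writes:
psql -c "ALTER SYSTEM SET pgaudit.log = 'ddl, write';"
psql -c "SELECT pg_reload_conf();"
```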
--- compute/compute-node.Dockerfile | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 082dea6f1b..0491abe965 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1509,6 +1509,73 @@ WORKDIR /ext-src/pg_repack-src RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install + +######################################################################################### +# +# Layer "pgaudit" +# compile pgaudit extension +# +######################################################################################### + +FROM build-deps AS pgaudit-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14") \ + export PGAUDIT_VERSION=1.6.2 \ + export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + ;; \ + "v15") \ + export PGAUDIT_VERSION=1.7.0 \ + export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + ;; \ + "v16") \ + export PGAUDIT_VERSION=16.0 \ + export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + ;; \ + "v17") \ + export PGAUDIT_VERSION=17.0 \ + export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + ;; \ + *) \ + echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ + echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgaudit-build +COPY --from=pgaudit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgaudit-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + +######################################################################################### +# +# Layer "pgauditlogtofile" +# compile pgauditlogtofile extension +# +######################################################################################### + +FROM build-deps AS pgauditlogtofile-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PGAUDITLOGTOFILE_VERSION=v1.6.4 \ + export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \ + ;; \ + *) \ + echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \ + echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \ + mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pgauditlogtofile-build +COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgauditlogtofile-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + ######################################################################################### # # Layer "neon-ext-build" @@ -1604,6 +1671,8 @@ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### # From 9151d3a31899d2ce58732b85cf83f83393d7df74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 18 Feb 2025 18:20:03 +0100 Subject: [PATCH 42/78] feat(ci): notify storage oncall if deploy job fails on release branch (#10865) ## Problem If the deploy job on the release branch doesn't succeed, the preprod deployment will not have happened. It was requested that this triggers a notification in https://github.com/neondatabase/neon/issues/10662. ## Summary of changes If we're on the release branch and the deploy job doesn't end up in "success", notify storage oncall on slack. --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 2b96ce95da..5114517e7f 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -28,3 +28,4 @@ config-variables: - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_CICD_CHANNEL_ID + - SLACK_STORAGE_CHANNEL_ID diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bc773600ea..d9bf094aa9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1178,6 +1178,22 @@ jobs: exit 1 fi + notify-storage-release-deploy-failure: + needs: [ deploy ] + # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. + if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + runs-on: ubuntu-22.04 + steps: + - name: Post release-deploy failure to team-storage slack channel + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + text: | + 🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] From cb8060545d24b4abb3973f972ae371f45df12f8c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 18 Feb 2025 18:49:01 +0100 Subject: [PATCH 43/78] pageserver: don't log noop image compaction (#10873) ## Problem We log image compaction stats even when no image compaction happened. This is logged every 10 seconds for every timeline. ## Summary of changes Only log when we actually performed any image compaction. 
--- pageserver/src/tenant/timeline.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 94b4abb7e9..d02ab36e78 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5167,14 +5167,16 @@ impl Timeline { .map(|l| l.metadata().file_size) .sum::(); - info!( - "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", - image_layers.len(), - total_layer_size, - duration.as_secs_f64(), - partition_processed, - total_partitions - ); + if !image_layers.is_empty() { + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + } Ok(( image_layers, From 538ea03f73e7359e475edb6715ebc369ccab12ea Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 13:54:53 -0500 Subject: [PATCH 44/78] feat(pageserver): allow read path debug in getpagelsn API (#10748) ## Problem The usual workflow for me to debug read path errors in staging is: download the tenant to my laptop, import, and then run some read tests. With this patch, we can do this directly over staging pageservers. ## Summary of changes * Add a new `touchpagelsn` API that does a page read but does not return page info back. * Allow read from latest record LSN from get/touchpagelsn * Add read_debug config in the context. * The read path will read the context config to decide whether to enable read path tracing or not. Signed-off-by: Alex Chi Z --- pageserver/src/context.rs | 12 ++++++++ pageserver/src/http/routes.rs | 50 ++++++++++++++++++++++++------- pageserver/src/tenant/timeline.rs | 2 +- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 8f2177fe5b..da9c095a15 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -98,6 +98,7 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + read_path_debug: bool, } /// The kind of access to the page cache. 
@@ -155,6 +156,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + read_path_debug: false, }, } } @@ -168,6 +170,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + read_path_debug: original.read_path_debug, }, } } @@ -191,6 +194,11 @@ impl RequestContextBuilder { self } + pub(crate) fn read_path_debug(mut self, b: bool) -> Self { + self.inner.read_path_debug = b; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -291,4 +299,8 @@ impl RequestContext { pub(crate) fn page_content_kind(&self) -> PageContentKind { self.page_content_kind } + + pub(crate) fn read_path_debug(&self) -> bool { + self.read_path_debug + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 329bf82bde..c2d5c3a933 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use crate::config::PageServerConf; +use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -2571,14 +2572,30 @@ async fn deletion_queue_flush( } } -/// Try if `GetPage@Lsn` is successful, useful for manual debugging. async fn getpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(false, request, cancel).await +} + +async fn touchpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(true, request, cancel).await +} + +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. +async fn getpage_at_lsn_handler_inner( + touch: bool, request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_shard_id.tenant_id))?; + // Require pageserver admin permission for this API instead of only tenant-level token. + check_permission(&request, None)?; let state = get_state(&request); struct Key(pageserver_api::key::Key); @@ -2593,22 +2610,29 @@ async fn getpage_at_lsn_handler( let key: Key = parse_query_param(&request, "key")? .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?; - let lsn: Lsn = parse_query_param(&request, "lsn")? 
- .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn: Option = parse_query_param(&request, "lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + // Enable read path debugging + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + // Use last_record_lsn if no lsn is provided + let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let page = timeline.get(key.0, lsn, &ctx).await?; - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + if touch { + json_response(StatusCode::OK, ()) + } else { + Result::<_, ApiError>::Ok( + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(page)) + .unwrap(), + ) + } } .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await @@ -3743,6 +3767,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", + |r| api_handler(r, touchpage_at_lsn_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| api_handler(r, timeline_collect_keyspace), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d02ab36e78..582825e890 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1298,7 +1298,7 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let read_path = if self.conf.enable_read_path_debugging { + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new(keyspace.clone(), lsn)) } else { None From 9d074db18db8c8a05df02f54563140e9cb2b7a63 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 18 Feb 2025 20:54:21 +0100 Subject: [PATCH 45/78] Use link to cross-service-endpoint dashboard in allure reports and benchmarking workflow logs (#10874) ## Problem We have links to deprecated dashboards in our logs Example https://github.com/neondatabase/neon/actions/runs/13382454571/job/37401983608#step:8:348 ## Summary of changes Use link to cross service endpoint instead. 
Example: https://github.com/neondatabase/neon/actions/runs/13395407925/job/37413056148#step:7:345 --- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/fixtures/utils.py | 82 +++++++++++---------------- 2 files changed, 35 insertions(+), 51 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 12b096a2a0..58c5dbfd29 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -96,7 +96,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, USE_LFC, - allure_add_grafana_links, + allure_add_grafana_link, assert_no_errors, get_dir_size, print_gc_result, @@ -3255,7 +3255,7 @@ def remote_pg( end_ms = int(datetime.utcnow().timestamp() * 1000) if is_neon: # Add 10s margin to the start and end times - allure_add_grafana_links( + allure_add_grafana_link( host, timeline_id, start_ms - 10_000, diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 2a59eab710..84d62fb877 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -312,62 +312,46 @@ def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): GRAFANA_URL = "https://neonprod.grafana.net" -GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore" -GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector" -LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz" +GRAFANA_DASHBOARD_URL = f"{GRAFANA_URL}/d/cdya0okb81zwga/cross-service-endpoint-debugging" -def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): - """Add links to server logs in Grafana to Allure report""" - links: dict[str, str] = {} - # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build +def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): + """ + Add a link to the cross-service endpoint debugging dashboard in Grafana to Allure report. + + Args: + host (str): The host string in the format 'ep-..'. + timeline_id (TimelineId): The timeline identifier for the Grafana dashboard. + (currently ignored but may be needed in future verions of the dashboard) + start_ms (int): The start time in milliseconds for the Grafana dashboard. + end_ms (int): The end time in milliseconds for the Grafana dashboard. 
+ + Example: + Given + host = '' + timeline_id = '996926d1f5ddbe7381b8840083f8fc9a' + + The generated link would be something like: + https://neonprod.grafana.net/d/cdya0okb81zwga/cross-service-endpoint-debugging?orgId=1&from=2025-02-17T21:10:00.000Z&to=2025-02-17T21:20:00.000Z&timezone=utc&var-env=dev%7Cstaging&var-input_endpoint_id=ep-holy-mouse-w2u462gi + + """ + # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) - expressions = { - "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', - "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', - "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', - "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + params = { + "orgId": 1, + "from": start_ms, + "to": end_ms, + "timezone": "utc", + "var-env": "dev|staging", + "var-input_endpoint_id": endpoint_id, } - params: dict[str, Any] = { - "datasource": LOGS_STAGING_DATASOURCE_ID, - "queries": [ - { - "expr": "", - "refId": "A", - "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID}, - "editorMode": "code", - "queryType": "range", - } - ], - "range": { - "from": str(start_ms), - "to": str(end_ms), - }, - } - for name, expr in expressions.items(): - params["queries"][0]["expr"] = expr - query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) - links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}" + query_string = urlencode(params) + link = f"{GRAFANA_DASHBOARD_URL}?{query_string}" - timeline_qs = urlencode( - { - "orgId": 1, - "var-environment": "victoria-metrics-aws-dev", - "var-timeline_id": timeline_id, - "var-endpoint_id": endpoint_id, - "var-log_datasource": "grafanacloud-neonstaging-logs", - "from": start_ms, - "to": end_ms, - } - ) - link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}" - links["Timeline Inspector"] = link - - for name, link in links.items(): - allure.dynamic.link(link, name=name) - log.info(f"{name}: {link}") + allure.dynamic.link(link, name="Cross-Service Endpoint Debugging") + log.info(f"Cross-Service Endpoint Debugging: {link}") def start_in_background( From a4e3989c8da1bf0dc8a88b35a010055c29afd43f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 15:19:23 -0500 Subject: [PATCH 46/78] fix(pageserver): make repartition error critical (#10872) ## Problem Read errors during repartition should be a critical error. ## Summary of changes We only have one call site We have two call sites of `repartition` where one of them is during the initial image upload optimization and another is during image layer creation, so I added a `critical!` here instead of inside `collect_keyspace`. 
--------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 6 ++++++ pageserver/src/tenant/tasks.rs | 3 +++ pageserver/src/tenant/timeline.rs | 9 +++++++-- pageserver/src/tenant/timeline/compaction.rs | 16 ++++++++++++++-- 5 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c2d5c3a933..56a84a98a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2395,6 +2395,7 @@ async fn timeline_checkpoint_handler( match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), + CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5d917da574..efb35625f2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3150,6 +3150,12 @@ impl Tenant { // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} + CompactionError::CollectKeySpaceError(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 029444e973..5e63f59fd8 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -287,6 +287,7 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { + use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::tenant::upload_queue::NotInitialized; use crate::tenant::PageReconstructError; use CompactionError::*; @@ -294,6 +295,8 @@ fn log_compaction_error( let level = match err { ShuttingDown => return, Offload(_) => Level::ERROR, + CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 582825e890..ea966d2b43 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1881,7 +1881,7 @@ impl Timeline { // Signal compaction failure to avoid L0 flush stalls when it's broken. match result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) => { + Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to @@ -4604,7 +4604,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(CompactionError::CollectKeySpaceError)?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -5319,6 +5322,8 @@ pub(crate) enum CompactionError { #[error("Failed to offload timeline: {0}")] Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. 
+ #[error("Failed to collect keyspace: {0}")] + CollectKeySpaceError(CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9e082d74b5..4e4f906d78 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, + ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -31,6 +32,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -781,7 +783,17 @@ impl Timeline { // // Suppress error when it's due to cancellation if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + if let CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), + ) = err + { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); + } else { + tracing::error!( + "could not compact, repartitioning keyspace failed: {err:?}" + ); + } } } }; From 7199919f04887a993d07df5d556ffa0c2b5ee251 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 19 Feb 2025 07:40:09 +0100 Subject: [PATCH 47/78] Fix the problems discovered in the upgrade test (#10826) ## Problem The nightly test discovered problems in the extensions upgrade test. 1. `PLv8` has different versions on PGv17 and PGv16 and a different test set, which was not implemented correctly [sample](https://github.com/neondatabase/neon/actions/runs/13382330475/job/37372930271) 2. The same for `semver` [sample](https://github.com/neondatabase/neon/actions/runs/13382330475/job/37372930017) 3. `pgtap` interfered with the other tests, e.g. tables, created by other extensions caused the tests to fail. ## Summary of changes The discovered problems were fixed. 1. The tests list for `PLv8` is now generated using the original Makefile 2. The patches for `semver` are now split for PGv16 and PGv17. 3. `pgtap` is being tested in a separate database now. 
--------- Co-authored-by: Mikhail Kot --- compute/compute-node.Dockerfile | 3 +++ ...st-upgrade.patch => test-upgrade-16.patch} | 0 .../pg_semver-src/test-upgrade-17.patch | 24 +++++++++++++++++++ .../ext-src/pg_semver-src/test-upgrade.sh | 3 ++- .../ext-src/pgtap-src/test-upgrade.patch | 13 ++++++++++ .../ext-src/pgtap-src/test-upgrade.sh | 3 +-- .../ext-src/plv8-src/test-upgrade.sh | 3 ++- docker-compose/test_extensions_upgrade.sh | 2 ++ 8 files changed, 47 insertions(+), 4 deletions(-) rename docker-compose/ext-src/pg_semver-src/{test-upgrade.patch => test-upgrade-16.patch} (100%) create mode 100644 docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0491abe965..0b3001613d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1847,11 +1847,14 @@ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_repack-src /ext-src/ /ext-src/ COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh +RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl\ + && apt clean && rm -rf /ext-src/*.tar.gz /var/lib/apt/lists/* ENV PATH=/usr/local/pgsql/bin:$PATH ENV PGHOST=compute ENV PGPORT=55433 ENV PGUSER=cloud_admin ENV PGDATABASE=postgres +ENV PG_VERSION=${PG_VERSION:?} ######################################################################################### # diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch new file mode 100644 index 0000000000..2d0bf280db --- /dev/null +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch @@ -0,0 +1,24 @@ +diff --git a/test/sql/base.sql b/test/sql/base.sql +index 53adb30..2eed91b 100644 +--- a/test/sql/base.sql ++++ b/test/sql/base.sql +@@ -2,7 +2,6 @@ + BEGIN; + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(334); + --SELECT * FROM no_plan(); +diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql +index c0fe98e..39cdd2e 100644 +--- a/test/sql/corpus.sql ++++ b/test/sql/corpus.sql +@@ -4,7 +4,6 @@ BEGIN; + -- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/. + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(76); + --SELECT * FROM no_plan(); diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh index e1541f272a..18b2848fd1 100755 --- a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh @@ -1,6 +1,7 @@ #!/bin/sh set -ex cd "$(dirname ${0})" -patch -p1 Date: Wed, 19 Feb 2025 09:43:53 +0100 Subject: [PATCH 48/78] add a variant to ingest benchmark with shard-splitting disabled (#10876) ## Problem we measure ingest performance for a few variants (stripe-sizes, pre-sharded, shard-splitted). However some phenomena (e.g. related to L0 compaction) in PS can be better observed and optimized with un-sharded tenants. 
## Summary of changes - Allow to create projects with a policy that disables sharding (`{"scheduling": "Essential"}`) - add a variant to ingest_benchmark that uses that policy for the new project ## Test run https://github.com/neondatabase/neon/actions/runs/13396325970 --- .../actions/neon-project-create/action.yml | 22 ++++++++++++++++++- .github/workflows/ingest_benchmark.yml | 10 +++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index c9f6b0832e..a393aa6106 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -19,7 +19,11 @@ inputs: default: '[1, 1]' # settings below only needed if you want the project to be sharded from the beginning shard_split_project: - description: 'by default new projects are not shard-split, specify true to shard-split' + description: 'by default new projects are not shard-split initiailly, but only when shard-split threshold is reached, specify true to explicitly shard-split initially' + required: false + default: 'false' + disable_sharding: + description: 'by default new projects use storage controller default policy to shard-split when shard-split threshold is reached, specify true to explicitly disable sharding' required: false default: 'false' admin_api_key: @@ -107,6 +111,21 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" fi + if [ "${DISABLE_SHARDING}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" + echo "with body {\"scheduling\": \"Essential\"}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"scheduling\": \"Essential\"}" + fi env: API_HOST: ${{ inputs.api_host }} @@ -116,6 +135,7 @@ runs: MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + DISABLE_SHARDING: ${{ inputs.disable_sharding }} ADMIN_API_KEY: ${{ inputs.admin_api_key }} SHARD_COUNT: ${{ inputs.shard_count }} STRIPE_SIZE: ${{ inputs.stripe_size }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 7b303fa37a..c20c5890f9 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -32,18 +32,27 @@ jobs: - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 + disable_sharding: false - target_project: new_empty_project_stripe_size_32768 stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold # while here it is sharded from the beginning with a shard size of 256 MiB + disable_sharding: false postgres_version: 16 - target_project: 
new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 17 - target_project: large_existing_project stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project_unsharded + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: true postgres_version: 16 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: @@ -96,6 +105,7 @@ jobs: admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} From aa115a774cb4e2245399c9994a42ea05a1fd9ddd Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 10:01:02 +0100 Subject: [PATCH 49/78] storcon: eagerly attempt autosplits (#10849) ## Problem Autosplits are crucial for bulk ingest performance. However, autosplits were only attempted when there was no other pending work. This could cause e.g. mass AZ affinity violations following Pageserver restarts to starve out autosplits for hours. Resolves #10762. ## Summary of changes Always attempt autosplits in the background reconciliation loop, regardless of other pending work. --- storage_controller/src/service.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 5aa744f076..dd4d93dc84 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1031,12 +1031,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - let optimizations = self.optimize_all().await; - if optimizations == 0 { - // Run new splits only when no optimizations are pending - self.autosplit_tenants().await; - } + self.optimize_all().await; } + // Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we + // must be responsive when new projects begin ingesting and reach the threshold. + self.autosplit_tenants().await; } _ = self.reconcilers_cancel.cancelled() => return } From e52e93797fbb930da8cf7139e150d748920182f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 19 Feb 2025 13:34:41 +0100 Subject: [PATCH 50/78] refactor(ci): use variables for AWS account IDs (#10886) ## Problem Our AWS account IDs are copy-pasted all over the place. A wrong paste might only be caught late if we hardcode them, but will get flagged instantly by actionlint if we access them from github actions variables. Resolves https://github.com/neondatabase/neon/issues/10787, follow-up for https://github.com/neondatabase/neon/pull/10613. ## Summary of changes Access AWS account IDs using Github Actions variables. 
--- .github/actionlint.yml | 3 +++ .../workflows/_push-to-container-registry.yml | 2 +- .github/workflows/build_and_test.yml | 23 +++++++++++-------- .../build_and_test_with_sanitizers.yml | 2 +- .github/workflows/pin-build-tools-image.yml | 6 ++--- scripts/generate_image_maps.py | 7 ++++-- 6 files changed, 26 insertions(+), 17 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 5114517e7f..1e6c2d0aa2 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -29,3 +29,6 @@ config-variables: - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_CICD_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID + - NEON_DEV_AWS_ACCOUNT_ID + - NEON_PROD_AWS_ACCOUNT_ID + - AWS_ECR_REGION diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 3c97c8a67a..c938f62ad5 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -2,7 +2,7 @@ name: Push images to Container Registry on: workflow_call: inputs: - # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} image-map: description: JSON map of images, mapping from a source image to an array of target images that should be pushed. required: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d9bf094aa9..f08280e112 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -68,7 +68,7 @@ jobs: tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -859,14 +859,17 @@ jobs: BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" + DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: needs: [ generate-image-maps, neon-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -881,8 +884,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -898,8 +901,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' - 
aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -915,8 +918,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' - aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -1029,7 +1032,7 @@ jobs: statuses: write contents: write runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index 2bc938509f..e40b02b5d2 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -27,7 +27,7 @@ env: jobs: tag: runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 626de2b0e0..8861c1f093 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -78,7 +78,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-region: eu-central-1 + aws-region: ${{ vars.AWS_ECR_REGION }} role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 3600 @@ -104,12 +104,12 @@ jobs: tags=() tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") + tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}") tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") fi diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py index a2f553d290..915eb33673 100644 --- a/scripts/generate_image_maps.py +++ b/scripts/generate_image_maps.py @@ -6,6 +6,9 @@ build_tag = os.environ["BUILD_TAG"] branch = os.environ["BRANCH"] dev_acr = os.environ["DEV_ACR"] prod_acr = os.environ["PROD_ACR"] +dev_aws = os.environ["DEV_AWS"] +prod_aws = os.environ["PROD_AWS"] +aws_region = os.environ["AWS_REGION"] components = { "neon": ["neon"], @@ -24,11 +27,11 @@ components = { registries = { "dev": [ 
"docker.io/neondatabase", - "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], "prod": [ - "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{prod_acr}.azurecr.io/neondatabase", ], } From 2d96134a4ee963cb9fd76ad46d8613ee1204654a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 19 Feb 2025 16:09:01 +0200 Subject: [PATCH 51/78] Remove unused dependencies (#10887) Per cargo machete. --- Cargo.lock | 7 ------- compute_tools/Cargo.toml | 3 --- libs/proxy/tokio-postgres2/Cargo.toml | 5 +---- libs/utils/Cargo.toml | 1 - 4 files changed, 1 insertion(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12c12bc771..1cab85adb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1316,7 +1316,6 @@ dependencies = [ "flate2", "futures", "http 1.1.0", - "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -1326,7 +1325,6 @@ dependencies = [ "opentelemetry_sdk", "postgres", "postgres_initdb", - "prometheus", "regex", "remote_storage", "reqwest", @@ -1345,7 +1343,6 @@ dependencies = [ "tower 0.5.2", "tower-http", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -7021,14 +7018,11 @@ dependencies = [ name = "tokio-postgres2" version = "0.1.0" dependencies = [ - "async-trait", - "byteorder", "bytes", "fallible-iterator", "futures-util", "log", "parking_lot 0.12.1", - "percent-encoding", "phf", "pin-project-lite", "postgres-protocol2", @@ -7615,7 +7609,6 @@ dependencies = [ "hex", "hex-literal", "humantime", - "inferno 0.12.0", "jsonwebtoken", "metrics", "nix 0.27.1", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 81dcf99560..c276996df5 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -25,7 +25,6 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true -jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -48,13 +47,11 @@ tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true tracing.workspace = true -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true uuid.workspace = true -prometheus.workspace = true walkdir.workspace = true postgres_initdb.workspace = true diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index ade0ffc9f6..161c6b8309 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -5,18 +5,15 @@ edition = "2021" license = "MIT/Apache-2.0" [dependencies] -async-trait.workspace = true bytes.workspace = true -byteorder.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } log = "0.4" parking_lot.workspace = true -percent-encoding = "2.0" pin-project-lite.workspace = true phf = "0.11" postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } -serde = { workspace = true, features = ["derive"] } \ No newline at end of file +serde = { workspace = true, features = ["derive"] } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index e9611a0f12..62e0f4cfba 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -24,7 +24,6 
@@ diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -inferno.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true From 0453eaf65c9328a49720db2af9747a6a8df01872 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 15:12:05 +0100 Subject: [PATCH 52/78] pageserver: reduce default `compaction_upper_limit` to 20 (#10889) ## Problem We've seen the previous default of 50 cause OOMs. Compacting many L0 layers at once now has limited benefit, since the cost is mostly linear anyway. This is already being reduced to 20 in production settings. ## Summary of changes Reduce `DEFAULT_COMPACTION_UPPER_LIMIT` to 20. Once released, let's remove the config overrides. --- libs/pageserver_api/src/config.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e64052c73d..0f33bcf45b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -544,10 +544,11 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on - // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole - // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. - pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's + // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could + // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So + // with this config, we can get a maximum peak compaction usage of 9 GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; From 3720cf1c5aed3aa8b50cd8d5c85572a51a01e766 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 15:20:51 +0100 Subject: [PATCH 53/78] storcon: use jemalloc (#10892) ## Problem We'd like to enable CPU/heap profiling for storcon. This requires jemalloc. ## Summary of changes Use jemalloc as the global allocator, and enable heap sampling for profiling. 
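As a reference, the change boils down to the following pattern, shown here as a minimal standalone program rather than the real storcon binary. It assumes `tikv-jemallocator` is a dependency built with its profiling feature enabled (the Cargo changes in the diff below add the crate); without that feature jemalloc simply ignores the `prof:*` options.

```rust
// Minimal standalone illustration of the allocator setup introduced below.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

// jemalloc reads this exported symbol at startup; lg_prof_sample is log2 of the
// sampling interval in bytes, so 21 means one sampled stack trace per ~2 MiB allocated.
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";

fn main() {
    // Allocate something so the sampling profiler has work to observe.
    let v: Vec<u8> = vec![0; 4 * 1024 * 1024];
    println!("allocated {} bytes under jemalloc", v.len());
}
```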
--- Cargo.lock | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/main.rs | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 1cab85adb3..12232eaece 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6465,6 +6465,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index a93bbdeaaf..73dc1a5c10 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -34,6 +34,7 @@ reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true +tikv-jemallocator.workspace = true regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ea6bc38e89..9a9958f7a6 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -27,6 +27,16 @@ use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] From aab5482fd5fc43b0c092e22c0cab0e86b8655673 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 15:43:29 +0100 Subject: [PATCH 54/78] storcon: add CPU/heap profiling endpoints (#10894) Adds CPU/heap profiling for storcon. Also fixes allowlists to match on the path only, since profiling endpoints take query parameters. Requires #10892 for heap profiling. 
--- storage_controller/src/http.rs | 62 +++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8994721267..1cc61a12e8 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -9,7 +9,10 @@ use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECON use anyhow::Context; use futures::Future; use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, request_span}, + endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, + }, error::ApiError, failpoints::failpoints_handler, json::{json_request, json_response}, @@ -54,7 +57,7 @@ pub struct HttpState { service: Arc, auth: Option>, neon_metrics: NeonMetrics, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], } impl HttpState { @@ -63,15 +66,17 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { - let allowlist_routes = ["/status", "/ready", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); Self { service, auth, neon_metrics: NeonMetrics::new(build_info), - allowlist_routes, + allowlist_routes: &[ + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ], } } } @@ -1416,23 +1421,26 @@ pub fn prologue_leadership_status_check_middleware< let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - enum AllowedRoutes<'a> { + enum AllowedRoutes { All, - Some(Vec<&'a str>), + Some(&'static [&'static str]), } let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, LeadershipStatus::SteppedDown => AllowedRoutes::All, - LeadershipStatus::Candidate => { - AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::Candidate => AllowedRoutes::Some(&[ + "/ready", + "/status", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]), }; - let uri = req.uri().to_string(); match allowed_routes { AllowedRoutes::All => Ok(req), - AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&req.uri().path()) => Ok(req), _ => { tracing::info!( "Request {} not allowed due to current leadership state", @@ -1541,7 +1549,8 @@ enum ForwardOutcome { /// Potentially forward the request to the current storage controler leader. /// More specifically we forward when: -/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 1. Request is not one of: +/// ["/control/v1/step_down", "/status", "/ready", "/metrics", "/profile/cpu", "/profile/heap"] /// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state /// 3. There is a leader in the database to forward to /// 4. Leader from step (3) is not the current instance @@ -1562,10 +1571,17 @@ enum ForwardOutcome { /// Hence, if we are in the edge case scenario the leader persisted in the database is the /// stepped down instance that received the request. Condition (4) above covers this scenario. 
async fn maybe_forward(req: Request) -> ForwardOutcome { - const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + const NOT_FOR_FORWARD: &[&str] = &[ + "/control/v1/step_down", + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]; - let uri = req.uri().to_string(); - let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + let uri = req.uri(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.path()); // Fast return before trying to take any Service locks, if we will never forward anyway if !uri_for_forward { @@ -1765,7 +1781,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -1778,13 +1794,19 @@ pub fn make_router( .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) - // Non-prefixed generic endpoints (status, metrics) + // Non-prefixed generic endpoints (status, metrics, profiling) .get("/status", |r| { named_request_span(r, handle_status, RequestName("status")) }) .get("/ready", |r| { named_request_span(r, handle_ready, RequestName("ready")) }) + .get("/profile/cpu", |r| { + named_request_span(r, profile_cpu_handler, RequestName("profile_cpu")) + }) + .get("/profile/heap", |r| { + named_request_span(r, profile_heap_handler, RequestName("profile_heap")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) From 1f9511dbd9570c90efca17e4322987db1e209014 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 19 Feb 2025 10:10:12 -0500 Subject: [PATCH 55/78] feat(pageserver): yield image creation to L0 compactions across timelines (#10877) ## Problem A simpler version of https://github.com/neondatabase/neon/pull/10812 ## Summary of changes Image layer creation will be preempted by L0 accumulated on other timelines. We stop image layer generation if there's a pending L0 compaction request. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ea966d2b43..48c208d5d7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,6 +22,7 @@ use chrono::{DateTime, Utc}; use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; use layer_manager::Shutdown; @@ -5128,20 +5129,26 @@ impl Timeline { // image layer generation taking too long time and blocking L0 compaction. So in this // mode, we also inspect the current number of L0 layers and skip image layer generation // if there are too many of them. 
- let num_of_l0_layers = { - let layers = self.layers.read().await; - layers.layer_map()?.level0_deltas().len() - }; let image_preempt_threshold = self.get_image_creation_preempt_threshold() * self.get_compaction_threshold(); - if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { - tracing::info!( - "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", - partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers - ); - last_partition_processed = Some(partition.clone()); - all_generated = false; - break; + // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield + // when there is a single timeline with more than L0 threshold L0 layers. As long as the + // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction. + if image_preempt_threshold != 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", + partition.start().unwrap(), partition.end().unwrap() + ); + last_partition_processed = Some(partition.clone()); + all_generated = false; + break; + } } } } From 9ba2a87e69880f1bad63bcf3cd433eee054919dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 19 Feb 2025 17:57:11 +0100 Subject: [PATCH 56/78] storcon: sk heartbeat fixes (#10891) This PR does the following things: * The initial heartbeat round blocks the storage controller from becoming online again. If all safekeepers are unresponsive, this can cause storage controller startup to be very slow. The original intent of #10583 was that heartbeats don't affect normal functionality of the storage controller. So add a short timeout to prevent it from impeding storcon functionality. * Fix the URL of the utilization endpoint. * Don't send heartbeats to safekeepers which are decomissioned. 
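A minimal sketch of the timeout idea from the first bullet above, using `tokio::time::timeout`. The five-second bound and the stubbed heartbeat fan-out are illustrative only (not the real storcon code), and the example assumes tokio with the `rt`, `macros` and `time` features.

```rust
use std::time::Duration;
use tokio::time::timeout;

// Stand-in for a heartbeat fan-out that can hang when safekeepers are unresponsive.
async fn initial_safekeeper_heartbeats() -> Vec<&'static str> {
    tokio::time::sleep(Duration::from_secs(3600)).await;
    vec!["sk-1", "sk-2"]
}

#[tokio::main]
async fn main() {
    // Bound the initial round so startup is not blocked by dead safekeepers.
    match timeout(Duration::from_secs(5), initial_safekeeper_heartbeats()).await {
        Ok(responses) => println!("initial heartbeats: {responses:?}"),
        Err(_elapsed) => println!("initial heartbeats timed out; continuing startup anyway"),
    }
}
```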
Part of https://github.com/neondatabase/neon/issues/9011 context: https://neondb.slack.com/archives/C033RQ5SPDH/p1739966807592589 --- safekeeper/client/src/mgmt_api.rs | 2 +- storage_controller/src/heartbeater.rs | 8 +++++++- storage_controller/src/safekeeper.rs | 16 ++++++++++++---- storage_controller/src/service.rs | 8 +++++--- test_runner/regress/test_storage_controller.py | 14 +++++++++++--- 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 40e5afc4aa..5c305769dd 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -137,7 +137,7 @@ impl Client { } pub async fn utilization(&self) -> Result { - let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint); + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; resp.json().await.map_err(Error::ReceiveBody) } diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 6f110d3294..1f20326398 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -10,7 +10,10 @@ use std::{ }; use tokio_util::sync::CancellationToken; -use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; +use pageserver_api::{ + controller_api::{NodeAvailability, SkSchedulingPolicy}, + models::PageserverUtilization, +}; use thiserror::Error; use utils::{id::NodeId, logging::SecretString}; @@ -311,6 +314,9 @@ impl HeartBeat for HeartbeaterTask Self { + let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); Self { cancel, listen_http_addr: skp.host.clone(), @@ -31,6 +33,7 @@ impl Safekeeper { id: NodeId(skp.id as u64), skp, availability: SafekeeperState::Offline, + scheduling_policy, } } pub(crate) fn base_url(&self) -> String { @@ -46,6 +49,13 @@ impl Safekeeper { pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { self.availability = availability; } + pub(crate) fn scheduling_policy(&self) -> SkSchedulingPolicy { + self.scheduling_policy + } + pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { + self.scheduling_policy = scheduling_policy; + self.skp.scheduling_policy = String::from(scheduling_policy); + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries pub(crate) async fn with_client_retries( &self, @@ -129,10 +139,8 @@ impl Safekeeper { self.id.0 ); } - self.skp = crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(), - ); + self.skp = + crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); self.listen_http_port = http_port as u16; self.listen_http_addr = host; } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index dd4d93dc84..f47dd72579 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -819,7 +819,9 @@ impl Service { .heartbeater_ps .heartbeat(Arc::new(nodes_to_heartbeat)) .await; - let res_sk = self.heartbeater_sk.heartbeat(all_sks).await; + // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime + const SK_TIMEOUT: Duration = Duration::from_secs(5); + let res_sk = tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)).await; let mut online_nodes = HashMap::new(); if let Ok(deltas) = res_ps { @@ 
-837,7 +839,7 @@ impl Service { } let mut online_sks = HashMap::new(); - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { for (node_id, status) in deltas.0 { match status { SafekeeperState::Available { @@ -7960,7 +7962,7 @@ impl Service { let sk = safekeepers .get_mut(&node_id) .ok_or(DatabaseError::Logical("Not found".to_string()))?; - sk.skp.scheduling_policy = String::from(scheduling_policy); + sk.set_scheduling_policy(scheduling_policy); locked.safekeepers = Arc::new(safekeepers); } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 88d30308f7..1d95312140 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3238,12 +3238,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): newest_info = target.get_safekeeper(inserted["id"]) assert newest_info assert newest_info["scheduling_policy"] == "Pause" - target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + target.safekeeper_scheduling_policy(inserted["id"], "Active") newest_info = target.get_safekeeper(inserted["id"]) assert newest_info - assert newest_info["scheduling_policy"] == "Decomissioned" + assert newest_info["scheduling_policy"] == "Active" # Ensure idempotency - target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + target.safekeeper_scheduling_policy(inserted["id"], "Active") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Active" + # change back to paused again + target.safekeeper_scheduling_policy(inserted["id"], "Pause") def storcon_heartbeat(): assert env.storage_controller.log_contains( @@ -3252,6 +3257,9 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): wait_until(storcon_heartbeat) + # Now decomission it + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] From 0b3db74c44f0309b0ae6721ae721a71358dc8bc1 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 18:11:12 +0100 Subject: [PATCH 57/78] libs: remove unnecessary regex in `pprof::symbolize` (#10893) `pprof::symbolize()` used a regex to strip the Rust monomorphization suffix from generic methods. However, the `backtrace` crate can do this itself if formatted with the `:#` flag. Also tighten up the code a bit. --- libs/http-utils/src/pprof.rs | 37 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs index dd57f9ed4b..fe1cc10838 100644 --- a/libs/http-utils/src/pprof.rs +++ b/libs/http-utils/src/pprof.rs @@ -2,7 +2,6 @@ use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; -use once_cell::sync::Lazy; use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; @@ -58,38 +57,30 @@ pub fn symbolize(mut profile: Profile) -> anyhow::Result { // Resolve the line and function for each location. backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symname) = symbol.name() else { + let Some(symbol_name) = symbol.name() else { return; }; - let mut name = symname.to_string(); - // Strip the Rust monomorphization suffix from the symbol name. 
- static SUFFIX_REGEX: Lazy = - Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); - if let Some(m) = SUFFIX_REGEX.find(&name) { - name.truncate(m.start()); - } - - let function_id = match functions.get(&name) { - Some(function) => function.id, - None => { - let id = functions.len() as u64 + 1; - let system_name = String::from_utf8_lossy(symname.as_bytes()); + let function_name = format!("{symbol_name:#}"); + let functions_len = functions.len(); + let function_id = functions + .entry(function_name) + .or_insert_with_key(|function_name| { + let function_id = functions_len as u64 + 1; + let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); let filename = symbol .filename() .map(|path| path.to_string_lossy()) .unwrap_or(Cow::Borrowed("")); - let function = Function { - id, - name: string_id(&name), + Function { + id: function_id, + name: string_id(function_name), system_name: string_id(&system_name), filename: string_id(&filename), ..Default::default() - }; - functions.insert(name, function); - id - } - }; + } + }) + .id; loc.line.push(Line { function_id, line: symbol.lineno().unwrap_or(0) as i64, From aad817d80678714d131973cc3c747be1b2c9c8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 19 Feb 2025 18:26:09 +0100 Subject: [PATCH 58/78] refactor(ci): use reusable push-to-container-registry workflow for pinning the build-tools image (#10890) ## Problem Pinning build tools still replicated the ACR/ECR/Docker Hub login and pushing, even though we have a reusable workflow for this. Was mentioned as a TODO in https://github.com/neondatabase/neon/pull/10613. ## Summary of changes Reuse `_push-to-container-registry.yml` for pinning the build-tools images. --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/pin-build-tools-image.yml | 92 ++++++++------------- 2 files changed, 36 insertions(+), 58 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f08280e112..8f3392ceea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1293,7 +1293,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] + needs: [ build-build-tools-image, test-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 8861c1f093..b305b662ee 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -33,10 +33,6 @@ concurrency: # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} -env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned - jobs: check-manifests: runs-on: ubuntu-22.04 @@ -46,11 +42,14 @@ jobs: steps: - name: Check if we really need to pin the image id: check-manifests + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned run: | - docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json - docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" - if diff ${FROM_TAG}.json ${TO_TAG}.json; then + if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true else skip=false @@ -64,55 +63,34 @@ jobs: # use format(..) to catch both inputs.force = true AND inputs.force = 'true' if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' - runs-on: ubuntu-22.04 - permissions: - id-token: write # for `azure/login` and aws auth + id-token: write # Required for aws/azure login - steps: - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ vars.AWS_ECR_REGION }} - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - run: | - az acr login --name=neoneastus2 - - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR - env: - DEFAULT_DEBIAN_VERSION: bookworm - run: | - for debian_version in bullseye bookworm; do - tags=() - - tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") - - if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") - fi - - docker buildx imagetools create "${tags[@]}" \ - neondatabase/build-tools:${FROM_TAG}-${debian_version} - done + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "docker.io/neondatabase/build-tools:pinned-bullseye", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" + ], + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "docker.io/neondatabase/build-tools:pinned-bookworm", + "docker.io/neondatabase/build-tools:pinned", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", + "${{ 
vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned" + ] + } + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} From f148d71d9bf344230159a941cb20a6b804acec9e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 19 Feb 2025 19:30:17 +0000 Subject: [PATCH 59/78] test: disable background heatmap uploads and downloads in cold migration test (#10895) ## Problem Background heatmap uploads and downloads were blocking the ones done manually by the test. ## Summary of changes Disable Background heatmap uploads and downloads for the cold migration test. The test does them explicitly. --- test_runner/regress/test_pageserver_secondary.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index aa375604f4..602d493ae6 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -903,6 +903,9 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): remote_storage_kind=RemoteStorageKind.MOCK_S3, ) + tenant_conf = TENANT_CONF.copy() + tenant_conf["heatmap_period"] = "0s" + env = neon_env_builder.init_configs() env.start() @@ -910,7 +913,7 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}') + env.create_tenant(tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}') env.storage_controller.reconcile_until_idle() From 787b98f8f2d67b1322a260e50a0afa3af9ed5ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 19 Feb 2025 21:45:22 +0100 Subject: [PATCH 60/78] storcon: log all safekeepers marked as offline (#10898) Doing this to help debugging offline safekeepers. 
Part of https://github.com/neondatabase/neon/issues/9011 --- storage_controller/src/heartbeater.rs | 8 +++++++- storage_controller/src/safekeeper.rs | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 1f20326398..57e9fd0f75 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -346,7 +346,13 @@ impl HeartBeat for HeartbeaterTask SafekeeperState::Offline, + Err(e) => { + tracing::info!( + "Marking safekeeper {} at as offline: {e}", + sk.base_url() + ); + SafekeeperState::Offline + } }; Some((*node_id, status)) diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index b85b4de1e8..53cd8a908b 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -112,7 +112,7 @@ impl Safekeeper { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", + "Call to safekeeper {} ({}:{}) management API", self.id, self.listen_http_addr, self.listen_http_port ), cancel, From bb7e244a429742283ceff9b53f0ffab98a8d5ba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 20 Feb 2025 00:04:05 +0100 Subject: [PATCH 61/78] storcon: fix heartbeats timing out causing a panic (#10902) Fix an issue caused by PR https://github.com/neondatabase/neon/pull/10891: we introduced the concept of timeouts for heartbeats, where we would hang up on the other side of the oneshot channel if a timeout happened (future gets cancelled, receiver is dropped). This hang up would make the heartbeat task panic when it did obtain the response, as we unwrap the result of the result sending operation. The panic would lead to the heartbeat task panicing itself, which is then according to logs the last sign of life we of that process invocation. I'm not sure what brings down the process, in theory tokio [should continue](https://docs.rs/tokio/latest/tokio/runtime/enum.UnhandledPanic.html#variant.Ignore), but idk. Alternative to #10901. 
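The failure mode is easy to reproduce in isolation: once the requesting future is cancelled and its end of the oneshot channel is dropped, an `unwrap()` on `send` panics in the worker. A minimal sketch of the non-panicking pattern (toy `u32` payload instead of the storcon types), assuming tokio with the `sync`, `rt` and `macros` features.

```rust
use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    let (reply_tx, reply_rx) = oneshot::channel::<u32>();

    // The requester times out / is cancelled and drops its receiver.
    drop(reply_rx);

    // `reply_tx.send(42).unwrap()` would panic here, taking the worker task with it.
    // Skipping closed channels and ignoring the send result keeps the task alive.
    if reply_tx.is_closed() {
        println!("requester went away; skipping the reply");
    } else {
        let _ = reply_tx.send(42);
    }
}
```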
--- storage_controller/src/heartbeater.rs | 7 ++++++- storage_controller/src/service.rs | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 57e9fd0f75..52b6110667 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -140,8 +140,13 @@ where request = self.receiver.recv() => { match request { Some(req) => { + if req.reply.is_closed() { + // Prevent a possibly infinite buildup of the receiver channel, if requests arrive faster than we can handle them + continue; + } let res = self.heartbeat(req.servers).await; - req.reply.send(res).unwrap(); + // Ignore the return value in order to not panic if the heartbeat function's future was cancelled + _ = req.reply.send(res); }, None => { return; } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f47dd72579..fc6d2f3d29 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -815,13 +815,12 @@ impl Service { }; tracing::info!("Sending initial heartbeats..."); - let res_ps = self - .heartbeater_ps - .heartbeat(Arc::new(nodes_to_heartbeat)) - .await; // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime const SK_TIMEOUT: Duration = Duration::from_secs(5); - let res_sk = tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)).await; + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + ); let mut online_nodes = HashMap::new(); if let Ok(deltas) = res_ps { @@ -1064,8 +1063,12 @@ impl Service { locked.safekeepers.clone() }; - let res_ps = self.heartbeater_ps.heartbeat(nodes).await; - let res_sk = self.heartbeater_sk.heartbeat(safekeepers).await; + const SK_TIMEOUT: Duration = Duration::from_secs(3); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(nodes), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + ); + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); @@ -1167,7 +1170,7 @@ impl Service { } } } - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { From a6d8640d6f5c73f491648a2ab8373563c0d88bf6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Feb 2025 08:38:55 +0200 Subject: [PATCH 62/78] Persist pg_stat information in pageserver (#6560) ## Problem Statistic is saved in local file and so lost on compute restart. 
Persist in in page server using the same AUX file mechanism used for replication slots See more about motivation in https://neondb.slack.com/archives/C04DGM6SMTM/p1703077676522789 ## Summary of changes Persist postal file using AUX mechanism Postgres PRs: https://github.com/neondatabase/postgres/pull/547 https://github.com/neondatabase/postgres/pull/446 https://github.com/neondatabase/postgres/pull/445 Related to #6684 and #6228 Co-authored-by: Konstantin Knizhnik --- libs/postgres_ffi/src/lib.rs | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 42 +++++++++- pageserver/ctl/src/key.rs | 1 + pageserver/src/aux_file.rs | 4 + pageserver/src/basebackup.rs | 58 ++++++++----- pageserver/src/pgdatadir_mapping.rs | 9 +- .../src/tenant/storage_layer/delta_layer.rs | 8 +- .../src/tenant/storage_layer/image_layer.rs | 8 +- pageserver/src/walingest.rs | 44 ++++++++++ test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_pgstat.py | 83 +++++++++++++++++++ test_runner/regress/test_timeline_archive.py | 2 + vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 6 +- 16 files changed, 238 insertions(+), 39 deletions(-) create mode 100644 test_runner/regress/test_pgstat.py diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0239b56d9c..301bc2f16e 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -278,7 +278,7 @@ pub fn generate_pg_control( checkpoint_bytes: &[u8], lsn: Lsn, pg_version: u32, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 852b20eace..14fb1f2a1f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -124,23 +124,59 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN +/// +/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in +/// the pageserver. They use the same format as the PostgreSQL control file and the +/// checkpoint record, but see walingest.rs for how exactly they are kept up to date. +/// 'lsn' is the LSN at which we're starting up. +/// +/// Returns: +/// - pg_control file contents +/// - system_identifier, extracted from the persisted information +/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown +/// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; // Generate new pg_control needed for bootstrap + // + // NB: In the checkpoint struct that we persist in the pageserver, we have a different + // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, + // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to + // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0. + // + // We didn't always have this convention however, and old persisted records will have + // old REDO values that point to some old LSN. 
+ // + // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown + // checkpoint record at that point in WAL, with no new WAL records after it. That case + // can be treated as starting from a clean shutdown. All other cases are treated as + // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so + // that distinction doesn't matter very much. As of this writing, it only affects + // whether the persisted pg_stats information can be used or not. + // + // In the Checkpoint struct in the returned pg_control file, the redo pointer is + // always set to the LSN we're starting at, to hint that no WAL replay is required. + // (There's some neon-specific code in Postgres startup to make that work, though. + // Just setting the redo pointer is not sufficient.) + let was_shutdown = Lsn(checkpoint.redo) == lsn; checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //save new values in pg_control + // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The + // neon-specific code at postgres startup ignores the state stored in the control + // file, similar to archive recovery in standalone PostgreSQL. Similarly, the + // checkPoint pointer is ignored, so just set it to 0. pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; - Ok((pg_control.encode(), pg_control.system_identifier)) + Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index af4b5a21ab..c7f0719c41 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -345,6 +345,7 @@ impl AuxFileV2 { AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) } (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash), (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), (0xff, 0xff) => AuxFileV2::Other(hash), _ => return None, diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5e527b7d61..5cc20a70b2 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -39,6 +39,7 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key const AUX_DIR_PG_LOGICAL: u8 = 0x01; const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_STAT: u8 = 0x03; const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// Encode the aux file into a fixed-size key. @@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// * pg_logical/replorigin_checkpoint -> 0x0103 /// * pg_logical/others -> 0x01FF /// * pg_replslot/ -> 0x0201 +/// * pg_stat/pgstat.stat -> 0x0301 /// * others -> 0xFFFF /// /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. 
@@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_replslot/") { aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_stat/") { + aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes()) } else { if cfg!(debug_assertions) { warn!( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e03b1bbe96..99b0775316 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -264,6 +264,31 @@ where async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum + // Construct the pg_control file from the persisted checkpoint and pg_control + // information. But we only add this to the tarball at the end, so that if the + // writing is interrupted half-way through, the resulting incomplete tarball will + // be missing the pg_control file, which prevents PostgreSQL from starting up on + // it. With proper error handling, you should never try to start up from an + // incomplete basebackup in the first place, of course, but this is a nice little + // extra safety measure. + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn, self.ctx) + .await + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn, self.ctx) + .await + .context("failed to get control bytes")?; + let (pg_control_bytes, system_identifier, was_shutdown) = + postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; let pgversion = self.timeline.pg_version; @@ -401,6 +426,10 @@ where // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; + } else if path == "pg_stat/pgstat.stat" && !was_shutdown { + // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN + // of a shutdown checkpoint. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -462,8 +491,9 @@ where ))) }); - // Generate pg_control and bootstrap WAL segment. - self.add_pgcontrol_file().await?; + // Last, add the pg_control file and bootstrap WAL segment. + self.add_pgcontrol_file(pg_control_bytes, system_identifier) + .await?; self.ar .finish() .await @@ -671,7 +701,11 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { + async fn add_pgcontrol_file( + &mut self, + pg_control_bytes: Bytes, + system_identifier: u64, + ) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -694,24 +728,6 @@ where .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn, self.ctx) - .await - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn, self.ctx) - .await - .context("failed get control bytes")?; - - let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( - &pg_control_bytes, - &checkpoint_bytes, - self.lsn, - self.timeline.pg_version, - )?; - //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ae2762bd1e..d0e2dab042 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -45,7 +45,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -2264,6 +2264,13 @@ impl DatadirModification<'_> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } + // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit. + // Compute doesn't know if previous version of this file exists or not, so + // attempt to delete non-existing file can cause this message. + // To avoid false alarms, log it as info rather than warning. + (None, true) if path.starts_with("pg_stat/") => { + info!("removing non-existing pg_stat file: {}", path) + } (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 885c50425f..7ba0e3679f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -51,8 +51,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; @@ -967,7 +966,10 @@ impl DeltaLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. 
None diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c49281dc45..dc611bd6e1 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -48,8 +48,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::value::Value; @@ -603,7 +602,10 @@ impl ImageLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 04edb3e3f4..45c87353a7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1180,6 +1180,50 @@ impl WalIngest { } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } + // NB: We abuse the Checkpoint.redo field: + // + // - In PostgreSQL, the Checkpoint struct doesn't store the information + // of whether this is an online checkpoint or a shutdown checkpoint. It's + // stored in the XLOG info field of the WAL record, shutdown checkpoints + // use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use + // XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers + // in the pageserver, however. + // + // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the + // checkpoint record, if it's a shutdown checkpoint. But when we are + // starting from a shutdown checkpoint, the basebackup LSN is the *end* + // of the shutdown checkpoint WAL record. That makes it difficult to + // correctly detect whether we're starting from a shutdown record or + // not. + // + // To address both of those issues, we store 0 in the redo field if it's + // an online checkpoint record, and the record's *end* LSN if it's a + // shutdown checkpoint. We don't need the original redo pointer in neon, + // because we don't perform WAL replay at startup anyway, so we can get + // away with abusing the redo field like this. + // + // XXX: Ideally, we would persist the extra information in a more + // explicit format, rather than repurpose the fields of the Postgres + // struct like this. However, we already have persisted data like this, + // so we need to maintain backwards compatibility. + // + // NB: We didn't originally have this convention, so there are still old + // persisted records that didn't do this. Before, we didn't update the + // persisted redo field at all. That means that old records have a bogus + // redo pointer that points to some old value, from the checkpoint record + // that was originally imported from the data directory. If it was a + // project created in Neon, that means it points to the first checkpoint + // after initdb. That's OK for our purposes: all such old checkpoints are + // treated as old online checkpoints when the basebackup is created. + cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + // Store the *end* LSN of the checkpoint record. 
Or to be precise, + // the start LSN of the *next* record, i.e. if the record ends + // exactly at page boundary, the redo LSN points to just after the + // page header on the next page. + lsn.into() + } else { + Lsn::INVALID.into() + }; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 124e62999a..d49686b57c 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -29,6 +29,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*failed to load metadata.*", ".*load failed.*load local timeline.*", ".*: layer load failed, assuming permanent failure:.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) @@ -75,7 +77,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match="failed to get checkpoint bytes") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py new file mode 100644 index 0000000000..c31e5ef7f8 --- /dev/null +++ b/test_runner/regress/test_pgstat.py @@ -0,0 +1,83 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion + + +# +# Test that pgstat statistic is preserved across sessions +# +def test_pgstat(neon_simple_env: NeonEnv): + env = neon_simple_env + if env.pg_version == PgVersion.V14: + pytest.skip("PG14 doesn't support pgstat statistic persistence") + + n = 10000 + endpoint = env.endpoints.create_start( + "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"] + ) + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("create table t(x integer)") + cur.execute(f"insert into t values (generate_series(1,{n}))") + cur.execute("vacuum analyze t") + cur.execute("select sum(x) from t") + cur.execute("update t set x=x+1") + + cur.execute("select pg_stat_force_next_flush()") + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + cur.execute("update t set x=x+1") + + # stop without checkpoint + endpoint.stop(mode="immediate") + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information should be discarded in case of abnormal termination + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) + + cur.execute("select sum(x) from t") + + # create more relations to increase size of statistics + for i in range(1, 1000): + 
cur.execute(f"create table t{i}(pk integer primary key)") + + cur.execute("select pg_stat_force_next_flush()") + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information is not restored because its size exeeds 100k threshold + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 2706ddf2f0..c17840d31c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -823,6 +823,8 @@ def test_timeline_retain_lsn( [ ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", ".*page_service_conn_main.*could not find data for key.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) if offload_child is None or "no-restart" not in offload_child: diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 81e2eef061..023f1020ec 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4 +Subproject commit 023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9422247c58..6cb8d22079 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34 +Subproject commit 6cb8d22079570b50fcaff29124d40807c1e63a82 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a8fea8b4be..59b2fe851f 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8 +Subproject commit 59b2fe851f8e0595f6c830b90ee766f4f1c17a0f diff --git a/vendor/revisions.json b/vendor/revisions.json index 72d97d7f6a..3379cf1ba8 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,15 +1,15 @@ { "v17": [ "17.4", - "a8fea8b4be43039f0782347c88a9b9b25f50c9d8" + "59b2fe851f8e0595f6c830b90ee766f4f1c17a0f" ], "v16": [ "16.8", - "9422247c582e7c1a08a4855d04af0874f8df2f34" + "6cb8d22079570b50fcaff29124d40807c1e63a82" ], "v15": [ "15.12", - "81e2eef0616c65c2233c75b06f25766ae4c080c4" + "023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4" ], "v14": [ "14.17", From 1d9346f8b746a5f4e6b5b9d1099bab9ebf6581d1 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 20 Feb 2025 11:05:01 +0100 Subject: [PATCH 63/78] Add pg_repack test (#10638) ## Problem We don't test `pg_repack` ## Summary of changes The test for `pg_repack` is added --- compute/compute-node.Dockerfile | 5 +- compute/patches/pg_repack.patch | 72 +++++++++++++++++++ docker-compose/docker_compose_test.sh | 9 +-- .../ext-src/pg_repack-src/test-upgrade.sh | 5 ++ docker-compose/test_extensions_upgrade.sh | 3 +- 5 files changed, 84 insertions(+), 10 deletions(-) create mode 100644 compute/patches/pg_repack.patch create mode 100755 docker-compose/ext-src/pg_repack-src/test-upgrade.sh diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0b3001613d..19633064a6 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1844,7 +1844,10 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/ COPY --from=pg_ivm-src /ext-src/ /ext-src/ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_mooncake-src /ext-src/ 
/ext-src/ -#COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/pg_repack.patch /ext-src +RUN cd /ext-src/pg_repack-src && patch -p1 OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + INFO: repacking table "public.tbl_cluster" + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + ERROR: pg_repack failed with error: You must be a superuser to use pg_repack + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + ERROR: pg_repack failed with error: ERROR: permission denied for schema repack + LINE 1: select repack.version(), repack.version_sql() + ^ + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + INFO: repacking table "public.tbl_cluster" + ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block + DETAIL: query was: RESET lock_timeout +diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql +index 072f0fa..dbe60f8 100644 +--- a/regress/sql/nosuper.sql ++++ b/regress/sql/nosuper.sql +@@ -4,19 +4,19 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! 
PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper; + REVOKE USAGE ON SCHEMA repack FROM nosuper; diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index dd520d4986..5b3cfc74eb 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -81,15 +81,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') for d in $FAILED $CONTRIB_FAILED; do - dn="$(basename $d)" - rm -rf $dn - mkdir $dn - docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] - docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] - cat $dn/regression.out $dn/regression.diffs || true - rm -rf $dn + docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? -eq 1 ] done - rm -rf $FAILED exit 1 fi fi diff --git a/docker-compose/ext-src/pg_repack-src/test-upgrade.sh b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh new file mode 100755 index 0000000000..5021eb4027 --- /dev/null +++ b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./regress --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 4a9024569b..06d351b496 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -43,7 +43,8 @@ EXTENSIONS='[ {"extname": "semver", "extdir": "pg_semver-src"}, {"extname": "pg_ivm", "extdir": "pg_ivm-src"}, {"extname": "pgjwt", "extdir": "pgjwt-src"}, -{"extname": "pgtap", "extdir": "pgtap-src"} +{"extname": "pgtap", "extdir": "pgtap-src"}, +{"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d From f7edcf12e320f5854d93cc21c5852bd2bf0433ce Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 20 Feb 2025 13:08:30 +0100 Subject: [PATCH 64/78] pageserver: downgrade ephemeral layer roll wait message (#10883) We already log a message for this during the L0 flush, so the additional message is mostly noise. 
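For context, a rough sketch of the check being touched (a hypothetical free function with made-up parameters, not the actual `TimelineWriter` method): the layer roll only waits on the flush loop once the L0 count crosses the configured threshold, and the flush path already logs that situation, so the writer-side message can drop to `debug!`.

```rust
/// Hypothetical stand-in for the writer-side backpressure check.
async fn wait_for_flush_if_backpressured(
    l0_count: usize,
    wait_threshold: Option<usize>,
    wait_flush_completion: impl std::future::Future<Output = ()>,
) {
    if let Some(threshold) = wait_threshold {
        if l0_count >= threshold {
            // The flush path already logs when this happens.
            tracing::debug!(
                "layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"
            );
            wait_flush_completion.await;
        }
    }
}
```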
--- pageserver/src/tenant/timeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 48c208d5d7..bc6131b378 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -6614,7 +6614,7 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); self.tl.wait_flush_completion(flush_id).await?; } } From 07bee600374ccd486c69370d0972d9035964fe68 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 20 Feb 2025 13:08:54 +0100 Subject: [PATCH 65/78] pageserver: make compaction walredo errors critical (#10884) Mark walredo errors as critical too. Also pull the pattern matching out into the outer `match`. Follows #10872. --- pageserver/src/tenant/timeline.rs | 6 --- pageserver/src/tenant/timeline/compaction.rs | 42 ++++++++++---------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bc6131b378..b9425d2777 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5344,12 +5344,6 @@ impl From for CompactionError { } } -impl CompactionError { - pub fn is_cancelled(&self) -> bool { - matches!(self, CompactionError::ShuttingDown) - } -} - impl From for CompactionError { fn from(err: CollectKeySpaceError) -> Self { match err { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4e4f906d78..58a87dbd5f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,7 +26,7 @@ use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use utils::critical; use utils::id::TimelineId; @@ -775,27 +775,25 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - if let CompactionError::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), - ) = err - { - critical!("could not compact, repartitioning keyspace failed: {err:?}"); - } else { - tracing::error!( - "could not compact, repartitioning keyspace failed: {err:?}" - ); - } - } - } + + // Suppress errors when cancelled. + Err(_) if self.cancel.is_cancelled() => {} + Err(CompactionError::ShuttingDown) => {} + + // Alert on critical errors that indicate data corruption. 
+ Err( + err @ CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + ), + ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + + // Log other errors. No partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; let partition_count = self.partitioning.read().0 .0.parts.len(); From 7c7180a79dbda2764d883392a73950acf114b63f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 20 Feb 2025 17:14:16 +0000 Subject: [PATCH 66/78] Fix deadlock in drop_subscriptions_before_start (#10806) ALTER SUBSCRIPTION requires AccessExclusive lock which conflicts with iteration over pg_subscription when multiple databases are present and operations are applied concurrently. Fix by explicitly locking pg_subscription in the beginning of the transaction in each database. ## Problem https://github.com/neondatabase/cloud/issues/24292 --- compute_tools/src/sql/drop_subscriptions.sql | 1 + control_plane/src/endpoint.rs | 46 ++++- libs/compute_api/src/spec.rs | 8 +- .../regress/test_subscriber_branching.py | 173 +++++++++++++++++- 4 files changed, 220 insertions(+), 8 deletions(-) diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index dfb925e48e..03e8e158fa 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,6 +2,7 @@ DO $$ DECLARE subname TEXT; BEGIN + LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c3c8229c38..c16b3cb017 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -59,6 +59,7 @@ use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; +use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -81,8 +82,10 @@ pub struct EndpointConf { internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + reconfigure_concurrency: usize, drop_subscriptions_before_start: bool, features: Vec, + cluster: Option, } // @@ -179,7 +182,9 @@ impl ComputeControlPlane { // we also skip catalog updates in the cloud. 
skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, }); ep.create_endpoint_dir()?; @@ -196,7 +201,9 @@ impl ComputeControlPlane { pg_version, skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, })?, )?; std::fs::write( @@ -261,8 +268,11 @@ pub struct Endpoint { skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, + reconfigure_concurrency: usize, // Feature flags features: Vec, + // Cluster settings + cluster: Option, } #[derive(PartialEq, Eq)] @@ -302,6 +312,8 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; + debug!("serialized endpoint conf: {:?}", conf); + Ok(Endpoint { pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), external_http_address: SocketAddr::new( @@ -319,8 +331,10 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + reconfigure_concurrency: conf.reconfigure_concurrency, drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, + cluster: conf.cluster, }) } @@ -607,7 +621,7 @@ impl Endpoint { }; // Create spec file - let spec = ComputeSpec { + let mut spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, @@ -640,7 +654,7 @@ impl Endpoint { Vec::new() }, settings: None, - postgresql_conf: Some(postgresql_conf), + postgresql_conf: Some(postgresql_conf.clone()), }, delta_operations: None, tenant_id: Some(self.tenant_id), @@ -653,9 +667,35 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, - reconfigure_concurrency: 1, + reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }); + spec.cluster.databases.push(Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 767a34bcbc..8fffae92fb 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -252,7 +252,7 @@ pub enum ComputeMode { Replica, } -#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, pub name: Option, @@ -283,7 +283,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. 
-#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -292,7 +292,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -308,7 +308,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 849d4f024d..6175643389 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -1,9 +1,10 @@ from __future__ import annotations +import threading import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync from fixtures.utils import query_scalar, wait_until @@ -239,3 +240,173 @@ def test_subscriber_branching(neon_simple_env: NeonEnv): res = scur_postgres.fetchall() assert len(res) == 1 assert str(sub_child_2_timeline_id) == res[0][0] + + +def test_multiple_subscription_branching(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can handle concurrent deletion of subscriptions in a multiple databases + """ + env = neon_simple_env + + NUMBER_OF_DBS = 5 + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_replication_slots = 10", + "max_logical_replication_workers=10", + "max_worker_processes=10", + ], + ) + + TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + + for i in range(NUMBER_OF_DBS): + TEST_DB_NAMES.append( + { + "name": f"db{i}", + "owner": "cloud_admin", + } + ) + + # Update the spec.json file to create the databases + # and reconfigure the endpoint to apply the changes. 
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") + + # create table, replication and subscription for each of the databases + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in range(NUMBER_OF_DBS): + publisher_cursor.execute(f"CREATE TABLE t{i}(a int)") + publisher_cursor.execute(f"CREATE PUBLICATION mypub{i} FOR TABLE t{i}") + publisher_cursor.execute( + f"select pg_catalog.pg_create_logical_replication_slot('mysub{i}', 'pgoutput');" + ) + publisher_cursor.execute(f"INSERT INTO t{i} VALUES ({i})") + + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"CREATE TABLE t{i}(a int)") + cursor.execute( + f"CREATE SUBSCRIPTION mysub{i} CONNECTION '{connstr}' PUBLICATION mypub{i} WITH (create_slot = false) " + ) + + # wait for the subscription to be active + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # Check that replication is working + for i in range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();") + + def start_publisher_workload(table_num: int, duration: int): + start = time.time() + with endpoint.cursor(dbname="publisher_db") as cur: + while time.time() - start < duration: + cur.execute(f"INSERT INTO t{i} SELECT FROM generate_series(1,1000)") + + LOAD_DURATION = 5 + threads = [ + threading.Thread(target=start_publisher_workload, args=(i, LOAD_DURATION)) + for i in range(NUMBER_OF_DBS) + ] + + for thread in threads: + thread.start() + + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="main", + ancestor_start_lsn=last_insert_lsn, + ) + + sub_child_1 = env.endpoints.create("subscriber_child_1") + + sub_child_1.respec( + skip_pg_catalog_updates=False, + reconfigure_concurrency=5, + drop_subscriptions_before_start=True, + cluster={ + "databases": TEST_DB_NAMES, + "roles": [], + }, + ) + + sub_child_1.start() + + # ensure that subscription deletion happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + log.info(f"res = {res}") + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # ensure that there are no subscriptions in the databases + for i in range(NUMBER_OF_DBS): + with sub_child_1.cursor(dbname=f"db{i}") as cursor: + cursor.execute("SELECT * FROM pg_catalog.pg_subscription") + res = cursor.fetchall() + assert len(res) == 0 + + # ensure that there are no unexpected rows in the tables + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + for thread in threads: + thread.join() + + # ensure that logical replication is still working in main endpoint + # wait for it to catch up + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # verify that the data is the same in publisher and subscriber tables + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in 
range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + publisher_cursor.execute(f"SELECT count(*) FROM t{i}") + cursor.execute(f"SELECT count(*) FROM t{i}") + pub_res = publisher_cursor.fetchone() + sub_res = cursor.fetchone() + log.info(f"for table t{i}: pub_res = {pub_res}, sub_res = {sub_res}") + assert pub_res == sub_res From e808e9432af8ec6809cf97de577ff4e2a466fd02 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 20 Feb 2025 21:16:04 +0400 Subject: [PATCH 67/78] storcon: use https for pageservers (#10759) ## Problem Storage controller uses unsecure http for pageserver API. Closes: https://github.com/neondatabase/cloud/issues/23734 Closes: https://github.com/neondatabase/cloud/issues/24091 ## Summary of changes - Add an optional `listen_https_port` field to storage controller's Node state and its API (RegisterNode/ListNodes/etc). - Allow updating `listen_https_port` on node registration to gradually add https port for all nodes. - Add `use_https_pageserver_api` CLI option to storage controller to enable https. - Pageserver doesn't support https for now and always reports `https_port=None`. This will be addressed in follow-up PR. --- control_plane/storcon_cli/src/main.rs | 5 ++ libs/pageserver_api/src/controller_api.rs | 3 + pageserver/src/controller_upcall_client.rs | 1 + .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/main.rs | 5 ++ storage_controller/src/node.rs | 60 +++++++++++-- storage_controller/src/persistence.rs | 40 ++++++++- storage_controller/src/scheduler.rs | 5 +- storage_controller/src/schema.rs | 1 + storage_controller/src/service.rs | 85 +++++++++++++++---- test_runner/fixtures/neon_fixtures.py | 2 + .../regress/test_storage_controller.py | 53 ++++++++++++ 13 files changed, 231 insertions(+), 31 deletions(-) create mode 100644 storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql create mode 100644 storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 3c574efc63..953ade83ad 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -47,6 +47,9 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + listen_https_port: Option, + #[arg(long)] availability_zone_id: String, }, @@ -394,6 +397,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id, } => { storcon_client @@ -406,6 +410,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id: AvailabilityZone(availability_zone_id), }), ) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 42f6e47e63..f94bfab581 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -57,6 +57,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub availability_zone_id: AvailabilityZone, } @@ -105,6 +106,7 @@ pub struct TenantLocateResponseShard { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, } #[derive(Serialize, Deserialize)] @@ -148,6 +150,7 @@ pub struct NodeDescribeResponse { pub listen_http_addr: String, pub listen_http_port: u16, + pub 
listen_https_port: Option, pub listen_pg_addr: String, pub listen_pg_port: u16, diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index d41bfd9021..4990f17b40 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -173,6 +173,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + listen_https_port: None, // TODO: Support https. availability_zone_id: az_id.expect("Checked above"), }) } diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql new file mode 100644 index 0000000000..0f051d3ac3 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP listen_https_port; diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql new file mode 100644 index 0000000000..172237d477 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD listen_https_port INTEGER; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 9a9958f7a6..be074d269d 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -126,6 +126,10 @@ struct Cli { #[arg(long)] long_reconcile_threshold: Option, + + // Flag to use https for requests to pageserver API. + #[arg(long, default_value = "false")] + use_https_pageserver_api: bool, } enum StrictMode { @@ -321,6 +325,7 @@ async fn async_main() -> anyhow::Result<()> { address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, + use_https_pageserver_api: args.use_https_pageserver_api, }; // Validate that we can connect to the database diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index f5c2d329e0..3762d13c10 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,5 +1,6 @@ use std::{str::FromStr, time::Duration}; +use anyhow::anyhow; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, @@ -32,12 +33,16 @@ pub(crate) struct Node { listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, + // Flag from storcon's config to use https for pageserver admin API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. 
#[serde(skip)] @@ -56,7 +61,16 @@ pub(crate) enum AvailabilityTransition { impl Node { pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on") + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -82,11 +96,20 @@ impl Node { self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port + // Note: listen_https_port may change. See [`Self::need_update`] for mode details. + // && self.listen_https_port == register_req.listen_https_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port && self.availability_zone_id == register_req.availability_zone_id } + // Do we need to update an existing record in DB on this registration request? + pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool { + // listen_https_port is checked here because it may change during migration to https. + // After migration, this check may be moved to registration_match. + self.listen_https_port != register_req.listen_https_port + } + /// For a shard located on this node, populate a response object /// with this node's address information. pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { @@ -95,6 +118,7 @@ impl Node { node_id: self.id, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } @@ -175,25 +199,34 @@ impl Node { } } + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: NodeId, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, - ) -> Self { - Self { + use_https: bool, + ) -> anyhow::Result { + if use_https && listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id, listen_http_addr, listen_http_port, + listen_https_port, listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, availability_zone_id, + use_https, cancel: CancellationToken::new(), - } + }) } pub(crate) fn to_persistent(&self) -> NodePersistence { @@ -202,14 +235,19 @@ impl Node { scheduling_policy: self.scheduling.into(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port as i32, + listen_https_port: self.listen_https_port.map(|x| x as i32), listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, availability_zone_id: self.availability_zone_id.0.clone(), } } - pub(crate) fn from_persistent(np: NodePersistence) -> Self { - Self { + pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { + if use_https && np.listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id: NodeId(np.node_id as u64), // At startup we consider a node offline until proven otherwise. 
availability: NodeAvailability::Offline, @@ -217,11 +255,13 @@ impl Node { .expect("Bad scheduling policy in DB"), listen_http_addr: np.listen_http_addr, listen_http_port: np.listen_http_port as u16, + listen_https_port: np.listen_https_port.map(|x| x as u16), listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, availability_zone_id: AvailabilityZone(np.availability_zone_id), + use_https, cancel: CancellationToken::new(), - } + }) } /// Wrapper for issuing requests to pageserver management API: takes care of generic @@ -285,8 +325,9 @@ impl Node { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to node {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -302,6 +343,7 @@ impl Node { availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 67b60eadf3..459c11add9 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -375,18 +375,23 @@ impl Persistence { Ok(nodes) } - pub(crate) async fn update_node( + pub(crate) async fn update_node( &self, input_node_id: NodeId, - input_scheduling: NodeSchedulingPolicy, - ) -> DatabaseResult<()> { + values: V, + ) -> DatabaseResult<()> + where + V: diesel::AsChangeset + Clone + Send + Sync, + V::Changeset: diesel::query_builder::QueryFragment + Send, // valid Postgres SQL + { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { + let values = values.clone(); Box::pin(async move { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .set(values) .execute(conn) .await?; Ok(updated) @@ -403,6 +408,32 @@ impl Persistence { } } + pub(crate) async fn update_node_scheduling_policy( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + scheduling_policy.eq(String::from(input_scheduling)), + ) + .await + } + + pub(crate) async fn update_node_on_registration( + &self, + input_node_id: NodeId, + input_https_port: Option, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + listen_https_port.eq(input_https_port.map(|x| x as i32)), + ) + .await + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. /// @@ -1452,6 +1483,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, pub(crate) availability_zone_id: String, + pub(crate) listen_https_port: Option, } /// Tenant metadata health status that are stored durably. 
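As an aside, a minimal standalone sketch of the invariant enforced above (a hypothetical `NodeAddr` struct, not the controller's `Node`): enabling https without an https port is rejected in the constructor, so `base_url()` may safely expect the port later.

```rust
use anyhow::bail;

/// Hypothetical stand-in for the node address bookkeeping.
struct NodeAddr {
    host: String,
    http_port: u16,
    https_port: Option<u16>,
    use_https: bool,
}

impl NodeAddr {
    fn new(
        host: String,
        http_port: u16,
        https_port: Option<u16>,
        use_https: bool,
    ) -> anyhow::Result<Self> {
        if use_https && https_port.is_none() {
            bail!("https is enabled, but node has no https port");
        }
        Ok(Self { host, http_port, https_port, use_https })
    }

    fn base_url(&self) -> String {
        if self.use_https {
            // Safe: `new()` rejected use_https without an https port.
            format!(
                "https://{}:{}",
                self.host,
                self.https_port.expect("checked in new()")
            )
        } else {
            format!("http://{}:{}", self.host, self.http_port)
        }
    }
}
```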
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 106a7b2699..44936d018a 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -930,13 +930,16 @@ pub(crate) mod test_utils { NodeId(i), format!("httphost-{i}"), 80 + i as u16, + None, format!("pghost-{i}"), 5432 + i as u16, az_iter .next() .cloned() .unwrap_or(AvailabilityZone("test-az".to_string())), - ); + false, + ) + .unwrap(); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 14c30c296d..361253bd19 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -26,6 +26,7 @@ diesel::table! { listen_pg_addr -> Varchar, listen_pg_port -> Int4, availability_zone_id -> Varchar, + listen_https_port -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fc6d2f3d29..25a1cb4252 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -399,6 +399,8 @@ pub struct Config { pub http_service_port: i32, pub long_reconcile_threshold: Duration, + + pub use_https_pageserver_api: bool, } impl From for ApiError { @@ -1401,8 +1403,8 @@ impl Service { .list_nodes() .await? .into_iter() - .map(Node::from_persistent) - .collect::>(); + .map(|x| Node::from_persistent(x, config.use_https_pageserver_api)) + .collect::>>()?; let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); metrics::METRICS_REGISTRY @@ -1501,10 +1503,13 @@ impl Service { NodeId(node_id as u64), "".to_string(), 123, + None, "".to_string(), 123, AvailabilityZone("test_az".to_string()), - ); + false, + ) + .unwrap(); scheduler.node_upsert(&node); } @@ -5907,8 +5912,10 @@ impl Service { ) .await; + #[derive(PartialEq)] enum RegistrationStatus { - Matched, + UpToDate, + NeedUpdate, Mismatched, New, } @@ -5917,7 +5924,11 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched + if node.need_update(®ister_req) { + RegistrationStatus::NeedUpdate + } else { + RegistrationStatus::UpToDate + } } else { RegistrationStatus::Mismatched } @@ -5927,9 +5938,9 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched => { + RegistrationStatus::UpToDate => { tracing::info!( - "Node {} re-registered with matching address", + "Node {} re-registered with matching address and is up to date", register_req.node_id ); @@ -5947,7 +5958,7 @@ impl Service { "Node is already registered with different address".to_string(), )); } - RegistrationStatus::New => { + RegistrationStatus::New | RegistrationStatus::NeedUpdate => { // fallthrough } } @@ -5976,6 +5987,16 @@ impl Service { )); } + if self.config.use_https_pageserver_api && register_req.listen_https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "Node {} has no https port, but use_https is enabled", + register_req.node_id + ) + .into(), + )); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. 
@@ -5983,13 +6004,29 @@ impl Service { register_req.node_id, register_req.listen_http_addr, register_req.listen_http_port, + register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, register_req.availability_zone_id.clone(), + self.config.use_https_pageserver_api, ); + let new_node = match new_node { + Ok(new_node) => new_node, + Err(error) => return Err(ApiError::InternalServerError(error)), + }; - // TODO: idempotency if the node already exists in the database - self.persistence.insert_node(&new_node).await?; + match registration_status { + RegistrationStatus::New => self.persistence.insert_node(&new_node).await?, + RegistrationStatus::NeedUpdate => { + self.persistence + .update_node_on_registration( + register_req.node_id, + register_req.listen_https_port, + ) + .await? + } + _ => unreachable!("Other statuses have been processed earlier"), + } let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); @@ -6004,12 +6041,24 @@ impl Service { .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); - tracing::info!( - "Registered pageserver {} ({}), now have {} pageservers", - register_req.node_id, - register_req.availability_zone_id, - locked.nodes.len() - ); + match registration_status { + RegistrationStatus::New => { + tracing::info!( + "Registered pageserver {} ({}), now have {} pageservers", + register_req.node_id, + register_req.availability_zone_id, + locked.nodes.len() + ); + } + RegistrationStatus::NeedUpdate => { + tracing::info!( + "Re-registered and updated node {} ({})", + register_req.node_id, + register_req.availability_zone_id, + ); + } + _ => unreachable!("Other statuses have been processed earlier"), + } Ok(()) } @@ -6027,7 +6076,9 @@ impl Service { if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence.update_node(node_id, scheduling).await?; + self.persistence + .update_node_scheduling_policy(node_id, scheduling) + .await?; } // If we're activating a node, then before setting it active we must reconcile any shard locations diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 58c5dbfd29..36af522535 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1630,6 +1630,7 @@ def neon_env_builder( class PageserverPort: pg: int http: int + https: int | None = None class LogUtils: @@ -1886,6 +1887,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_https_port": node.service_port.https, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, "availability_zone_id": node.az_id, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1d95312140..7e895422d2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3764,3 +3764,56 @@ def test_storage_controller_node_flap_detach_race( assert len(locs) == 1, f"{shard} has {len(locs)} attached locations" wait_until(validate_locations, timeout=10) + + +def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): + """ + Check that storage controller handles node_register requests with updated fields correctly. + 1. Run storage controller and register 1 pageserver without https port. + 2. 
Register the same pageserver with https port. Check that port has been updated. + 3. Restart the storage controller. Check that https port is persistent. + 4. Register the same pageserver without https port again (rollback). Check that port has been removed. + """ + neon_env_builder.num_pageservers = 1 + env = neon_env_builder.init_configs() + + env.storage_controller.start() + env.storage_controller.wait_until_ready() + + pageserver = env.pageservers[0] + + # Step 1. Register pageserver without https port. + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + # Step 2. Register pageserver with https port. + pageserver.service_port.https = 1234 + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 3. Restart storage controller. + env.storage_controller.stop() + env.storage_controller.start() + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 4. Register pageserver with no https port again. + pageserver.service_port.https = None + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None From f7474d3f4142d1a05dda0719b19037358f717bae Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 20 Feb 2025 11:31:42 -0600 Subject: [PATCH 68/78] Remove forward compatibility hacks related to compute HTTP servers (#10797) These hacks were added to appease the forward compatibility tests and can be removed. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 15 ++++++--------- control_plane/src/endpoint.rs | 14 +++++--------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a8803ec793..9193f06b3b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -112,16 +112,13 @@ struct Cli { /// outside the compute will talk to the compute through this port. Keep /// the previous name for this argument around for a smoother release /// with the control plane. - /// - /// TODO: Remove the alias after the control plane release which teaches the - /// control plane about the renamed argument. - #[arg(long, alias = "http-port", default_value_t = 3080)] + #[arg(long, default_value_t = 3080)] pub external_http_port: u16, - /// The port to bind the internal listening HTTP server to. Clients like + /// The port to bind the internal listening HTTP server to. Clients include /// the neon extension (for installing remote extensions) and local_proxy. 
- #[arg(long)] - pub internal_http_port: Option, + #[arg(long, default_value_t = 3081)] + pub internal_http_port: u16, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -359,7 +356,7 @@ fn wait_spec( pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1), + internal_http_port: cli.internal_http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -383,7 +380,7 @@ fn wait_spec( // The internal HTTP server could be launched later, but there isn't much // sense in waiting. - Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute); + Server::Internal(cli.internal_http_port).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c16b3cb017..c22ff20c70 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -713,18 +713,14 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - //cmd.args([ - // "--external-http-port", - // &self.external_http_address.port().to_string(), - //]) - //.args([ - // "--internal-http-port", - // &self.internal_http_address.port().to_string(), - //]) cmd.args([ - "--http-port", + "--external-http-port", &self.external_http_address.port().to_string(), ]) + .args([ + "--internal-http-port", + &self.internal_http_address.port().to_string(), + ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) .args([ From d571553d8aff2e9f16c8dbc1ad59370e27418eeb Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 20 Feb 2025 11:31:52 -0600 Subject: [PATCH 69/78] Remove hacks in compute_ctl related to compute ID (#10751) --- compute_tools/src/bin/compute_ctl.rs | 16 +----------- control_plane/src/endpoint.rs | 26 +++++++++---------- .../compute_wrapper/shell/compute.sh | 1 + 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9193f06b3b..1cdae718fe 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -41,7 +41,6 @@ use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::time::SystemTime; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -86,19 +85,6 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } -/// Generate a compute ID if one is not supplied. This exists to keep forward -/// compatibility tests working, but will be removed in a future iteration. 
-fn generate_compute_id() -> String { - let now = SystemTime::now(); - - format!( - "compute-{}", - now.duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs() - ) -} - #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -153,7 +139,7 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())] + #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c22ff20c70..407578abb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; use compute_api::requests::ConfigurationRequest; @@ -737,20 +739,16 @@ impl Endpoint { ]) // TODO: It would be nice if we generated compute IDs with the same // algorithm as the real control plane. - // - // TODO: Add this back when - // https://github.com/neondatabase/neon/pull/10747 is merged. - // - //.args([ - // "--compute-id", - // &format!( - // "compute-{}", - // SystemTime::now() - // .duration_since(UNIX_EPOCH) - // .unwrap() - // .as_secs() - // ), - //]) + .args([ + "--compute-id", + &format!( + "compute-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + ), + ]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index b4f8d3d66a..9dbdcce69f 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -77,4 +77,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ + --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} From 34996416d65eed2377f3cbf8bd4559792c26a045 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 20 Feb 2025 17:49:05 +0000 Subject: [PATCH 70/78] pageserver: guard against WAL gaps in the interpreted protocol (#10858) ## Problem The interpreted SK <-> PS protocol does not guard against gaps (neither does the Vanilla one, but that's beside the point). ## Summary of changes Extend the protocol to include the start LSN of the PG WAL section from which the records were interpreted. Validation is enabled via a config flag on the pageserver and works as follows: **Case 1**: `raw_wal_start_lsn` is smaller than the requested LSN There can't be gaps here, but we check that the shard received records which it hasn't seen before. **Case 2**: `raw_wal_start_lsn` is equal to the requested LSN This is the happy case. No gap and nothing to check **Case 3**: `raw_wal_start_lsn` is greater than the requested LSN This is a gap. To make Case 3 work I had to bend the protocol a bit. We read record chunks of WAL which aren't record aligned and feed them to the decoder. The picture below shows a shard which subscribes at a position somewhere within Record 2. We already have a wal reader which is below that position so we wait to catch up. We read some wal in Read 1 (all of Record 1 and some of Record 2). 
The new shard doesn't need Record 1 (it has already processed it according to the starting position), but we read past it's starting position. When we do Read 2, we decode Record 2 and ship it off to the shard, but the starting position of Read 2 is greater than the starting position the shard requested. This looks like a gap. ![image](https://github.com/user-attachments/assets/8aed292e-5d62-46a3-9b01-fbf9dc25efe0) To make it work, we extend the protocol to send an empty `InterpretedWalRecords` to shards if the WAL the records originated from ends the requested start position. On the pageserver, that just updates the tracking LSNs in memory (no-op really). This gives us a workaround for the fake gap. As a drive by, make `InterpretedWalRecords::next_record_lsn` mandatory in the application level definition. It's always included. Related: https://github.com/neondatabase/cloud/issues/23935 --- libs/pageserver_api/src/config.rs | 3 + libs/wal_decoder/proto/interpreted_wal.proto | 1 + libs/wal_decoder/src/models.rs | 6 +- libs/wal_decoder/src/wire_format.rs | 9 ++- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 6 ++ pageserver/src/tenant/timeline.rs | 1 + pageserver/src/tenant/timeline/walreceiver.rs | 1 + .../walreceiver/connection_manager.rs | 3 + .../walreceiver/walreceiver_connection.rs | 56 +++++++++++++++---- safekeeper/src/send_interpreted_wal.rs | 51 +++++++++++++---- test_runner/fixtures/neon_fixtures.py | 12 ++-- 12 files changed, 120 insertions(+), 30 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 0f33bcf45b..1aff5a7012 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -122,6 +122,8 @@ pub struct ConfigToml { pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub validate_wal_contiguity: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -521,6 +523,7 @@ impl Default for ConfigToml { } else { None }, + validate_wal_contiguity: None, } } } diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index d68484d30f..7b40201a75 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -5,6 +5,7 @@ package interpreted_wal; message InterpretedWalRecords { repeated InterpretedWalRecord records = 1; optional uint64 next_record_lsn = 2; + optional uint64 raw_wal_start_lsn = 3; } message InterpretedWalRecord { diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 51bf7e44ab..7e1934c6c3 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -60,7 +60,11 @@ pub struct InterpretedWalRecords { pub records: Vec, // Start LSN of the next record after the batch. // Note that said record may not belong to the current shard. - pub next_record_lsn: Option, + pub next_record_lsn: Lsn, + // Inclusive start LSN of the PG WAL from which the interpreted + // WAL records were extracted. Note that this is not necessarily the + // start LSN of the first interpreted record in the batch. 
+ pub raw_wal_start_lsn: Option, } /// An interpreted Postgres WAL record, ready to be handled by the pageserver diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 944ee5c919..52ed5c70b5 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -167,7 +167,8 @@ impl TryFrom for proto::InterpretedWalRecords { .collect::, _>>()?; Ok(proto::InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(|l| l.0), + next_record_lsn: Some(value.next_record_lsn.0), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0), }) } } @@ -254,7 +255,11 @@ impl TryFrom for InterpretedWalRecords { Ok(InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(Lsn::from), + next_record_lsn: value + .next_record_lsn + .map(Lsn::from) + .expect("Always provided"), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from), }) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fa098e9364..e2b9a7f073 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -134,6 +134,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c5368f6806..09d9444dd5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -197,6 +197,10 @@ pub struct PageServerConf { /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer /// files read. pub enable_read_path_debugging: bool, + + /// Interpreted protocol feature: if enabled, validate that the logical WAL received from + /// safekeepers does not have gaps. 
+ pub validate_wal_contiguity: bool, } /// Token for authentication to safekeepers @@ -360,6 +364,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, enable_read_path_debugging, + validate_wal_contiguity, } = config_toml; let mut conf = PageServerConf { @@ -446,6 +451,7 @@ impl PageServerConf { virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), + validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b9425d2777..30de4d90dc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2874,6 +2874,7 @@ impl Timeline { auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), ingest_batch_size: self.conf.ingest_batch_size, + validate_wal_contiguity: self.conf.validate_wal_contiguity, }, broker_client, ctx, diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f831f5e48a..67429bff98 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -56,6 +56,7 @@ pub struct WalReceiverConf { pub auth_token: Option>, pub availability_zone: Option, pub ingest_batch_size: u64, + pub validate_wal_contiguity: bool, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 65f9d39078..1955345315 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -537,6 +537,7 @@ impl ConnectionManagerState { let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; let protocol = self.conf.protocol; + let validate_wal_contiguity = self.conf.validate_wal_contiguity; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -558,6 +559,7 @@ impl ConnectionManagerState { ctx, node_id, ingest_batch_size, + validate_wal_contiguity, ) .await; @@ -1563,6 +1565,7 @@ mod tests { auth_token: None, availability_zone: None, ingest_batch_size: 1, + validate_wal_contiguity: false, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 23db4f88d2..ff05a8f902 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -120,6 +120,7 @@ pub(super) async fn handle_walreceiver_connection( ctx: RequestContext, safekeeper_node: NodeId, ingest_batch_size: u64, + validate_wal_contiguity: bool, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -274,6 +275,7 @@ pub(super) async fn handle_walreceiver_connection( } => Some((format, compression)), }; + let mut expected_wal_start = startpoint; while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { @@ -340,13 +342,49 @@ pub(super) async fn handle_walreceiver_connection( ) })?; + // Guard against WAL gaps. 
If the start LSN of the PG WAL section + // from which the interpreted records were extracted, doesn't match + // the end of the previous batch (or the starting point for the first batch), + // then kill this WAL receiver connection and start a new one. + if validate_wal_contiguity { + if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn { + match raw_wal_start_lsn.cmp(&expected_wal_start) { + std::cmp::Ordering::Greater => { + let msg = format!( + "Gap in streamed WAL: [{}, {})", + expected_wal_start, raw_wal_start_lsn + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + std::cmp::Ordering::Less => { + // Other shards are reading WAL behind us. + // This is valid, but check that we received records + // that we haven't seen before. + if let Some(first_rec) = batch.records.first() { + if first_rec.next_record_lsn < last_rec_lsn { + let msg = format!( + "Received record with next_record_lsn multiple times ({} < {})", + first_rec.next_record_lsn, expected_wal_start + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + } + } + std::cmp::Ordering::Equal => {} + } + } + } + let InterpretedWalRecords { records, next_record_lsn, + raw_wal_start_lsn: _, } = batch; tracing::debug!( - "Received WAL up to {} with next_record_lsn={:?}", + "Received WAL up to {} with next_record_lsn={}", streaming_lsn, next_record_lsn ); @@ -423,12 +461,11 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. - let needs_last_record_lsn_advance = match next_record_lsn { - Some(lsn) if lsn > modification.get_lsn() => { - modification.set_lsn(lsn).unwrap(); - true - } - _ => false, + let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() { + modification.set_lsn(next_record_lsn).unwrap(); + true + } else { + false }; if uncommitted_records > 0 || needs_last_record_lsn_advance { @@ -446,9 +483,8 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); - if let Some(lsn) = next_record_lsn { - last_rec_lsn = lsn; - } + last_rec_lsn = next_record_lsn; + expected_wal_start = streaming_lsn; Some(streaming_lsn) } diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 5916675c3f..fb06339604 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -295,6 +295,10 @@ impl InterpretedWalReader { let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); + // Tracks the start of the PG WAL LSN from which the current batch of + // interpreted records originated. + let mut current_batch_wal_start_lsn: Option = None; + loop { tokio::select! { // Main branch for reading WAL and forwarding it @@ -302,7 +306,7 @@ impl InterpretedWalReader { let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); let WalBytes { wal, - wal_start_lsn: _, + wal_start_lsn, wal_end_lsn, available_wal_end_lsn, } = match wal { @@ -315,6 +319,12 @@ impl InterpretedWalReader { } }; + // We will already have a value if the previous chunks of WAL + // did not decode into anything useful. + if current_batch_wal_start_lsn.is_none() { + current_batch_wal_start_lsn = Some(wal_start_lsn); + } + wal_decoder.feed_bytes(&wal); // Deserialize and interpret WAL records from this batch of WAL. 
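To make the three-way comparison in the walreceiver change above easier to follow, here is a standalone sketch of the same decision. The `Lsn` newtype, `ContiguityError`, and `check_batch_contiguity` are illustrative names only; the real code uses `utils::lsn::Lsn`, `critical!`, and `WalReceiverError`, and also tolerates an absent `raw_wal_start_lsn` (older safekeepers), which this sketch assumes is present.

```rust
// Illustrative, trimmed-down version of the contiguity check added to
// walreceiver_connection.rs above, using a plain u64 newtype instead of utils::lsn::Lsn.
use std::cmp::Ordering;

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Lsn(u64);

#[derive(Debug)]
enum ContiguityError {
    /// The batch starts past the WAL position we expected next: streamed WAL has a gap.
    Gap { expected: Lsn, got: Lsn },
    /// The batch starts behind us but only contains records we already ingested.
    StaleRecords { first_next_record_lsn: Lsn, last_rec_lsn: Lsn },
}

fn check_batch_contiguity(
    expected_wal_start: Lsn,
    raw_wal_start_lsn: Lsn,
    first_record_next_lsn: Option<Lsn>,
    last_rec_lsn: Lsn,
) -> Result<(), ContiguityError> {
    match raw_wal_start_lsn.cmp(&expected_wal_start) {
        // Case 3 from the commit message: a gap in the streamed WAL.
        Ordering::Greater => Err(ContiguityError::Gap {
            expected: expected_wal_start,
            got: raw_wal_start_lsn,
        }),
        // Case 1: other shards are reading WAL behind us, which is fine as long as
        // the batch still advances past what we have already seen.
        Ordering::Less => match first_record_next_lsn {
            Some(first_next) if first_next < last_rec_lsn => Err(ContiguityError::StaleRecords {
                first_next_record_lsn: first_next,
                last_rec_lsn,
            }),
            _ => Ok(()),
        },
        // Case 2: the happy path, nothing to check.
        Ordering::Equal => Ok(()),
    }
}

fn main() {
    // Expected to resume at 0x100, but the batch was cut from WAL starting at 0x180: a gap.
    assert!(check_batch_contiguity(Lsn(0x100), Lsn(0x180), None, Lsn(0x80)).is_err());
    // Batch starts behind us, but its first record ends past our last ingested record: fine.
    assert!(check_batch_contiguity(Lsn(0x100), Lsn(0x80), Some(Lsn(0x120)), Lsn(0x100)).is_ok());
}
```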
@@ -363,7 +373,9 @@ impl InterpretedWalReader { let max_next_record_lsn = match max_next_record_lsn { Some(lsn) => lsn, - None => { continue; } + None => { + continue; + } }; // Update the current position such that new receivers can decide @@ -377,21 +389,38 @@ impl InterpretedWalReader { } } + let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap(); + // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. let mut shard_senders_to_remove = Vec::new(); for (shard, states) in &mut self.shard_senders { for state in states { - if max_next_record_lsn <= state.next_record_lsn { - continue; - } - let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); - let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), + let batch = if max_next_record_lsn > state.next_record_lsn { + // This batch contains at least one record that this shard has not + // seen yet. + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + InterpretedWalRecords { + records, + next_record_lsn: max_next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else if wal_end_lsn > state.next_record_lsn { + // All the records in this batch were seen by the shard + // However, the batch maps to a chunk of WAL that the + // shard has not yet seen. Notify it of the start LSN + // of the PG WAL chunk such that it doesn't look like a gap. + InterpretedWalRecords { + records: Vec::default(), + next_record_lsn: state.next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else { + // The shard has seen this chunk of WAL before. Skip it. + continue; }; let res = state.tx.send(Batch { @@ -403,7 +432,7 @@ impl InterpretedWalReader { if res.is_err() { shard_senders_to_remove.push(shard_sender_id); } else { - state.next_record_lsn = max_next_record_lsn; + state.next_record_lsn = std::cmp::max(state.next_record_lsn, max_next_record_lsn); } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 36af522535..1d282971b1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1167,15 +1167,15 @@ class NeonEnv: "max_batch_size": 32, } - # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): - # enable concurrent IO by default in tests and benchmarks. - # Compat tests are exempt because old versions fail to parse the new config. 
- get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if config.test_may_use_compatibility_snapshot_binaries: log.info( - "Forcing use of binary-built-in default to avoid forward-compatibility related test failures" + "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" ) - get_vectored_concurrent_io = None + else: + # Look for gaps in WAL received from safekeepeers + ps_cfg["validate_wal_contiguity"] = True + + get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: ps_cfg["get_vectored_concurrent_io"] = { "mode": self.pageserver_get_vectored_concurrent_io, From bd335fa751162f7b0c534b306f3eceb6edd89c4b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Feb 2025 20:29:14 +0200 Subject: [PATCH 71/78] Fix prototype of CheckPointReplicationState (#10907) ## Problem Occasionally removed (void) from definition of `CheckPointReplicationState` function ## Summary of changes Restore function prototype. https://github.com/neondatabase/postgres/pull/585 https://github.com/neondatabase/postgres/pull/586 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 023f1020ec..6ff5044377 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4 +Subproject commit 6ff50443773b69749e16da6db9d4f4b19064b4b7 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 6cb8d22079..261ed10e9b 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 6cb8d22079570b50fcaff29124d40807c1e63a82 +Subproject commit 261ed10e9b8c8dda01ad7aefb18e944e30aa161d diff --git a/vendor/revisions.json b/vendor/revisions.json index 3379cf1ba8..f85cec3a0b 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,11 +5,11 @@ ], "v16": [ "16.8", - "6cb8d22079570b50fcaff29124d40807c1e63a82" + "261ed10e9b8c8dda01ad7aefb18e944e30aa161d" ], "v15": [ "15.12", - "023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4" + "6ff50443773b69749e16da6db9d4f4b19064b4b7" ], "v14": [ "14.17", From 5b81a774fc698ea66f972af4463bfe0c5e9c8545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 20 Feb 2025 20:16:22 +0100 Subject: [PATCH 72/78] Update rust to 1.85.0 (#10914) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Announcement blog post](https://blog.rust-lang.org/2025/02/20/Rust-1.85.0.html). Prior update was in #10618. 
--- build-tools.Dockerfile | 2 +- rust-toolchain.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 317eded26e..c103ceaea5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.1 +ENV RUSTC_VERSION=1.85.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 38a7f202ba..591d60ea79 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.1" +channel = "1.85.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 3f376e44babb12e9a1c7c2f57c51628daccfed15 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 20 Feb 2025 21:48:06 +0200 Subject: [PATCH 73/78] Temporarily disable pg_duckdb (#10909) It clashed with pg_mooncake This is the same as the hotfix #10908 , but for the main branch, to keep the release and main branches in sync. In particular, we don't want to accidentally revert this temporary fix, if we cut a new release from main. --- compute/compute-node.Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 19633064a6..7169cbc41d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1669,7 +1669,11 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ + +# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake +# also depends on libduckdb, but a different version. +#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ + COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ From 0b9b391ea0366b896069705b3cdfdf82a9a8e901 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Feb 2025 22:21:44 +0200 Subject: [PATCH 74/78] Fix caclulation of prefetch ring position to fit in-flight request in resized ring buffer (#10899) ## Problem Refer https://github.com/neondatabase/neon/issues/10885 Wait position in ring buffer to restrict number of in-flight requests is not correctly calculated. 
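As a rough illustration of the off-by-one involved (a standalone sketch, not the C code in `pgxn/neon/pagestore_smgr.c`; `wait_target_for_resize` is a made-up helper, and it assumes that waiting on slot `i` only returns once every request in slots `<= i` has completed): after shrinking the ring to `newsize`, only the last `newsize` slots before `ring_unused` may still hold in-flight requests, so the wait target is the slot just below that window.

```rust
// Standalone sketch of the ring arithmetic, with illustrative slot counts.
// Slot indexes grow monotonically; `ring_unused` is the next index to be handed out.
fn wait_target_for_resize(ring_unused: u64, newsize: u64) -> Option<u64> {
    // Keep at most `newsize` slots, [ring_unused - newsize, ring_unused), in flight;
    // everything up to and including the slot just below that window must be drained.
    ring_unused.checked_sub(newsize + 1)
}

fn main() {
    // 10 slots handed out so far, shrinking the ring to 4: slots 0..=5 must complete,
    // which matches the `ring_unused - newsize - 1` expression in the patch below.
    assert_eq!(wait_target_for_resize(10, 4), Some(5));
}
```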
## Summary of changes Update condition and remove redundant assertion Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f1087a8ccb..6c812f347f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -474,8 +474,7 @@ readahead_buffer_resize(int newsize, void *extra) */ if (MyPState->n_requests_inflight > newsize) { - Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize); - prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize)); + prefetch_wait_for(MyPState->ring_unused - newsize - 1); Assert(MyPState->n_requests_inflight <= newsize); } From 9b42d1ce1a6e0c8bba0fca397d6d17d200da3d9c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 20 Feb 2025 22:38:42 +0100 Subject: [PATCH 75/78] pageserver: periodically log slow ongoing getpage requests (#10906) ## Problem We don't have good observability for "stuck" getpage requests. Resolves https://github.com/neondatabase/cloud/issues/23808. ## Summary of changes Log a periodic warning (every 30 seconds) if GetPage request execution is slow to complete, to aid in debugging stuck GetPage requests. This does not cover response flushing (we have separate logging for that), nor reading the request from the socket and batching it (expected to be insignificant and not straightforward to handle with the current protocol). This costs 95 nanoseconds on the happy path when awaiting a `tokio::task::yield_now()`: ``` warn_slow/enabled=false time: [45.716 ns 46.116 ns 46.687 ns] warn_slow/enabled=true time: [141.53 ns 141.83 ns 142.18 ns] ``` --- Cargo.lock | 1 + libs/utils/Cargo.toml | 3 ++- libs/utils/benches/README.md | 26 ++++++++++++++++++ libs/utils/benches/benchmarks.rs | 45 +++++++++++++++++++++++++++++--- libs/utils/src/logging.rs | 39 +++++++++++++++++++++++++++ pageserver/src/page_service.rs | 41 ++++++++++++++++++++--------- 6 files changed, 139 insertions(+), 16 deletions(-) create mode 100644 libs/utils/benches/README.md diff --git a/Cargo.lock b/Cargo.lock index 12232eaece..f62026696e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7616,6 +7616,7 @@ dependencies = [ "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 62e0f4cfba..5020d82adf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -27,7 +27,7 @@ humantime.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix = {workspace = true, features = [ "ioctl" ] } +nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true @@ -61,6 +61,7 @@ bytes.workspace = true criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true +pprof.workspace = true serde_assert.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md new file mode 100644 index 0000000000..e23ec268c2 --- /dev/null +++ b/libs/utils/benches/README.md @@ -0,0 +1,26 @@ +## Utils Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package utils + +# Specific file. +cargo bench --package utils --bench benchmarks + +# Specific benchmark. +cargo bench --package utils --bench benchmarks warn_slow/enabled=true + +# List available benchmarks. 
+cargo bench --package utils --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 44eb36387c..cff3792f3a 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,18 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pprof::criterion::{Output, PProfProfiler}; use utils::id; +use utils::logging::warn_slow; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_id_stringify, + bench_warn_slow, +); +criterion_main!(benches); pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. @@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -criterion_group!(benches, bench_id_stringify); -criterion_main!(benches); +pub fn bench_warn_slow(c: &mut Criterion) { + for enabled in [false, true] { + c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| { + run_bench(b, enabled).unwrap() + }); + } + + // The actual benchmark. + fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> { + const THRESHOLD: Duration = Duration::from_secs(1); + + // Use a multi-threaded runtime to avoid thread parking overhead when yielding. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling + // performance too. Use a simple noop future that yields once, to avoid any scheduler fast + // paths for a ready future. + if enabled { + b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now()))); + } else { + b.iter(|| runtime.block_on(tokio::task::yield_now())); + } + + Ok(()) + } +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 4a6069294d..95c69ac8ba 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,9 +1,13 @@ +use std::future::Future; use std::str::FromStr; +use std::time::Duration; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +use tokio::time::Instant; +use tracing::warn; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -318,6 +322,41 @@ impl std::fmt::Debug for SecretString { } } +/// Logs a periodic warning if a future is slow to complete. +/// +/// This is performance-sensitive as it's used on the GetPage read path. +#[inline] +pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future) -> O { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. 
+ let mut f = Box::pin(f); + + let started = Instant::now(); + let mut attempt = 1; + + loop { + // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common + // case where the timeout doesn't fire. + let deadline = started + attempt * threshold; + if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { + // NB: we check if we exceeded the threshold even if the timeout never fired, because + // scheduling or execution delays may cause the future to succeed even if it exceeds the + // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid + // false negatives. + let elapsed = started.elapsed(); + if elapsed >= threshold { + warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + } + return output; + } + + let elapsed = started.elapsed().as_secs_f64(); + warn!("slow {name} still running after {elapsed:.3}s",); + + attempt += 1; + } +} + #[cfg(test)] mod tests { use metrics::{core::Opts, IntCounterVec}; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0c8da6f2a8..7285697040 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,11 +34,13 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; +use strum_macros::IntoStaticStr; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::logging::warn_slow; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ @@ -81,6 +83,9 @@ use std::os::fd::AsRawFd; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// Threshold at which to log a warning about slow GetPage requests. +const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); + /////////////////////////////////////////////////////////////////////////////// pub struct Listener { @@ -594,6 +599,7 @@ struct BatchedTestRequest { /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, /// so that we don't keep the [`Timeline::gate`] open while the batch /// is being built up inside the [`spsc_fold`] (pagestream pipelining). +#[derive(IntoStaticStr)] enum BatchedFeMessage { Exists { span: Span, @@ -638,6 +644,10 @@ enum BatchedFeMessage { } impl BatchedFeMessage { + fn as_static_str(&self) -> &'static str { + self.into() + } + fn observe_execution_start(&mut self, at: Instant) { match self { BatchedFeMessage::Exists { timer, .. 
} @@ -1463,17 +1473,20 @@ impl PageServerHandler { } }; - let err = self - .pagesteam_handle_batched_message( + let result = warn_slow( + msg.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), &cancel, protocol_version, ctx, - ) - .await; - match err { + ), + ) + .await; + match result { Ok(()) => {} Err(e) => break e, } @@ -1636,13 +1649,17 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( - pgb_writer, - batch, - io_concurrency.clone(), - &cancel, - protocol_version, - &ctx, + warn_slow( + batch.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, + ), ) .await?; } From c214c32d3f7f29485226d7baf97ea6f7643769bd Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 20 Feb 2025 20:56:30 -0500 Subject: [PATCH 76/78] fix(pageserver): avoid creating empty job for gc-compaction (#10917) ## Problem This should be one last fix for https://github.com/neondatabase/neon/issues/10517. ## Summary of changes If a keyspace is empty, we might produce a gc-compaction job which covers no layer files. We should avoid generating such jobs so that the gc-compaction image layer can cover the full key range. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 31 +++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 58a87dbd5f..0361ce8cd1 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2212,7 +2212,7 @@ impl Timeline { let sub_compaction_max_job_size_mb = sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB); - let mut compact_jobs = Vec::new(); + let mut compact_jobs = Vec::::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); @@ -2299,16 +2299,25 @@ impl Timeline { } else { end }; - info!( - "splitting compaction job: {}..{}, estimated_size={}", - start, end, total_size - ); - compact_jobs.push(GcCompactJob { - dry_run: job.dry_run, - compact_key_range: start..end, - compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, - }); - current_start = Some(end); + if total_size == 0 && !compact_jobs.is_empty() { + info!( + "splitting compaction job: {}..{}, estimated_size={}, extending the previous job", + start, end, total_size + ); + compact_jobs.last_mut().unwrap().compact_key_range.end = end; + current_start = Some(end); + } else { + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_jobs.push(GcCompactJob { + dry_run: job.dry_run, + compact_key_range: start..end, + compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + }); + current_start = Some(end); + } } } Ok(compact_jobs) From 5b2afd953cca7213c0b04e95345057fd874827b4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 21 Feb 2025 06:02:10 +0000 Subject: [PATCH 77/78] Storage release 2025-02-21 From 97e2e27f682003bcc8ac1c9e625bc3675f394264 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 21 Feb 2025 15:45:00 +0000 Subject: [PATCH 78/78] storcon: use `Duration` for duration's in the storage controller tenant config (#10928) ## Problem The storage controller treats durations in the tenant config as strings. These are loaded from the db. The pageserver maps these durations to a seconds only format and we always get a mismatch compared to what's in the db. ## Summary of changes Treat durations as durations inside the storage controller and not as strings. Nothing changes in the cross service API's themselves or the way things are stored in the db. I also added some logging which I would have made the investigation a 10min job: 1. Reason for why the reconciliation was spawned 2. 
Location config diff between the observed and wanted states --- Cargo.lock | 18 ++++ Cargo.toml | 1 + control_plane/src/pageserver.rs | 49 +++++++--- control_plane/storcon_cli/src/main.rs | 2 +- libs/pageserver_api/src/models.rs | 89 ++++++++++++++----- pageserver/src/tenant/config.rs | 47 +++------- storage_controller/Cargo.toml | 1 + storage_controller/src/reconciler.rs | 27 +++++- storage_controller/src/service.rs | 10 ++- storage_controller/src/tenant_shard.rs | 30 +++++-- .../regress/test_storage_controller.py | 40 +++++++++ 11 files changed, 234 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f62026696e..f0dbdff3ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1874,6 +1874,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -3331,6 +3337,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-structural-diff" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf" +dependencies = [ + "difflib", + "regex", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "9.2.0" @@ -6443,6 +6460,7 @@ dependencies = [ "humantime", "hyper 0.14.30", "itertools 0.10.5", + "json-structural-diff", "lasso", "measured", "metrics", diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..21310ce6ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -210,6 +210,7 @@ rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } +json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing env_logger = "0.10" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 28d130d9e0..2bf89b7bfa 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -335,13 +335,21 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings + .remove("checkpoint_timeout") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_period: settings + .remove("compaction_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) @@ -387,7 +395,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) @@ -403,13 +414,20 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 
'image_creation_preempt_threshold' as integer")?, - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + pitr_interval: settings.remove("pitr_interval") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'pitr_interval' as duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings .remove("lagging_wal_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) @@ -427,8 +445,14 @@ impl PageServerNode { .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, + heatmap_period: settings + .remove("heatmap_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) @@ -439,10 +463,15 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length: settings.remove("lsn_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 953ade83ad..40b86e4110 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -959,7 +959,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), - heatmap_period: Some("300s".to_string()), + heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd7bea2916..1164048229 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -526,9 +526,13 @@ pub struct TenantConfigPatch { #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Option, pub compaction_target_size: Option, - pub compaction_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub compaction_period: Option, pub compaction_threshold: Option, pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy @@ -539,22 +543,38 @@ pub struct TenantConfig { pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, pub 
gc_horizon: Option, - pub gc_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub gc_period: Option, pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, - pub heatmap_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, pub image_creation_preempt_threshold: Option, - pub lsn_lease_length: Option, - pub lsn_lease_length_for_ts: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, pub rel_size_v2_enabled: Option, @@ -564,7 +584,10 @@ pub struct TenantConfig { } impl TenantConfig { - pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig { + pub fn apply_patch( + self, + patch: TenantConfigPatch, + ) -> Result { let Self { mut checkpoint_distance, mut checkpoint_timeout, @@ -604,11 +627,17 @@ impl TenantConfig { } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); - patch.checkpoint_timeout.apply(&mut checkpoint_timeout); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); patch .compaction_target_size .apply(&mut compaction_target_size); - patch.compaction_period.apply(&mut compaction_period); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch .compaction_upper_limit @@ -626,15 +655,25 @@ impl TenantConfig { .apply(&mut l0_flush_stall_threshold); patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); - patch.gc_period.apply(&mut gc_period); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); patch .image_creation_threshold .apply(&mut image_creation_threshold); - patch.pitr_interval.apply(&mut pitr_interval); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); patch .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? .apply(&mut walreceiver_connect_timeout); - patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); patch.eviction_policy.apply(&mut eviction_policy); patch @@ -642,8 +681,12 @@ impl TenantConfig { .apply(&mut min_resident_size_override); patch .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? 
.apply(&mut evictions_low_residence_duration_metric_threshold); - patch.heatmap_period.apply(&mut heatmap_period); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); patch.lazy_slru_download.apply(&mut lazy_slru_download); patch .timeline_get_throttle @@ -654,9 +697,13 @@ impl TenantConfig { patch .image_creation_preempt_threshold .apply(&mut image_creation_preempt_threshold); - patch.lsn_lease_length.apply(&mut lsn_lease_length); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? .apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); patch @@ -673,7 +720,7 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); - Self { + Ok(Self { checkpoint_distance, checkpoint_timeout, compaction_target_size, @@ -709,7 +756,7 @@ impl TenantConfig { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, - } + }) } } @@ -2503,7 +2550,7 @@ mod tests { ..base.clone() }; - let patched = base.apply_patch(decoded.config); + let patched = base.apply_patch(decoded.config).unwrap(); assert_eq!(patched, expected); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c6bcfdf2fb..ab4c4c935d 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -693,16 +693,15 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { + // TODO(vlad): These are now the same, but they have different serialization logic. + // Can we merge them? 
fn from(value: TenantConfOpt) -> Self { - fn humantime(d: Duration) -> String { - format!("{}s", d.as_secs()) - } Self { checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout.map(humantime), + checkpoint_timeout: value.checkpoint_timeout, compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period.map(humantime), + compaction_period: value.compaction_period, compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, compaction_l0_first: value.compaction_l0_first, @@ -711,24 +710,23 @@ impl From for models::TenantConfig { l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, - gc_period: value.gc_period.map(humantime), + gc_period: value.gc_period, image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval.map(humantime), - walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), - lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + pitr_interval: value.pitr_interval, + walreceiver_connect_timeout: value.walreceiver_connect_timeout, + lagging_wal_timeout: value.lagging_wal_timeout, max_lsn_wal_lag: value.max_lsn_wal_lag, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value - .evictions_low_residence_duration_metric_threshold - .map(humantime), - heatmap_period: value.heatmap_period.map(humantime), + .evictions_low_residence_duration_metric_threshold, + heatmap_period: value.heatmap_period, lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, image_creation_preempt_threshold: value.image_creation_preempt_threshold, - lsn_lease_length: value.lsn_lease_length.map(humantime), - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + lsn_lease_length: value.lsn_lease_length, + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: value.rel_size_v2_enabled, @@ -760,29 +758,10 @@ mod tests { assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - #[test] - fn test_try_from_models_tenant_config_err() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5a".to_string()), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); - - assert!( - tenant_conf_opt.is_err(), - "Suceeded to convert TenantConfig to TenantConfOpt" - ); - - let expected_error_str = - "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; - assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); - } - #[test] fn test_try_from_models_tenant_config_success() { let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5s".to_string()), + lagging_wal_timeout: Some(Duration::from_secs(5)), ..TenantConfig::default() }; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 73dc1a5c10..08c80bc141 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -24,6 +24,7 @@ hex.workspace = true hyper0.workspace = true humantime.workspace = true 
itertools.workspace = true +json-structural-diff.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 48f0804926..ba3946fa3e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; +use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, @@ -24,7 +25,7 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; -const DEFAULT_HEATMAP_PERIOD: &str = "60s"; +const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. @@ -880,7 +881,27 @@ impl Reconciler { self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + let diff = match observed { + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let diff = JsonDiff::diff( + &serde_json::to_value(observed.clone()).unwrap(), + &serde_json::to_value(wanted_conf.clone()).unwrap(), + false, + ); + + if let Some(json_diff) = diff.diff { + serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()) + } else { + "unknown".to_string() + } + } + _ => "full".to_string(), + }; + + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}"); // Because `node` comes from a ref to &self, clone it before calling into a &mut self // function: this could be avoided by refactoring the state mutated by location_config into @@ -1180,7 +1201,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig let mut config = config.clone(); if has_secondaries { if config.heatmap_period.is_none() { - config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD); } } else { config.heatmap_period = None; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 25a1cb4252..01ace40b84 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2921,7 +2921,9 @@ impl Service { first }; - let updated_config = base.apply_patch(patch); + let updated_config = base + .apply_patch(patch) + .map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?; self.set_tenant_config_and_reconcile(tenant_id, updated_config) .await } @@ -6649,11 +6651,12 @@ impl Service { ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); - match reconcile_needed { + let reconcile_reason = match reconcile_needed { ReconcileNeeded::No => return None, ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), - ReconcileNeeded::Yes => { + ReconcileNeeded::Yes(reason) => { // Fall through to try and acquire units for spawning reconciler + reason } }; @@ -6692,6 +6695,7 @@ impl Service { }; shard.spawn_reconciler( + reconcile_reason, &self.result_tx, nodes, &self.compute_hook, diff --git 
a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 219c0dffe7..56a36dc2df 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -481,7 +481,14 @@ pub(crate) enum ReconcileNeeded { /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] - Yes, + Yes(ReconcileReason), +} + +#[derive(Debug)] +pub(crate) enum ReconcileReason { + ActiveNodesDirty, + UnknownLocation, + PendingComputeNotification, } /// Pending modification to the observed state of a tenant shard. @@ -1341,12 +1348,18 @@ impl TenantShard { let active_nodes_dirty = self.dirty(pageservers); - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. - let do_reconcile = - active_nodes_dirty || dirty_observed || self.pending_compute_notification; + let reconcile_needed = match ( + active_nodes_dirty, + dirty_observed, + self.pending_compute_notification, + ) { + (true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty), + (_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation), + (_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification), + _ => ReconcileNeeded::No, + }; - if !do_reconcile { + if matches!(reconcile_needed, ReconcileNeeded::No) { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1389,7 +1402,7 @@ impl TenantShard { } } - ReconcileNeeded::Yes + reconcile_needed } /// Ensure the sequence number is set to a value where waiting for this value will make us wait @@ -1479,6 +1492,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, + reason: ReconcileReason, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, @@ -1538,7 +1552,7 @@ impl TenantShard { let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; - tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})"); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7e895422d2..d18cbb3393 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3817,3 +3817,43 @@ def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): nodes = env.storage_controller.node_list() assert len(nodes) == 1 assert nodes[0]["listen_https_port"] is None + + +def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder): + """ + Validate that a storage controller restart with no shards in a transient state + performs zero reconciliations at start-up. Implicitly, this means that the location + configs returned by the pageserver are identical to the persisted state in the + storage controller database. 
+    """
+    neon_env_builder.num_pageservers = 1
+    neon_env_builder.storage_controller_config = {
+        "start_as_candidate": False,
+    }
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(
+        tenant_id, shard_count=2, tenant_config={"pitr_interval": "1h2m3s"}
+    )
+
+    env.storage_controller.reconcile_until_idle()
+
+    reconciles_before_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+
+    assert reconciles_before_restart != 0
+
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    env.storage_controller.reconcile_until_idle()
+
+    reconciles_after_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+
+    assert reconciles_after_restart == 0
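
For reference, a minimal stand-alone sketch (not part of this patch; `ExampleConfig`, its single field, and the sample config values are illustrative) of the two mechanisms the patch relies on: `humantime_serde` (de)serialization of `Option<Duration>` tenant-config fields, and `json_structural_diff::JsonDiff` for logging the observed-vs-wanted location config delta.

```rust
use std::error::Error;
use std::time::Duration;

use json_structural_diff::JsonDiff;
use serde::{Deserialize, Serialize};

// Illustrative stand-in for the real TenantConfig, which has many more fields.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct ExampleConfig {
    #[serde(default)]
    #[serde(with = "humantime_serde")]
    pitr_interval: Option<Duration>,
}

fn main() -> Result<(), Box<dyn Error>> {
    // Duration fields are accepted in the same human-readable form the API takes,
    // e.g. the "1h2m3s" used by the regression test above.
    let cfg: ExampleConfig = serde_json::from_str(r#"{"pitr_interval": "1h2m3s"}"#)?;
    assert_eq!(cfg.pitr_interval, Some(Duration::from_secs(3723)));

    // Serialization emits a humantime string that parses back to the same value.
    let json = serde_json::to_string(&cfg)?;
    let roundtrip: ExampleConfig = serde_json::from_str(&json)?;
    assert_eq!(roundtrip, cfg);

    // Explicit parsing of raw strings, as the control-plane and apply_patch
    // code paths now do instead of storing the string itself.
    assert_eq!(humantime::parse_duration("300s")?, Duration::from_secs(300));

    // Structural JSON diff, mirroring what the reconciler now logs when the
    // observed location config differs from the wanted one (sample values).
    let observed = serde_json::json!({"heatmap_period": "60s", "gc_horizon": 1024});
    let wanted = serde_json::json!({"heatmap_period": "300s", "gc_horizon": 1024});
    if let Some(delta) = JsonDiff::diff(&observed, &wanted, false).diff {
        println!("config delta: {}", serde_json::to_string(&delta)?);
    }

    Ok(())
}
```

The sketch assumes `serde` (with derive), `serde_json`, `humantime`, `humantime_serde`, and `json-structural-diff` as dependencies, all of which are present in the workspace after this patch.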