From 79929bb1b654391e63d0f02f668993834806b837 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 8 Nov 2024 10:35:03 +0200 Subject: [PATCH 01/28] Disable `rust_2024_compatibility` lint option (#9615) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with nightly rust compiler, I'm getting a lot of errors like this: error: `if let` assigns a shorter lifetime since Edition 2024 --> proxy/src/auth/backend/jwt.rs:226:16 | 226 | if let Some(permit) = self.try_acquire_permit() { | ^^^^^^^^^^^^^^^^^^^------------------------- | | | this value has a significant drop implementation which may observe a major change in drop order and requires your discretion | = warning: this changes meaning in Rust 2024 = note: for more information, see issue #124085 help: the value is now dropped here in Edition 2024 --> proxy/src/auth/backend/jwt.rs:241:13 | 241 | } else { | ^ note: the lint level is defined here --> proxy/src/lib.rs:8:5 | 8 | rust_2024_compatibility | ^^^^^^^^^^^^^^^^^^^^^^^ = note: `#[deny(if_let_rescope)]` implied by `#[deny(rust_2024_compatibility)]` and this: error: these values and local bindings have significant drop implementation that will have a different drop order from that of Edition 2021 --> proxy/src/auth/backend/jwt.rs:376:18 | 369 | let client = Client::builder() | ------ these values have significant drop implementation and will observe changes in drop order under Edition 2024 ... 376 | map: DashMap::default(), | ^^^^^^^^^^^^^^^^^^ | = warning: this changes meaning in Rust 2024 = note: for more information, see issue #123739 = note: `#[deny(tail_expr_drop_order)]` implied by `#[deny(rust_2024_compatibility)]` They are caused by the `rust_2024_compatibility` lint option. When we actually switch to the 2024 edition, it makes sense to go through all these and check that the drop order changes don't break anything, but in the meanwhile, there's no easy way to avoid these errors. Disable it, to allow compiling with nightly again. Co-authored-by: Arpad Müller --- proxy/src/lib.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index f95d645c23..ad7e1d2771 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,12 +1,6 @@ // rustc lints/lint groups // https://doc.rust-lang.org/rustc/lints/groups.html -#![deny( - deprecated, - future_incompatible, - let_underscore, - nonstandard_style, - rust_2024_compatibility -)] +#![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)] #![warn(clippy::all, clippy::pedantic, clippy::cargo)] // List of denied lints from the clippy::restriction group. // https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction From 027889b06ca9324604575183d84aede5f0c4c906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 8 Nov 2024 10:44:59 +0100 Subject: [PATCH 02/28] ci: use set-docker-config-dir from dev-actions (#9638) set-docker-config-dir was replicated over multiple repositories. 
The replica of this action was removed from this repository and it's using the version from github.com/neondatabase/dev-actions instead --- .../actions/set-docker-config-dir/action.yml | 36 ------------------- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 8 ++--- 3 files changed, 5 insertions(+), 41 deletions(-) delete mode 100644 .github/actions/set-docker-config-dir/action.yml diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml deleted file mode 100644 index 3ee8bec8c6..0000000000 --- a/.github/actions/set-docker-config-dir/action.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: "Set custom docker config directory" -description: "Create a directory for docker config and set DOCKER_CONFIG" - -# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings -runs: - using: "composite" - steps: - - name: Show warning on GitHub-hosted runners - if: runner.environment == 'github-hosted' - shell: bash -euo pipefail {0} - run: | - # Using the following environment variables to find a path to the workflow file - # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch - # ${GITHUB_REPOSITORY} - octocat/hello-world - # ${GITHUB_REF} - refs/heads/my_branch - # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables - - filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"} - filename=${filename_with_ref%"@$GITHUB_REF"} - - # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message - title='Unnecessary usage of `.github/actions/set-docker-config-dir`' - message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners' - echo "::warning file=${filename},title=${title}::${message}" - - - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7 - env: - DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }} - with: - main: | - mkdir -p "${DOCKER_CONFIG}" - echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV - post: | - if [ -d "${DOCKER_CONFIG}" ]; then - rm -r "${DOCKER_CONFIG}" - fi diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 10750089b2..82b065c524 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -64,7 +64,7 @@ jobs: - uses: actions/checkout@v4 - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bba51ddc92..bcf021a9a1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -552,7 +552,7 @@ jobs: with: submodules: true - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -643,7 +643,7 @@ jobs: with: submodules: true - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -824,7 
+824,7 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -860,7 +860,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} From aa9112efce42869472fbee7bfa0048f12d3ff81a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 8 Nov 2024 10:16:04 +0000 Subject: [PATCH 03/28] pageserver: add `no_sync` for use in regression tests (1/2) (#9677) ## Problem In test environments, the `syncfs` that the pageserver does on startup can take a long time, as other tests running concurrently might have many gigabytes of dirty pages. ## Summary of changes - Add a `no_sync` option to the pageserver's config. - Skip syncfs on startup if this is set - A subsequent PR (https://github.com/neondatabase/neon/pull/9678) will enable this by default in tests. We need to wait until after the next release to avoid breaking compat tests, which would fail if we set no_sync & use an old pageserver binary. Q: Why is this a different mechanism than safekeeper, which as a --no-sync CLI? A: Because the way we manage pageservers in neon_local depends on the pageserver.toml containing the full configuration, whereas safekeepers have a config file which is neon-local-specific and can drive a CLI flag. Q: Why is the option no_sync rather than sync? A: For boolean configs with a dangerous value, it's preferable to make "false" the safe option, so that any downstream future config tooling that might have a "booleans are false by default" behavior (e.g. golang structs) is safe by default. Q: Why only skip the syncfs, and not all fsyncs? A: Skipping all fsyncs would require more code changes, and the most acute problem isn't fsyncs themselves (these just slow down a running test), it's the syncfs (which makes a pageserver startup slow as a result of _other_ tests) --- control_plane/src/bin/neon_local.rs | 3 +++ control_plane/src/local_env.rs | 10 ++++++++++ control_plane/src/pageserver.rs | 1 + libs/pageserver_api/src/config.rs | 3 +++ pageserver/src/bin/pageserver.rs | 18 +++++++++++------- pageserver/src/config.rs | 5 +++++ 6 files changed, 33 insertions(+), 7 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 48438adf43..c4063bbd1a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -944,6 +944,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, other: Default::default(), + // Typical developer machines use disks with slow fsync, and we don't care + // about data integrity: disable disk syncs. 
+ no_sync: true, } }) .collect(), diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9dc2a0c36b..032c88a829 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -225,6 +225,7 @@ pub struct PageServerConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + pub no_sync: bool, } impl Default for PageServerConf { @@ -235,6 +236,7 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + no_sync: false, } } } @@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub no_sync: bool, #[serde(flatten)] pub other: HashMap, } @@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, other: _, } = conf; Self { @@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr: listen_http_addr.clone(), pg_auth_type: *pg_auth_type, http_auth_type: *http_auth_type, + no_sync: *no_sync, } } } @@ -569,6 +575,8 @@ impl LocalEnv { listen_http_addr: String, pg_auth_type: AuthType, http_auth_type: AuthType, + #[serde(default)] + no_sync: bool, } let config_toml_path = dentry.path().join("pageserver.toml"); let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( @@ -591,6 +599,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, } = config_toml; let IdentityTomlSubset { id: identity_toml_id, @@ -607,6 +616,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, }; pageservers.push(conf); } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index eab76e14c3..ae5e22ddc6 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -273,6 +273,7 @@ impl PageServerNode { ) })?; let args = vec!["-D", datadir_path_str]; + background_process::start_process( "pageserver", &datadir, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 00cc426c3c..6de34fdd35 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -106,6 +106,8 @@ pub struct ConfigToml { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub no_sync: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -389,6 +391,7 @@ impl Default for ConfigToml { l0_flush: None, virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), + no_sync: None, } } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 782122139e..fe2a31167d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -154,13 +154,17 @@ fn main() -> anyhow::Result<()> { }, }; - let started = Instant::now(); - syncfs(dirfd)?; - let elapsed = started.elapsed(); - info!( - elapsed_ms = elapsed.as_millis(), - "made tenant directory contents durable" - ); + if conf.no_sync { + info!("Skipping syncfs on startup"); + } else { + let started = Instant::now(); + syncfs(dirfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "made tenant directory contents durable" + ); + } } // Initialize up failpoints support diff --git 
a/pageserver/src/config.rs b/pageserver/src/config.rs index 06d4326459..d62066ac22 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -178,6 +178,9 @@ pub struct PageServerConf { /// Direct IO settings pub virtual_file_io_mode: virtual_file::IoMode, + + /// Optionally disable disk syncs (unsafe!) + pub no_sync: bool, } /// Token for authentication to safekeepers @@ -332,6 +335,7 @@ impl PageServerConf { concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, tenant_config, + no_sync, } = config_toml; let mut conf = PageServerConf { @@ -409,6 +413,7 @@ impl PageServerConf { .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), + no_sync: no_sync.unwrap_or(false), }; // ------------------------------------------------------------ From 17c002b660a173bb6cdec07ae77103cd8580ee98 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 8 Nov 2024 14:54:58 +0200 Subject: [PATCH 04/28] Do not copy logical replication slots to replica (#9458) ## Problem Replication slots are now persisted using the AUX files mechanism and included in the basebackup when a replica is launched. These slots are not used by the replica in any way, but they hold back WAL, which may cause local disk space exhaustion. ## Summary of changes Add a `--replica` parameter to the basebackup request and do not include replication slot state files in the basebackup for replicas. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- compute_tools/src/compute.rs | 28 +++++++++-- .../test_physical_and_logical_replicaiton.py | 50 +++++++++++++++++++ 2 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 test_runner/regress/test_physical_and_logical_replicaiton.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d3e42fe618..0a8cb14058 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -364,11 +364,29 @@ impl ComputeNode { let pageserver_connect_micros = start_time.elapsed().as_micros() as u64; let basebackup_cmd = match lsn { - Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id), - _ => format!( - "basebackup {} {} {} --gzip", - spec.tenant_id, spec.timeline_id, lsn - ), + Lsn(0) => { + if spec.spec.mode != ComputeMode::Primary { + format!( + "basebackup {} {} --gzip --replica", + spec.tenant_id, spec.timeline_id + ) + } else { + format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id) + } + } + _ => { + if spec.spec.mode != ComputeMode::Primary { + format!( + "basebackup {} {} {} --gzip --replica", + spec.tenant_id, spec.timeline_id, lsn + ) + } else { + format!( + "basebackup {} {} {} --gzip", + spec.tenant_id, spec.timeline_id, lsn + ) + } + } }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py new file mode 100644 index 0000000000..ec14e08a14 --- /dev/null +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import time + +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync + + +def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + n_records = 100000 + + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=["min_wal_size=32MB", "max_wal_size=64MB"], + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + p_cur.execute("create publication pub1 for table t") + + # start subscriber to primary + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)") + connstr = primary.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + time.sleep(1) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["min_wal_size=32MB", "max_wal_size=64MB"], + ) + + s_con = secondary.connect() + s_cur = s_con.cursor() + + for pk in range(n_records): + p_cur.execute("insert into t (pk) values (%s)", (pk,)) + + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_records + + logical_replication_sync(vanilla_pg, primary) + assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records + + # Check that LR slot is not copied to replica + s_cur.execute("select count(*) from pg_replication_slots") + assert s_cur.fetchall()[0][0] == 0 From 3525d2e381c008904d05347742771f021325c6f8 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 8 Nov 2024 09:15:38 -0600 Subject: [PATCH 05/28] Update TimescaleDB to 2.17.1 for PG 17 Signed-off-by: Tristan Partin --- 
compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f070f66c0a..6efef9e969 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -559,8 +559,8 @@ RUN case "${PG_VERSION}" in \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ "v17") \ - export TIMESCALEDB_VERSION=2.17.0 \ - export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + export TIMESCALEDB_VERSION=2.17.1 \ + export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \ ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ From f561cbe1c709f07c507ffe642e975838ee430ef6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 8 Nov 2024 10:35:27 -0500 Subject: [PATCH 06/28] fix(pageserver): drain upload queue before detaching ancestor (#9651) In INC-317 https://neondb.slack.com/archives/C033RQ5SPDH/p1730815677932209, we saw an interesting series of operations that would remove valid layer files existing in the layer map. * Timeline A starts compaction and generates an image layer Z but not uploading it yet. * Timeline B/C starts ancestor detaching (which should not affect timeline A) * The tenant gets restarted as part of the ancestor detaching process, without increasing the generation number. * Timeline A reloads, discovering the layer Z is a future layer, and schedules a **deletion into the deletion queue**. This means that the file will be deleted any time in the future. * Timeline A starts compaction and generates layer Z again, adding it to the layer map. Note that because we don't bump generation number during ancestor detach, it has the same filename + generation number as the original Z. * Timeline A deletes layer Z from s3 + disk, and now we have a dangling reference in the layer map, blocking all compaction/logical_size_calculation process. ## Summary of changes * We wait until all layers to be uploaded before shutting down the tenants in `Flush` mode. * Ancestor detach restarts now use this mode. * Ancestor detach also waits for remote queue completion before starting the detaching process. * The patch ensures that we don't have any future image layer (or something similar) after restart, but not fixing the underlying problem around generation numbers. --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 15 +++++++++++ pageserver/src/tenant/mgr.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 12 +++++++++ pageserver/src/tenant/timeline.rs | 25 ++++++++++++++----- 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 72eb3e7ade..d57bd98e95 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2169,6 +2169,21 @@ async fn timeline_detach_ancestor_handler( let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; + // Flush the upload queues of all timelines before detaching ancestor. We do the same thing again + // during shutdown. This early upload ensures the pageserver does not need to upload too many + // things and creates downtime during timeline reloads. 
+ for timeline in tenant.list_timelines() { + timeline + .remote_client + .wait_completion() + .await + .map_err(|e| { + ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into()) + })?; + } + + tracing::info!("all timeline upload queues are drained"); + let timeline = tenant.get_timeline(timeline_id, true)?; let progress = timeline diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a4c458b737..4fc9d740c8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1959,7 +1959,7 @@ impl TenantManager { attempt.before_reset_tenant(); let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { + match tenant.shutdown(progress, ShutdownMode::Flush).await { Ok(()) => { slot_guard.drop_old_value().expect("it was just shutdown"); } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 0aa8d61036..b37c16e133 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2201,6 +2201,18 @@ impl RemoteTimelineClient { inner.initialized_mut()?; Ok(UploadQueueAccessor { inner }) } + + pub(crate) fn no_pending_work(&self) -> bool { + let inner = self.upload_queue.lock().unwrap(); + match &*inner { + UploadQueue::Uninitialized + | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true, + UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => { + x.upload_queue_for_deletion.no_pending_work() + } + UploadQueue::Initialized(x) => x.no_pending_work(), + } + } } pub(crate) struct UploadQueueAccessor<'a> { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6e082aecf5..4d086df2d1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -852,6 +852,10 @@ pub(crate) enum ShutdownMode { /// While we are flushing, we continue to accept read I/O for LSNs ingested before /// the call to [`Timeline::shutdown`]. FreezeAndFlush, + /// Only flush the layers to the remote storage without freezing any open layers. This is the + /// mode used by ancestor detach and any other operations that reloads a tenant but not increasing + /// the generation number. + Flush, /// Shut down immediately, without waiting for any open layers to flush. Hard, } @@ -1678,11 +1682,6 @@ impl Timeline { pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - let try_freeze_and_flush = match mode { - ShutdownMode::FreezeAndFlush => true, - ShutdownMode::Hard => false, - }; - // Regardless of whether we're going to try_freeze_and_flush // or not, stop ingesting any more data. Walreceiver only provides // cancellation but no "wait until gone", because it uses the Timeline::gate. @@ -1704,7 +1703,7 @@ impl Timeline { // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); - if try_freeze_and_flush { + if let ShutdownMode::FreezeAndFlush = mode { if let Some((open, frozen)) = self .layers .read() @@ -1746,6 +1745,20 @@ impl Timeline { warn!("failed to freeze and flush: {e:#}"); } } + + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. 
+ if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } + } + + if let ShutdownMode::Flush = mode { + // drain the upload queue + self.remote_client.shutdown().await; + if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } } // Signal any subscribers to our cancellation token to drop out From 30680d1f3289093b532ecf2a417b6fe3309ea57b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 8 Nov 2024 17:00:31 +0000 Subject: [PATCH 07/28] tests: use tigther storcon scopes (#9696) ## Problem https://github.com/neondatabase/neon/pull/9596 did not update tests because that would've broken the compat tests. ## Summary of Changes Use infra scope where possible. --- test_runner/fixtures/auth_tokens.py | 1 + test_runner/fixtures/neon_fixtures.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py index 8ebaf61e5e..be16be81de 100644 --- a/test_runner/fixtures/auth_tokens.py +++ b/test_runner/fixtures/auth_tokens.py @@ -45,3 +45,4 @@ class TokenScope(str, Enum): SAFEKEEPER_DATA = "safekeeperdata" TENANT = "tenant" SCRUBBER = "scrubber" + INFRA = "infra" diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e23f46d1ca..83c68794c3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1782,7 +1782,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "PUT", f"{self.api}/control/v1/node/{node_id}/drain", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def cancel_node_drain(self, node_id): @@ -1790,7 +1790,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "DELETE", f"{self.api}/control/v1/node/{node_id}/drain", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def node_fill(self, node_id): @@ -1798,7 +1798,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "PUT", f"{self.api}/control/v1/node/{node_id}/fill", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def cancel_node_fill(self, node_id): @@ -1806,14 +1806,14 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "DELETE", f"{self.api}/control/v1/node/{node_id}/fill", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def node_status(self, node_id): response = self.request( "GET", f"{self.api}/control/v1/node/{node_id}", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) return response.json() @@ -1829,7 +1829,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "GET", f"{self.api}/control/v1/node", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) return response.json() From b6bc954c5d3846214ee0a38010dd0228a7c2d7f5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 8 Nov 2024 17:32:56 +0000 Subject: [PATCH 08/28] CI: move check codestyle python to reusable workflow and run on a merge_group (#9683) ## Problem To prevent breaking main after Python 3.11 PR get merged we need to enable merge queue and run `check-codestyle-python` job on it ## Summary of changes - Move `check-codestyle-python` to a reusable workflow - Run this workflow on `merge_group` event --- 
.github/workflows/_check-codestyle-python.yml | 37 +++++++++++++++ .github/workflows/build_and_test.yml | 34 ++------------ .github/workflows/pre-merge-checks.yml | 47 +++++++++++++++++++ 3 files changed, 89 insertions(+), 29 deletions(-) create mode 100644 .github/workflows/_check-codestyle-python.yml create mode 100644 .github/workflows/pre-merge-checks.yml diff --git a/.github/workflows/_check-codestyle-python.yml b/.github/workflows/_check-codestyle-python.yml new file mode 100644 index 0000000000..9ae28a1379 --- /dev/null +++ b/.github/workflows/_check-codestyle-python.yml @@ -0,0 +1,37 @@ +name: Check Codestyle Python + +on: + workflow_call: + inputs: + build-tools-image: + description: 'build-tools image' + required: true + type: string + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + check-codestyle-python: + runs-on: [ self-hosted, small ] + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} + + - run: ./scripts/pysync + + - run: poetry run ruff check . + - run: poetry run ruff format --check . + - run: poetry run mypy . diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bcf021a9a1..d415e20db8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -90,35 +90,10 @@ jobs: check-codestyle-python: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, small ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - run: ./scripts/pysync - - - name: Run `ruff check` to ensure code format - run: poetry run ruff check . - - - name: Run `ruff format` to ensure code format - run: poetry run ruff format --check . - - - name: Run mypy to check types - run: poetry run mypy . + uses: ./.github/workflows/_check-codestyle-python.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit check-codestyle-jsonnet: needs: [ check-permissions, build-build-tools-image ] @@ -141,6 +116,7 @@ jobs: # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: + needs: [ check-permissions ] runs-on: ubuntu-22.04 steps: - name: Checkout diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml new file mode 100644 index 0000000000..40ce644eb6 --- /dev/null +++ b/.github/workflows/pre-merge-checks.yml @@ -0,0 +1,47 @@ +name: + +on: + merge_group: + branches: + - main + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + get-changed-files: + runs-on: ubuntu-22.04 + outputs: + any_changed: ${{ steps.src.outputs.any_changed }} + steps: + - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c # v45.0.3 + id: src + with: + files: | + .github/workflows/pre-merge-checks.yml + **/**.py + poetry.lock + pyproject.toml + + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES + env: + ALL_CHANGED_FILES: ${{ steps.src.outputs.all_changed_files }} + run: echo "${ALL_CHANGED_FILES}" + + check-build-tools-image: + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + check-codestyle-python: + needs: [ build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-python.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit From 34a4eb6f2a7ddb9bd98b1b1f7b8959fa57b3007a Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 8 Nov 2024 12:19:18 -0600 Subject: [PATCH 09/28] Switch compute-related locales to C.UTF-8 by default Right now, our environments create databases with the C locale, which is really unfortunate for users who have data stored in other languages that they want to analyze. For instance, show_trgm on Hebrew text currently doesn't work in staging or production. I don't envision this being the final solution. I think this is just a way to set a known value so the pageserver doesn't use its parent environment. The final solution to me is exposing initdb parameters to users in the console. Then they could use a different locale or encoding if they so chose. 
Signed-off-by: Tristan Partin --- compute_tools/src/config.rs | 6 ++ libs/pageserver_api/src/config.rs | 3 + libs/utils/scripts/restore_from_wal.sh | 39 ++++++++++++- pageserver/src/config.rs | 3 + pageserver/src/tenant.rs | 28 ++++++++-- test_runner/regress/test_compute_locales.py | 61 +++++++++++++++++++++ test_runner/regress/test_wal_restore.py | 2 + 7 files changed, 136 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_compute_locales.py diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 479100eb89..50e2a95e9d 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -73,6 +73,12 @@ pub fn write_postgres_conf( )?; } + // Locales + writeln!(file, "lc_messages='C.UTF-8'")?; + writeln!(file, "lc_monetary='C.UTF-8'")?; + writeln!(file, "lc_time='C.UTF-8'")?; + writeln!(file, "lc_numeric='C.UTF-8'")?; + match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 6de34fdd35..4272181954 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -64,6 +64,7 @@ pub struct ConfigToml { #[serde(with = "humantime_serde")] pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, pub pg_distrib_dir: Option, @@ -276,6 +277,7 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + pub const DEFAULT_LOCALE: &str = "C.UTF-8"; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -326,6 +328,7 @@ impl Default for ConfigToml { wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) .expect("cannot parse default wal redo timeout")), superuser: (DEFAULT_SUPERUSER.to_string()), + locale: DEFAULT_LOCALE.to_string(), page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir() diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 316ec8ed0d..93448369a0 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euxo pipefail @@ -6,9 +6,44 @@ PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 +PG_VERSION=$5 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) + +# The way that initdb is invoked must match how the pageserver runs initdb. 
+function initdb_with_args { + local cmd=( + "$PG_BIN"/initdb + -E utf8 + -U cloud_admin + -D "$DATA_DIR" + --locale 'C.UTF-8' + --lc-collate 'C.UTF-8' + --lc-ctype 'C.UTF-8' + --lc-messages 'C.UTF-8' + --lc-monetary 'C.UTF-8' + --lc-numeric 'C.UTF-8' + --lc-time 'C.UTF-8' + --sysid="$SYSID" + ) + + case "$PG_VERSION" in + 14) + # Postgres 14 and below didn't support --locale-provider + ;; + 15 | 16) + cmd+=(--locale-provider 'libc') + ;; + *) + # Postgres 17 added the builtin provider + cmd+=(--locale-provider 'builtin') + ;; + esac + + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" +} + rm -fr "$DATA_DIR" -env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" +initdb_with_args echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index d62066ac22..b694a43599 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -69,6 +69,7 @@ pub struct PageServerConf { pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, @@ -301,6 +302,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, pg_distrib_dir, @@ -348,6 +350,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, http_auth_type, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d45c99a41b..34ea6dae1f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4779,10 +4779,18 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; - let initdb_command = tokio::process::Command::new(&initdb_bin_path) + let mut initdb_command = tokio::process::Command::new(&initdb_bin_path); + initdb_command .args(["--pgdata", initdb_target_dir.as_ref()]) .args(["--username", &conf.superuser]) .args(["--encoding", "utf8"]) + .args(["--locale", &conf.locale]) + .args(["--lc-collate", &conf.locale]) + .args(["--lc-ctype", &conf.locale]) + .args(["--lc-messages", &conf.locale]) + .args(["--lc-monetary", &conf.locale]) + .args(["--lc-numeric", &conf.locale]) + .args(["--lc-time", &conf.locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() @@ -4792,15 +4800,27 @@ async fn run_initdb( // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) // we would be interested in the stderr output, if there was any - .stderr(std::process::Stdio::piped()) - .spawn()?; + .stderr(std::process::Stdio::piped()); + + // Before version 14, only the libc provide was available. + if pg_version > 14 { + // Version 17 brought with it a builtin locale provider which only provides + // C and C.UTF-8. While being safer for collation purposes since it is + // guaranteed to be consistent throughout a major release, it is also more + // performant. + let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + + initdb_command.args(["--locale-provider", locale_provider]); + } + + let initdb_proc = initdb_command.spawn()?; // Ideally we'd select here with the cancellation token, but the problem is that // we can't safely terminate initdb: it launches processes of its own, and killing // initdb doesn't kill them. 
After we return from this function, we want the target // directory to be able to be cleaned up. // See https://github.com/neondatabase/neon/issues/6385 - let initdb_output = initdb_command.wait_with_output().await?; + let initdb_output = initdb_proc.wait_with_output().await?; if !initdb_output.status.success() { return Err(InitdbError::Failed( initdb_output.status, diff --git a/test_runner/regress/test_compute_locales.py b/test_runner/regress/test_compute_locales.py new file mode 100644 index 0000000000..00ef32fb5e --- /dev/null +++ b/test_runner/regress/test_compute_locales.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from collections.abc import Sequence + + from fixtures.neon_fixtures import NeonEnv + + +def test_default_locales(neon_simple_env: NeonEnv): + """ + Test that the default locales for compute databases is C.UTF-8. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + domain_locales = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT current_setting('lc_messages') AS lc_messages," + + "current_setting('lc_monetary') AS lc_monetary," + + "current_setting('lc_numeric') AS lc_numeric," + + "current_setting('lc_time') AS lc_time" + )[0], + ) + for dl in domain_locales: + assert dl == "C.UTF-8" + + # Postgres 15 added the locale providers + if env.pg_version < PgVersion.V15: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + + datcollate = results[0] + datctype = results[1] + else: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datlocprovider, datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + datlocprovider = results[0] + datcollate = results[1] + datctype = results[2] + + if env.pg_version >= PgVersion.V17: + assert datlocprovider == "b", "The locale provider is not builtin" + else: + assert datlocprovider == "c", "The locale provider is not libc" + + assert datcollate == "C.UTF-8" + assert datctype == "C.UTF-8" diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 05b6ad8a9b..c8e51fde13 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -64,6 +64,7 @@ def test_wal_restore( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start() @@ -127,6 +128,7 @@ def test_wal_restore_initdb( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start() From ecca62a45dfa0be134c41c113adc8e2519b827af Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:44:00 -0500 Subject: [PATCH 10/28] feat(pageserver): more log lines around frozen layers (#9697) We saw pageserver OOMs https://github.com/neondatabase/cloud/issues/19715 for tenants doing large writes. Add log lines around in-memory layers to hopefully collect some info during my on-call shift next week. ## Summary of changes * Estimate in-memory size of an in-mem layer. * Print frozen layer number if there are too many layers accumulated in memory. 
--------- Signed-off-by: Alex Chi Z --- .../tenant/storage_layer/inmemory_layer.rs | 13 ++++++++++++ pageserver/src/tenant/timeline.rs | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2ce26ed2eb..af6112d535 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -67,6 +67,8 @@ pub struct InMemoryLayer { /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, + + estimated_in_mem_size: AtomicU64, } impl std::fmt::Debug for InMemoryLayer { @@ -543,6 +545,10 @@ impl InMemoryLayer { Ok(inner.file.len()) } + pub fn estimated_in_mem_size(&self) -> u64 { + self.estimated_in_mem_size.load(AtomicOrdering::Relaxed) + } + /// Create a new, empty, in-memory layer pub async fn create( conf: &'static PageServerConf, @@ -572,6 +578,7 @@ impl InMemoryLayer { file, resource_units: GlobalResourceUnits::new(), }), + estimated_in_mem_size: AtomicU64::new(0), }) } @@ -642,6 +649,12 @@ impl InMemoryLayer { // because this case is unexpected, and we would like tests to fail if this happens. warn!("Key {} at {} written twice at same LSN", key, lsn); } + self.estimated_in_mem_size.fetch_add( + (std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::()) as u64, + AtomicOrdering::Relaxed, + ); } inner.resource_units.maybe_publish_size(new_size); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4d086df2d1..60cc689c5e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,6 +23,7 @@ use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::{ + config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, @@ -3501,18 +3502,37 @@ impl Timeline { let timer = self.metrics.flush_time_histo.start_timer(); + let num_frozen_layers; + let frozen_layer_total_size; let layer_to_flush = { let guard = self.layers.read().await; let Ok(lm) = guard.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; }; + num_frozen_layers = lm.frozen_layers.len(); + frozen_layer_total_size = lm + .frozen_layers + .iter() + .map(|l| l.estimated_in_mem_size()) + .sum::(); lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { break Ok(()); }; + if num_frozen_layers + > std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) + && frozen_layer_total_size >= /* 64 MB */ 64000000 + { + tracing::warn!( + "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", + ); + } match self.flush_frozen_layer(layer_to_flush, ctx).await { Ok(this_layer_to_lsn) => { flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); From ab47804d000addd668e2583275bfdeb8209502e4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 8 Nov 2024 20:25:31 +0100 Subject: [PATCH 11/28] safekeeper: remove unused `WriteGuardSharedState::skip_update` (#9699) --- safekeeper/src/timeline.rs | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/safekeeper/src/timeline.rs 
b/safekeeper/src/timeline.rs index fa91241177..85add6bfea 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -108,16 +108,11 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; pub struct WriteGuardSharedState<'a> { tli: Arc, guard: RwLockWriteGuard<'a, SharedState>, - skip_update: bool, } impl<'a> WriteGuardSharedState<'a> { fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { - WriteGuardSharedState { - tli, - guard, - skip_update: false, - } + WriteGuardSharedState { tli, guard } } } @@ -159,12 +154,10 @@ impl Drop for WriteGuardSharedState<'_> { } }); - if !self.skip_update { - // send notification about shared state update - self.tli.shared_state_version_tx.send_modify(|old| { - *old += 1; - }); - } + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); } } From af8238ae52aaf81cb02fdc246f9a7914538ded7d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:28:55 -0500 Subject: [PATCH 12/28] fix(pageserver): drain upload queue before offloading timeline (#9682) It is possible at the point we shutdown the timeline, there are still layer files we did not upload. ## Summary of changes * If the queue is not empty, avoid offloading. * Shutdown the timeline gracefully using the flush mode to ensure all local files are uploaded before deleting the timeline directory. --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 4 ++-- pageserver/src/tenant.rs | 3 ++- pageserver/src/tenant/timeline.rs | 10 +++++++--- pageserver/src/tenant/timeline/offload.rs | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d57bd98e95..dde9c5dd0b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2002,9 +2002,9 @@ async fn timeline_offload_handler( "timeline has attached children".into(), )); } - if !timeline.can_offload() { + if let (false, reason) = timeline.can_offload() { return Err(ApiError::PreconditionFailed( - "Timeline::can_offload() returned false".into(), + format!("Timeline::can_offload() check failed: {}", reason) .into(), )); } offload_timeline(&tenant, &timeline) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 34ea6dae1f..903174680e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2493,7 +2493,8 @@ impl Tenant { timelines_to_compact_or_offload = timelines .iter() .filter_map(|(timeline_id, timeline)| { - let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload()); + let (is_active, (can_offload, _)) = + (timeline.is_active(), timeline.can_offload()); let has_no_unoffloaded_children = { !timelines .iter() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 60cc689c5e..56faacbaee 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1570,12 +1570,16 @@ impl Timeline { /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. 
- pub(crate) fn can_offload(&self) -> bool { + pub(crate) fn can_offload(&self) -> (bool, &'static str) { if self.remote_client.is_archived() != Some(true) { - return false; + return (false, "the timeline is not archived"); + } + if !self.remote_client.no_pending_work() { + // if the remote client is still processing some work, we can't offload + return (false, "the upload queue is not drained yet"); } - true + (true, "ok") } /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 2dc461c28d..1394843467 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline( } // Now that the Timeline is in Stopping state, request all the related tasks to shut down. - timeline.shutdown(super::ShutdownMode::Hard).await; + timeline.shutdown(super::ShutdownMode::Flush).await; // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress From ecde8d763257703f143e3fd74c024fc73ff9f13f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 8 Nov 2024 14:43:15 -0600 Subject: [PATCH 13/28] Improve type safety according to pyright Pyright found many issues that mypy doesn't seem to want to catch or mypy isn't configured to catch. Signed-off-by: Tristan Partin --- test_runner/fixtures/benchmark_fixture.py | 6 ++++ test_runner/fixtures/compare_fixtures.py | 9 +++-- test_runner/fixtures/h2server.py | 36 +++++++++++++------ test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/fixtures/pageserver/http.py | 2 +- test_runner/fixtures/pageserver/utils.py | 4 +++ test_runner/fixtures/paths.py | 4 +-- test_runner/performance/test_copy.py | 5 ++- .../regress/test_pageserver_generations.py | 1 + test_runner/regress/test_proxy_websockets.py | 2 +- test_runner/regress/test_sharding.py | 1 + .../regress/test_storage_controller.py | 2 ++ test_runner/regress/test_storage_scrubber.py | 7 +++- test_runner/regress/test_tenant_size.py | 3 +- .../regress/test_threshold_based_eviction.py | 1 + test_runner/regress/test_wal_acceptor.py | 13 +++---- 16 files changed, 67 insertions(+), 31 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 74fe39ef53..d3419bd8b1 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -80,7 +80,13 @@ class PgBenchRunResult: ): stdout_lines = stdout.splitlines() + number_of_clients = 0 + number_of_threads = 0 + number_of_transactions_actually_processed = 0 + latency_average = 0.0 latency_stddev = None + tps = 0.0 + scale = 0 # we know significant parts of these values from test input # but to be precise take them from output diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 2195ae8225..85b6e7a3b8 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -8,7 +8,7 @@ from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, final import pytest from _pytest.fixtures import FixtureRequest @@ -70,12 +70,12 @@ class PgCompare(ABC): @contextmanager @abstractmethod - def record_pageserver_writes(self, out_name: str): + def record_pageserver_writes(self, out_name: str) -> 
Iterator[None]: pass @contextmanager @abstractmethod - def record_duration(self, out_name: str): + def record_duration(self, out_name: str) -> Iterator[None]: pass @contextmanager @@ -105,6 +105,7 @@ class PgCompare(ABC): return results +@final class NeonCompare(PgCompare): """PgCompare interface for the neon stack.""" @@ -206,6 +207,7 @@ class NeonCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) +@final class VanillaCompare(PgCompare): """PgCompare interface for vanilla postgres.""" @@ -271,6 +273,7 @@ class VanillaCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) +@final class RemoteCompare(PgCompare): """PgCompare interface for a remote postgres instance.""" diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py index 92783e1fb2..e890b2bcf1 100644 --- a/test_runner/fixtures/h2server.py +++ b/test_runner/fixtures/h2server.py @@ -4,11 +4,14 @@ https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html auth-broker -> local-proxy needs a h2 connection, so we need a h2 server :) """ +from __future__ import annotations + import asyncio import collections import io import json from collections.abc import AsyncIterable +from typing import TYPE_CHECKING, final import pytest_asyncio from h2.config import H2Configuration @@ -25,34 +28,45 @@ from h2.events import ( ) from h2.exceptions import ProtocolError, StreamClosedError from h2.settings import SettingCodes +from typing_extensions import override + +if TYPE_CHECKING: + from typing import Any, Optional + RequestData = collections.namedtuple("RequestData", ["headers", "data"]) +@final class H2Server: - def __init__(self, host, port) -> None: + def __init__(self, host: str, port: int) -> None: self.host = host self.port = port +@final class H2Protocol(asyncio.Protocol): def __init__(self): config = H2Configuration(client_side=False, header_encoding="utf-8") self.conn = H2Connection(config=config) - self.transport = None - self.stream_data = {} - self.flow_control_futures = {} + self.transport: Optional[asyncio.Transport] = None + self.stream_data: dict[int, RequestData] = {} + self.flow_control_futures: dict[int, asyncio.Future[Any]] = {} - def connection_made(self, transport: asyncio.Transport): # type: ignore[override] + @override + def connection_made(self, transport: asyncio.BaseTransport): + assert isinstance(transport, asyncio.Transport) self.transport = transport self.conn.initiate_connection() self.transport.write(self.conn.data_to_send()) - def connection_lost(self, _exc): + @override + def connection_lost(self, exc: Optional[Exception]): for future in self.flow_control_futures.values(): future.cancel() self.flow_control_futures = {} + @override def data_received(self, data: bytes): assert self.transport is not None try: @@ -77,7 +91,7 @@ class H2Protocol(asyncio.Protocol): self.window_updated(event.stream_id, event.delta) elif isinstance(event, RemoteSettingsChanged): if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings: - self.window_updated(None, 0) + self.window_updated(0, 0) self.transport.write(self.conn.data_to_send()) @@ -123,7 +137,7 @@ class H2Protocol(asyncio.Protocol): else: stream_data.data.write(data) - def stream_reset(self, stream_id): + def stream_reset(self, stream_id: int): """ A stream reset was sent. Stop sending data. 
""" @@ -131,7 +145,7 @@ class H2Protocol(asyncio.Protocol): future = self.flow_control_futures.pop(stream_id) future.cancel() - async def send_data(self, data, stream_id): + async def send_data(self, data: bytes, stream_id: int): """ Send data according to the flow control rules. """ @@ -161,7 +175,7 @@ class H2Protocol(asyncio.Protocol): self.transport.write(self.conn.data_to_send()) data = data[chunk_size:] - async def wait_for_flow_control(self, stream_id): + async def wait_for_flow_control(self, stream_id: int): """ Waits for a Future that fires when the flow control window is opened. """ @@ -169,7 +183,7 @@ class H2Protocol(asyncio.Protocol): self.flow_control_futures[stream_id] = f await f - def window_updated(self, stream_id, delta): + def window_updated(self, stream_id: int, delta): """ A window update frame was received. Unblock some number of flow control Futures. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 83c68794c3..79baa8a32d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1857,7 +1857,7 @@ class NeonStorageController(MetricsGetter, LogUtils): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[dict[Any, Any]] = None, - placement_policy: Optional[Union[dict[Any, Any] | str]] = None, + placement_policy: Optional[Union[dict[Any, Any], str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 57a5d6875e..d1a9b5921a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -316,7 +316,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_location_conf( self, tenant_id: Union[TenantId, TenantShardId], - location_conf=dict[str, Any], + location_conf: dict[str, Any], flush_ms=None, lazy: Optional[bool] = None, ): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 4c4306be9e..ac7497ee6c 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -56,6 +56,8 @@ def wait_for_upload( lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" + + current_lsn = Lsn(0) for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: @@ -203,6 +205,8 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" + + current_lsn = Lsn(0) for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index d950f2356d..60221573eb 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -112,7 +112,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_neon_binpath() -> Optional[Iterator[Path]]: +def compatibility_neon_binpath() -> Iterator[Optional[Path]]: if os.getenv("REMOTE_ENV"): return comp_binpath = None @@ -133,7 +133,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_pg_distrib_dir() -> Optional[Iterator[Path]]: +def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]: compat_distrib_dir = None if env_compat_postgres_bin := 
os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): compat_distrib_dir = Path(env_compat_postgres_bin).resolve() diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index 743604a381..d571fab6b5 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -2,11 +2,13 @@ from __future__ import annotations from contextlib import closing from io import BufferedReader, RawIOBase -from typing import Optional +from typing import Optional, final from fixtures.compare_fixtures import PgCompare +from typing_extensions import override +@final class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows @@ -14,6 +16,7 @@ class CopyTestData(RawIOBase): self.linebuf: Optional[bytes] = None self.ptr = 0 + @override def readable(self): return True diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 11ebb81023..8f6c9f16fd 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -656,6 +656,7 @@ def test_upgrade_generationless_local_file_paths( workload.write_rows(1000) attached_pageserver = env.get_tenant_pageserver(tenant_id) + assert attached_pageserver is not None secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[ 0 ] diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index 071ca7c54e..ea01252ce4 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -37,7 +37,7 @@ async def test_websockets(static_proxy: NeonProxy): startup_message.extend(b"\0") length = (4 + len(startup_message)).to_bytes(4, byteorder="big") - await websocket.send([length, startup_message]) + await websocket.send([length, bytes(startup_message)]) startup_response = await websocket.recv() assert isinstance(startup_response, bytes) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 3a249bbdb4..ec633e352c 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -256,6 +256,7 @@ def test_sharding_split_compaction( # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes detail_before = ps.http_client().timeline_detail(shard, timeline_id) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index c8de292588..a069e0d01c 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1237,6 +1237,7 @@ def test_storage_controller_tenant_deletion( # Assert attachments all have local content for shard_id in shard_ids: pageserver = env.get_tenant_pageserver(shard_id) + assert pageserver is not None assert pageserver.tenant_dir(shard_id).exists() # Assert all shards have some content in remote storage @@ -2745,6 +2746,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB # Upload but don't compact origin_pageserver = env.get_tenant_pageserver(tenant_id) + assert origin_pageserver is not None dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] origin_pageserver.http_client().timeline_checkpoint( 
tenant_id, timeline_id, wait_until_uploaded=True, compact=False diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 05db0fe977..11ad2173ae 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -245,6 +245,7 @@ def test_scrubber_physical_gc_ancestors( workload.write_rows(100, upload=False) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None log.info(f"Waiting for shard {shard} on pageserver {ps.id}") ps.http_client().timeline_checkpoint( shard, timeline_id, compact=False, wait_until_uploaded=True @@ -270,6 +271,7 @@ def test_scrubber_physical_gc_ancestors( workload.churn_rows(100) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) ps.http_client().timeline_gc(shard, timeline_id, 0) @@ -336,12 +338,15 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder # Issue a deletion queue flush so that the parent shard can't leave behind layers # that will look like unexpected garbage to the scrubber - env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True) + ps = env.get_tenant_pageserver(tenant_id) + assert ps is not None + ps.http_client().deletion_queue_flush(execute=True) new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None log.info(f"Waiting for shard {shard} on pageserver {ps.id}") ps.http_client().timeline_checkpoint( shard, timeline_id, compact=False, wait_until_uploaded=True diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index b41f1709bd..0c431fa453 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -315,6 +315,7 @@ def test_single_branch_get_tenant_size_grows( tenant_id: TenantId, timeline_id: TimelineId, ) -> tuple[Lsn, int]: + size = 0 consistent = False size_debug = None @@ -360,7 +361,7 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 - + prev_size = 0 for i in range(3): with endpoint.cursor() as cur: cur.execute( diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f211ec4d4..68e9385035 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -146,6 +146,7 @@ def test_threshold_based_eviction( out += [f" {remote} {layer.layer_file_name}"] return "\n".join(out) + stable_for: float = 0 observation_window = 8 * eviction_threshold consider_stable_when_no_change_for_seconds = 3 * eviction_threshold poll_interval = eviction_threshold / 3 diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 157390c01c..e224d5eb01 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1506,15 +1506,10 @@ class SafekeeperEnv: port=port.http, auth_token=None, ) - try: - safekeeper_process = start_in_background( - cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status - ) - return safekeeper_process - except Exception as e: - log.error(e) - safekeeper_process.kill() - raise Exception(f"Failed to start safekepeer as {cmd}, 
reason: {e}") from e + safekeeper_process = start_in_background( + cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status + ) + return safekeeper_process def get_safekeeper_connstrs(self): assert self.safekeepers is not None, "safekeepers are not initialized" From 2fcac0e66b1e4c5b6fb7adb01793c49850191f93 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 9 Nov 2024 01:02:54 +0000 Subject: [PATCH 14/28] CI(pre-merge-checks): add required checks (#9700) ## Problem The Merge queue doesn't work because it expects certain jobs, which we don't have in the `pre-merge-checks` workflow. But it turns out we can just create jobs/checks with the same names in any workflow that we run. ## Summary of changes - Add `conclusion` jobs - Create `neon-cloud-e2e` status check - Add a bunch of `if`s to handle cases with no relevant changes found and prepare the workflow to run rust checks in the future - List the workflow in `report-workflow-stats` to collect stats about it --- .github/workflows/pre-merge-checks.yml | 61 ++++++++++++++++++--- .github/workflows/report-workflow-stats.yml | 1 + 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 40ce644eb6..137faa7abc 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -1,10 +1,14 @@ -name: +name: Pre-merge checks on: merge_group: branches: - main +defaults: + run: + shell: bash -euxo pipefail {0} + # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -12,11 +16,11 @@ jobs: get-changed-files: runs-on: ubuntu-22.04 outputs: - any_changed: ${{ steps.src.outputs.any_changed }} + python-changed: ${{ steps.python-src.outputs.any_changed }} steps: - uses: actions/checkout@v4 - - uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c # v45.0.3 - id: src + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: python-src with: files: | .github/workflows/pre-merge-checks.yml @@ -26,10 +30,13 @@ jobs: - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES env: - ALL_CHANGED_FILES: ${{ steps.src.outputs.all_changed_files }} - run: echo "${ALL_CHANGED_FILES}" + PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + run: | + echo "${PYTHON_CHANGED_FILES}" check-build-tools-image: + if: needs.get-changed-files.outputs.python-changed == 'true' + needs: [ get-changed-files ] uses: ./.github/workflows/check-build-tools-image.yml build-build-tools-image: @@ -40,8 +47,48 @@ jobs: secrets: inherit check-codestyle-python: - needs: [ build-build-tools-image ] + if: needs.get-changed-files.outputs.python-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] uses: ./.github/workflows/_check-codestyle-python.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit + + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". 
+ # Currently we require 2 jobs (checks with exact name): + # - conclusion + # - neon-cloud-e2e + conclusion: + if: always() + permissions: + statuses: write # for `github.repos.createCommitStatus(...)` + needs: + - get-changed-files + - check-codestyle-python + runs-on: ubuntu-22.04 + steps: + - name: Create fake `neon-cloud-e2e` check + uses: actions/github-script@v7 + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const { repo, owner } = context.repo; + const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`; + + await github.rest.repos.createCommitStatus({ + owner: owner, + repo: repo, + sha: context.sha, + context: `neon-cloud-e2e`, + state: `success`, + target_url: targetUrl, + description: `fake check for merge queue`, + }); + + - name: Fail the job if any of the dependencies do not succeed or skipped + run: exit 1 + if: | + (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + || contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml index 6abeff7695..0d135a257c 100644 --- a/.github/workflows/report-workflow-stats.yml +++ b/.github/workflows/report-workflow-stats.yml @@ -23,6 +23,7 @@ on: - Test Postgres client libraries - Trigger E2E Tests - cleanup caches by a branch + - Pre-merge checks types: [completed] jobs: From ceaa80ffebca3050e06c6a5d75f184c6e637ef50 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 11 Nov 2024 09:58:41 +0000 Subject: [PATCH 15/28] storcon: add peer token for peer to peer communication (#9695) ## Problem We wish to stop using admin tokens in the infra repo, but step down requests use the admin token. ## Summary of Changes Introduce a new "ControllerPeer" scope and use it for step-down requests. --- libs/utils/src/auth.rs | 5 +++++ pageserver/src/auth.rs | 3 ++- safekeeper/src/auth.rs | 3 ++- storage_controller/src/http.rs | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 5bd6f4bedc..f7acc61ac1 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -40,6 +40,11 @@ pub enum Scope { /// Allows access to storage controller APIs used by the scrubber, to interrogate the state /// of a tenant & post scrub results. Scrubber, + + /// This scope is used for communication with other storage controller instances. + /// At the time of writing, this is only used for the step down request. + #[serde(rename = "controller_peer")] + ControllerPeer, } /// JWT payload. 
See docs/authentication.md for the format diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 5c931fcfdb..4075427ab4 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -19,7 +19,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Infra - | Scope::Scrubber, + | Scope::Scrubber + | Scope::ControllerPeer, _, ) => Err(AuthError( format!( diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index fdd0830b02..81c79fae30 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -20,7 +20,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::PageServerApi | Scope::GenerationsApi | Scope::Infra - | Scope::Scrubber, + | Scope::Scrubber + | Scope::ControllerPeer, _, ) => Err(AuthError( format!( diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index f6ea1aedc6..9b5d4caf31 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1033,7 +1033,7 @@ async fn handle_update_preferred_azs(req: Request) -> Result) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::ControllerPeer)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { From f510647c7e97432adf31b301cb596e76a2213077 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 11 Nov 2024 12:42:32 +0000 Subject: [PATCH 16/28] CI: retry `actions/github-script` for 5XX errors (#9703) ## Problem GitHub API can return error 500, and it fails jobs that use `actions/github-script` action. ## Summary of changes - Add `retry: 500` to all `actions/github-script` usage --- .github/actions/allure-report-generate/action.yml | 2 ++ .github/workflows/build_and_test.yml | 2 ++ .github/workflows/neon_extra_builds.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 2bdb727719..16b6e71498 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -221,6 +221,8 @@ runs: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL, COMMIT_SHA } = process.env diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d415e20db8..cc6f91d28e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -497,6 +497,8 @@ jobs: REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL_NEW, COMMIT_SHA } = process.env diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 287c9ea281..cd5a665402 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -201,6 +201,8 @@ jobs: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL, SHA } = 
process.env From 48c06d9f7b7a87fe7cd97bc83b5300f38bf8011e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:13:46 -0500 Subject: [PATCH 17/28] fix(pageserver): increase frozen layer warning threshold; ignore in tests (#9705) Perf benchmarks produce a lot of layers. ## Summary of changes Bumping the threshold and ignore the warning. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 2 +- test_runner/fixtures/pageserver/allowed_errors.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 56faacbaee..09ddb19765 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3531,7 +3531,7 @@ impl Timeline { self.get_compaction_threshold(), DEFAULT_COMPACTION_THRESHOLD, ) - && frozen_layer_total_size >= /* 64 MB */ 64000000 + && frozen_layer_total_size >= /* 128 MB */ 128000000 { tracing::warn!( "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index fa85563e35..d05704c8e0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -93,6 +93,8 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", + # Too many frozen layers error is normal during intensive benchmarks + ".*too many frozen layers.*", ) From 54a16766803046a691141d3f11778d70df1c3fda Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:19:03 -0500 Subject: [PATCH 18/28] rfc: update aux file rfc to reflect latest optimizations (#9681) Reflects https://github.com/neondatabase/neon/pull/9631 in the RFC. Signed-off-by: Alex Chi Z --- docs/rfcs/038-aux-file-v2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/rfcs/038-aux-file-v2.md b/docs/rfcs/038-aux-file-v2.md index 9c3c336008..dc8c5d8fc4 100644 --- a/docs/rfcs/038-aux-file-v2.md +++ b/docs/rfcs/038-aux-file-v2.md @@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace. There are two places we need to read the aux files from the pageserver: * On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that. -* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. +* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. 
Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs large overhead for tracking keyspaces that have not been read. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `ummapped_keyspace`. ## Compaction and Image Layer Generation From f63de5f5274ff86a478bfc8a1a00450d896d5ca6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 11 Nov 2024 17:55:50 +0100 Subject: [PATCH 19/28] safekeeper: add `initialize_segment` variant of `safekeeper_wal_storage_operation_seconds` (#9691) ## Problem We don't have a metric capturing the latency of segment initialization. This can be significant due to fsyncs. ## Summary of changes Add an `initialize_segment` variant of `safekeeper_wal_storage_operation_seconds`. --- safekeeper/src/metrics.rs | 2 +- safekeeper/src/wal_storage.rs | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index bb56e923f8..bbd2f86898 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -55,7 +55,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", - "Seconds spent syncing WAL to a disk", + "Seconds spent syncing WAL to a disk (excluding segment initialization)", DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 4e67940c51..11f372bceb 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -257,6 +257,9 @@ impl PhysicalStorage { // Try to open existing partial file Ok((file, true)) } else { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_segment"]) + .start_timer(); // Create and fill new partial file // // We're using fdatasync during WAL writing, so file size must not @@ -274,8 +277,6 @@ impl PhysicalStorage { }); file.set_len(self.wal_seg_size as u64).await?; - // Note: this doesn't get into observe_flush_seconds metric. But - // segment init should be separate metric, if any. if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. From 1aab34715a699e8532c49caa2bf1010e64f09a71 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 11 Nov 2024 17:01:02 +0000 Subject: [PATCH 20/28] Remove checklist from the PR template (#9702) ## Problem Once we enable the merge queue for the `main` branch, it won't be possible to adjust the commit message right after pressing the "Squash and merge" button and the PR title + description will be used as is. To avoid extra noise in the commits in the `main` with the checklist leftovers, I propose removing the checklist from the PR template and keeping only the Problem / Summary of changes. ## Summary of changes - Remove the checklist from the PR template --- .github/pull_request_template.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 22c025dd89..89328f20ee 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,14 +1,3 @@ ## Problem ## Summary of changes - -## Checklist before requesting a review - -- [ ] I have performed a self-review of my code. 
-- [ ] If it is a core feature, I have added thorough tests. -- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? -- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. - -## Checklist before merging - -- [ ] Do not forget to reformat commit message to not include the above checklist From 8db84d99643b1c668c935a68610be59e8326ba63 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 11 Nov 2024 18:51:15 +0100 Subject: [PATCH 21/28] new ingest benchmark (#9711) ## Problem We have no specific benchmark testing project migration of postgresql project with existing data into Neon. Typical steps of such a project migration are - schema creation in the neon project - initial COPY of relations - creation of indexes and constraints - vacuum analyze ## Summary of changes Add a periodic benchmark running 9 AM UTC every day. In each run: - copy a 200 GiB project that has realistic schema, data, tables, indexes and constraints from another project into - a new Neon project (7 CU fixed) - an existing tenant, (but new branch and new database) that already has 4 TiB of data - use pgcopydb tool to automate all steps and parallelize COPY and index creation - parse pgcopydb output and report performance metrics in Neon performance test database ## Logs This benchmark has been tested first manually and then as part of benchmarking.yml workflow, example run see https://github.com/neondatabase/neon/actions/runs/11757679870 --- .github/actionlint.yml | 1 + .github/workflows/ingest_benchmark.yml | 372 +++++++++++++++++++++++++ 2 files changed, 373 insertions(+) create mode 100644 .github/workflows/ingest_benchmark.yml diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1b602883c5..29c4d18f4a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -20,3 +20,4 @@ config-variables: - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID - DEV_AWS_OIDC_ROLE_ARN + - BENCHMARK_INGEST_TARGET_PROJECTID diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml new file mode 100644 index 0000000000..d770bb2bb5 --- /dev/null +++ b/.github/workflows/ingest_benchmark.yml @@ -0,0 +1,372 @@ +name: Benchmarking + +on: + # uncomment to run on push for debugging your PR + # push: + # branches: [ your branch ] + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 9 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: ingest-bench-workflow + cancel-in-progress: true + +jobs: + ingest: + strategy: + matrix: + target_project: [new_empty_project, large_existing_project] + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config + PSQL: /tmp/neon/pg_install/v16/bin/psql + PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib + PGCOPYDB: /pgcopydb/bin/pgcopydb + PGCOPYDB_LIB_PATH: /pgcopydb/lib + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 
neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 1440 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + if: ${{ matrix.target_project == 'new_empty_project' }} + id: create-neon-project-ingest-target + uses: ./.github/actions/neon-project-create + with: + region_id: aws-us-east-2 + postgres_version: 16 + compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Initialize Neon project and retrieve current backpressure seconds + if: ${{ matrix.target_project == 'new_empty_project' }} + env: + NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} + NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} + run: | + echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV + echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target_project == 'large_existing_project' }} + id: create-neon-branch-ingest-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Initialize Neon project and retrieve current backpressure seconds + if: ${{ matrix.target_project == 'large_existing_project' }} + env: + NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} + NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} + run: | + echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + # Extract the part before the database name + base_connstr="${NEW_PROJECT_CONNSTR%/*}" + # Extract the query parameters (if any) after the database name + query_params="${NEW_PROJECT_CONNSTR#*\?}" + # Reconstruct the new connection string + if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then + new_connstr="${base_connstr}/neondb?${query_params}" + else + new_connstr="${base_connstr}/neondb" + fi + ${PSQL} "${new_connstr}" -c "drop database ludicrous;" + ${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;" + if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then + NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}" + else + NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous" + fi + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} 
"${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV + echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV + + + - name: Create pgcopydb filter file + run: | + cat << EOF > /tmp/pgcopydb_filter.txt + [include-only-table] + public.events + public.emails + public.email_transmissions + public.payments + public.editions + public.edition_modules + public.sp_content + public.email_broadcasts + public.user_collections + public.devices + public.user_accounts + public.lessons + public.lesson_users + public.payment_methods + public.orders + public.course_emails + public.modules + public.users + public.module_users + public.courses + public.payment_gateway_keys + public.accounts + public.roles + public.payment_gateways + public.management + public.event_names + EOF + + - name: Invoke pgcopydb + env: + BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} + run: | + export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH} + export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}" + export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}" + export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" + ${PG_CONFIG} --bindir + ${PGCOPYDB} --version + ${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \ + --index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \ + --use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log + + # create dummy pgcopydb log to test parsing + # - name: create dummy log for parser test + # run: | + # cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log + # 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb" + # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" + # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" + # 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb" + # 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database + # 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences + # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB + # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID + # 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid. + # 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id. 
+ # 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id. + # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id. + # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk + # 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints) + # 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences + # 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions + # 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database + # 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data) + # 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' + # 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database + # 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump + # 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes + # 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum + # 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes + # 2024-11-04 18:01:07.457 500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes + # 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found. 
+ # 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values + # 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database + # 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database + # 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump + # 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All step are now done, 16h49m elapsed + # 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes + + # OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration + # ------+--------+----------------------+-------+---------------+-------------------+---------+---------------------- + # 24654 | public | events | 10 | 1d11h | 878 GB | 1 | 1h41m + # 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m + # 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s + # 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s + # 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s + # 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s + # 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s + # 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s + # 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s + # 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s + # 24765 | public | user_collections | 1 | 2m17s | 1005 MB | 2 | 968ms + # 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s + # 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s + # 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749 + # 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms + # 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696 + # 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032 + # 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms + # 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms + # 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms + # 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms + # 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms + # 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms + # 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms + # 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms + # 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms + + + # Step Connection Duration Transfer Concurrency + # -------------------------------------------------- ---------- ---------- ---------- ------------ + # Catalog Queries (table ordering, filtering, etc) source 12s 1 + # Dump Schema source 765ms 1 + # Prepare Schema target 466ms 1 + # COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12 + # COPY (cumulative) both 7h46m 1225 GB 4 + # CREATE INDEX (cumulative) target 4h36m 4 + # CONSTRAINTS (cumulative) target 8s493 4 + # VACUUM (cumulative) target 0ms 4 + # Reset Sequences both 60ms 1 + # Large Objects (cumulative) (null) 0ms 0 + # Finalize Schema both 14h01m 4 + # -------------------------------------------------- ---------- ---------- ---------- ------------ + 
# Total Wall Clock Duration both 16h49m 20 + + + # EOF + + + - name: show tables sizes and retrieve current backpressure seconds + run: | + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+" + BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV + + - name: Parse pgcopydb log and report performance metrics + env: + PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }} + run: | + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + + # Define the log file path + LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log" + + # Get the current git commit hash + git config --global --add safe.directory /__w/neon/neon + COMMIT_HASH=$(git rev-parse --short HEAD) + + # Define the platform and test suite + PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging" + SUIT="pgcopydb_ingest_bench" + + # Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds + convert_to_seconds() { + local duration=$1 + local total_seconds=0 + + # Check for hours (h) + if [[ "$duration" =~ ([0-9]+)h ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600)) + fi + + # Check for seconds (s) + if [[ "$duration" =~ ([0-9]+)s ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0})) + fi + + # Check for milliseconds (ms) (if applicable) + if [[ "$duration" =~ ([0-9]+)ms ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000)) + duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m + fi + + # Check for minutes (m) - must be checked after ms because m is contained in ms + if [[ "$duration" =~ ([0-9]+)m ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60)) + fi + + echo $total_seconds + } + + # Calculate the backpressure difference in seconds + BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}") + + # Insert the backpressure time difference into the performance database + if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then + PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \" + INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) + VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now()); + \"" + echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds" + eval $PSQL_CMD + fi + + # Extract and process log lines + while IFS= read -r line; do + METRIC_NAME="" + # Match each desired line and extract the relevant information + if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then + METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)" + elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then + METRIC_NAME="COPY (cumulative)" + elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then + METRIC_NAME="CREATE INDEX (cumulative)" + elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then + METRIC_NAME="CONSTRAINTS (cumulative)" + elif [[ "$line" =~ Finalize\ Schema.* ]]; then + METRIC_NAME="Finalize Schema" + elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then + METRIC_NAME="Total Wall Clock Duration" + fi + + # If a metric was matched, insert it into the performance database + if [ -n "$METRIC_NAME" ]; then + DURATION=$(echo "$line" | grep -oP 
'\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1) + METRIC_VALUE=$(convert_to_seconds "$DURATION") + PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \" + INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) + VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now()); + \"" + echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds" + eval $PSQL_CMD + fi + done < "$LOG_FILE" + + - name: Delete Neon Project + if: ${{ always() && matrix.target_project == 'new_empty_project' }} + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target_project == 'large_existing_project' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} + branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} From e9dcfa2eb2950ff43a266238bb94cb2ec70fb8bc Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 11 Nov 2024 18:07:01 +0000 Subject: [PATCH 22/28] test_runner: skip more tests using decorator instead of pytest.skip (#9704) ## Problem Running `pytest.skip(...)` in a test body instead of marking the test with `@pytest.mark.skipif(...)` makes all fixtures to be initialised, which is not necessary if the test is going to be skipped anyway. Also, some tests are unnecessarily skipped (e.g. `test_layer_bloating` on Postgres 17, or `test_idle_reconnections` at all) or run (e.g. `test_parse_project_git_version_output_positive` more than on once configuration) according to comments. 
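To illustrate the difference, here is a minimal, hypothetical sketch (the test
names are made up; `neon_simple_env`, `build_type` and the new
`skip_in_debug_build` helper are the fixtures/helpers used in the test suite):

```python
import pytest

from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import skip_in_debug_build


def test_skip_in_body(neon_simple_env: NeonEnv, build_type: str):
    if build_type == "debug":
        # By the time this runs, the whole neon_simple_env fixture has already
        # been set up, only to be torn down again right away.
        pytest.skip("times out in debug builds")
    ...


@skip_in_debug_build("times out in debug builds")
def test_skip_with_decorator(neon_simple_env: NeonEnv):
    # The skipif marker is evaluated before fixture setup, so debug builds
    # never pay the cost of initializing neon_simple_env.
    ...
```

Both forms skip the test on debug builds, but only the decorator avoids the
fixture setup cost.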
## Summary of changes - Move `skip_on_postgres` / `xfail_on_postgres` / `run_only_on_default_postgres` decorators to `fixture.utils` - Add new `skip_in_debug_build` and `skip_on_ci` decorators - Replace `pytest.skip(...)` calls with decorators where possible --- test_runner/fixtures/pg_version.py | 31 ++------------ test_runner/fixtures/utils.py | 41 ++++++++++++++++++- ...er_max_throughput_getpage_at_latest_lsn.py | 13 +++--- test_runner/regress/test_branch_and_gc.py | 8 ++-- test_runner/regress/test_compaction.py | 5 +-- .../regress/test_download_extensions.py | 8 ++-- .../regress/test_ingestion_layer_size.py | 9 ++-- test_runner/regress/test_layer_bloating.py | 13 ++++-- test_runner/regress/test_layer_eviction.py | 11 ++--- test_runner/regress/test_logging.py | 3 +- test_runner/regress/test_neon_cli.py | 21 ++++------ .../regress/test_pageserver_layer_rolling.py | 12 ++---- .../regress/test_pageserver_restart.py | 10 ++--- .../regress/test_pageserver_secondary.py | 4 +- test_runner/regress/test_pg_regress.py | 4 +- test_runner/regress/test_replica_start.py | 11 +++-- test_runner/regress/test_sharding.py | 11 ++--- .../regress/test_storage_controller.py | 3 +- test_runner/regress/test_tenant_size.py | 5 +-- .../regress/test_timeline_detach_ancestor.py | 18 ++++---- test_runner/regress/test_wal_acceptor.py | 7 ++-- .../regress/test_wal_acceptor_async.py | 5 +-- 22 files changed, 123 insertions(+), 130 deletions(-) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 01f0245665..4feab52c43 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,10 +1,8 @@ from __future__ import annotations import enum -import os from typing import TYPE_CHECKING -import pytest from typing_extensions import override if TYPE_CHECKING: @@ -18,12 +16,15 @@ This fixture is used to determine which version of Postgres to use for tests. 
# Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument # TODO: use enum.StrEnum for Python >= 3.11 -@enum.unique class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" V17 = "17" + + # Default Postgres Version for tests that don't really depend on Postgres itself + DEFAULT = V16 + # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" @@ -59,27 +60,3 @@ class PgVersion(str, enum.Enum): # Make mypy happy # See https://github.com/python/mypy/issues/3974 return None - - -DEFAULT_VERSION: PgVersion = PgVersion.V16 - - -def skip_on_postgres(version: PgVersion, reason: str): - return pytest.mark.skipif( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, - reason=reason, - ) - - -def xfail_on_postgres(version: PgVersion, reason: str): - return pytest.mark.xfail( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, - reason=reason, - ) - - -def run_only_on_default_postgres(reason: str): - return pytest.mark.skipif( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, - reason=reason, - ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 01b7cf1026..96a651f0db 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -25,6 +25,7 @@ from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) +from fixtures.pg_version import PgVersion if TYPE_CHECKING: from collections.abc import Iterable @@ -37,6 +38,7 @@ if TYPE_CHECKING: Fn = TypeVar("Fn", bound=Callable[..., Any]) + COMPONENT_BINARIES = { "storage_controller": ("storage_controller",), "storage_broker": ("storage_broker",), @@ -519,7 +521,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str This is essentially: lines=$(comm -3 \ - <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ <(mkdir right && cd right && tar xf "$right" && find . 
-type f -print0 | xargs sha256sum | sort -k2) \ | wc -l) [ "$lines" = "0" ] @@ -643,3 +645,40 @@ def allpairs_versions(): ) ids.append(f"combination_{''.join(cur_id)}") return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids} + + +def skip_on_postgres(version: PgVersion, reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, + reason=reason, + ) + + +def xfail_on_postgres(version: PgVersion, reason: str): + return pytest.mark.xfail( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, + reason=reason, + ) + + +def run_only_on_default_postgres(reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is not PgVersion.DEFAULT, + reason=reason, + ) + + +def skip_in_debug_build(reason: str): + return pytest.mark.skipif( + os.getenv("BUILD_TYPE", "debug") == "debug", + reason=reason, + ) + + +def skip_on_ci(reason: str): + # `CI` variable is always set to `true` on GitHub + # Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables + return pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason=reason, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index c038fc3fd2..3dbbb197f4 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import os from pathlib import Path from typing import TYPE_CHECKING @@ -14,7 +13,7 @@ from fixtures.neon_fixtures import ( PgBin, wait_for_last_flush_lsn, ) -from fixtures.utils import get_scale_for_db, humantime_to_ms +from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci from performance.pageserver.util import ( setup_pageserver_with_tenants, @@ -38,9 +37,8 @@ if TYPE_CHECKING: @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [500]) @pytest.mark.timeout(10000) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +@skip_on_ci( + "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" ) def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, @@ -66,9 +64,8 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("n_clients", [1, 64]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +@skip_on_ci( + "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" ) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 6d1565c5e5..fccfbc7f09 100644 --- 
a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -8,7 +8,7 @@ from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, skip_in_debug_build # Test the GC implementation when running with branching. @@ -48,10 +48,8 @@ from fixtures.utils import query_scalar # Because the delta layer D covering lsn1 is corrupted, creating a branch # starting from lsn1 should return an error as follows: # could not find data for key ... at LSN ..., for request at LSN ... -def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_branch_and_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 420055ac3a..370df3c379 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -2,7 +2,6 @@ from __future__ import annotations import enum import json -import os import time from typing import TYPE_CHECKING @@ -13,7 +12,7 @@ from fixtures.neon_fixtures import ( generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -32,7 +31,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { } -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): """ This is a smoke test that compaction kicks in. 
The workload repeatedly churns diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 0134f80769..b2e19ad713 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -12,6 +12,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pg_version import PgVersion +from fixtures.utils import skip_on_postgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -41,17 +42,14 @@ def neon_env_builder_local( return neon_env_builder +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") +@skip_on_postgres(PgVersion.V17, reason="TODO: PG17 extension building") def test_remote_extensions( httpserver: HTTPServer, neon_env_builder_local: NeonEnvBuilder, httpserver_listen_address, pg_version, ): - if pg_version == PgVersion.V16: - pytest.skip("TODO: PG16 extension building") - if pg_version == PgVersion.V17: - pytest.skip("TODO: PG17 extension building") - # setup mock http server # that expects request for anon.tar.zst # and returns the requested file diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 646dac8e6e..2916748925 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -4,25 +4,22 @@ from collections.abc import Iterable from dataclasses import dataclass from typing import TYPE_CHECKING -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo -from fixtures.utils import human_bytes +from fixtures.utils import human_bytes, skip_in_debug_build if TYPE_CHECKING: from typing import Union -def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str): +@skip_in_debug_build("debug run is unnecessarily slow") +def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): """ Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector to show that we no longer create oversized layers. 
""" - if build_type == "debug": - pytest.skip("debug run is unnecessarily slow") - minimum_initdb_size = 20 * 1024**2 checkpoint_distance = 32 * 1024**2 minimum_good_layer_size = checkpoint_distance * 0.9 diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index a08d522fc2..d9043fef7f 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -2,7 +2,6 @@ from __future__ import annotations import os -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -10,12 +9,18 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion +from fixtures.utils import skip_on_postgres +@skip_on_postgres( + PgVersion.V14, + reason="pg_log_standby_snapshot() function is available since Postgres 16", +) +@skip_on_postgres( + PgVersion.V15, + reason="pg_log_standby_snapshot() function is available since Postgres 16", +) def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): - if neon_env_builder.pg_version != PgVersion.V16: - pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - env = neon_env_builder.init_start( initial_tenant_conf={ "gc_period": "0s", diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index c49ac6893e..2eb38c49b2 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -2,7 +2,6 @@ from __future__ import annotations import time -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -12,17 +11,13 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import skip_in_debug_build # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway) # and then download them back. 
-def test_basic_eviction( - neon_env_builder: NeonEnvBuilder, - build_type: str, -): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_basic_eviction(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start( diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index 9a3fdd835d..f6fbdcabfd 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -5,8 +5,7 @@ import uuid import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pg_version import run_only_on_default_postgres -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 783fb813cf..72db72f2b9 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import subprocess from pathlib import Path from typing import cast @@ -15,7 +14,7 @@ from fixtures.neon_fixtures import ( parse_project_git_version_output, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build def helper_compare_timeline_list( @@ -195,10 +194,8 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): res.check_returncode() -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_parse_project_git_version_output_positive(): commit = "b6f77b5816cf1dba12a3bc8747941182ce220846" @@ -217,10 +214,8 @@ def test_parse_project_git_version_output_positive(): assert parse_project_git_version_output(example) == commit -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_parse_project_git_version_output_local_docker(): """ Makes sure the tests don't accept the default version in Dockerfile one gets without providing @@ -234,10 +229,8 @@ def test_parse_project_git_version_output_local_docker(): assert input in str(e) -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="cli api sanity, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_binaries_version_parses(neon_binpath: Path): """ Ensures that we can parse the actual outputs of --version from a set of binaries. 
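The conversions above all follow the same pattern: ad-hoc `pytest.skip(...)` calls inside test bodies are replaced by small marker factories from fixtures/utils.py that read an environment variable once and return a plain `pytest.mark.skipif`, so they stack cleanly with `@pytest.mark.parametrize`, `@pytest.mark.timeout`, and with each other. A minimal, self-contained sketch of that mechanism (the test name and reasons here are made up; the real helpers are the ones added to fixtures/utils.py earlier in this series):

import os

import pytest


def skip_in_debug_build(reason: str):
    # BUILD_TYPE defaults to "debug", mirroring the helper added in fixtures/utils.py
    return pytest.mark.skipif(os.getenv("BUILD_TYPE", "debug") == "debug", reason=reason)


def skip_on_ci(reason: str):
    # GitHub Actions always sets CI=true for workflow runs
    return pytest.mark.skipif(os.getenv("CI", "false") == "true", reason=reason)


# The factories return ordinary marks, so they compose with other decorators.
@pytest.mark.timeout(600)
@skip_on_ci("needs dedicated hardware")
@skip_in_debug_build("too slow without optimizations")
def test_example():
    assert 1 + 1 == 2

The skip condition is evaluated at collection time, which is why the converted tests no longer need a build_type fixture argument just to decide whether to skip.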
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c0eb598891..200a323a3a 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import os import time from typing import TYPE_CHECKING @@ -16,7 +15,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until if TYPE_CHECKING: from typing import Optional @@ -227,12 +226,9 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): assert get_dirty_bytes(env) >= dirty_after_write -@pytest.mark.skipif( - # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is - # prohibitively slow in debug mode - os.getenv("BUILD_TYPE") == "debug", - reason="Avoid running bulkier ingest tests in debug mode", -) +# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is +# prohibitively slow in debug mode +@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode") def test_total_size_limit(neon_env_builder: NeonEnvBuilder): """ Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index f7c42fc893..fb6050689c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -8,7 +8,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until # Test restarting page server, while safekeeper and compute node keep @@ -155,12 +155,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # safekeeper and compute node keep running. @pytest.mark.timeout(540) @pytest.mark.parametrize("shard_count", [None, 4]) -def test_pageserver_chaos( - neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] -): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): # same rationale as with the immediate stop; we might leave orphan layers behind. 
neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 705b4ff054..d4aef96735 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -17,7 +17,7 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -765,7 +765,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): assert download_rate < expect_download_rate * 2 -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index b97fccddf5..f4698191eb 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -3,7 +3,6 @@ # from __future__ import annotations -import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import TYPE_CHECKING, cast @@ -19,6 +18,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage +from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: from typing import Optional @@ -329,7 +329,7 @@ def test_sql_regress( post_checks(env, test_output_dir, DBNAME, endpoint) -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index e81e7dad76..8e7c01f950 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -30,7 +30,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup from fixtures.pg_version import PgVersion -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import query_scalar, skip_on_postgres, wait_until CREATE_SUBXACTS_FUNC = """ create or replace function create_subxacts(n integer) returns void as $$ @@ -137,6 +137,12 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): assert secondary_cur.fetchone() == (1,) +@skip_on_postgres( + PgVersion.V14, reason="pg_log_standby_snapshot() function is available since Postgres 16" +) +@skip_on_postgres( + PgVersion.V15, reason="pg_log_standby_snapshot() function is available since Postgres 16" +) def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): """ Test that starting a replica works right after the primary has @@ -149,9 +155,6 @@ def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): """ env = neon_simple_env - if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: - 
pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") primary_conn = primary.connect() primary_cur = primary_conn.cursor() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index ec633e352c..0a4a53356d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -20,7 +20,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer from typing_extensions import override @@ -853,12 +853,9 @@ def test_sharding_split_stripe_size( wait_until(10, 1, assert_restart_notification) -@pytest.mark.skipif( - # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're - # validating in this test don't benefit much from debug assertions. - os.getenv("BUILD_TYPE") == "debug", - reason="Avoid running bulkier ingest tests in debug mode", -) +# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're +# validating in this test don't benefit much from debug assertions. +@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode") def test_sharding_ingest_layer_sizes( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index a069e0d01c..2c3d79b18a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -36,11 +36,12 @@ from fixtures.pageserver.utils import ( remote_storage_delete_key, timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion, run_only_on_default_postgres +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.storage_controller_proxy import StorageControllerProxy from fixtures.utils import ( + run_only_on_default_postgres, run_pg_bench_small, subprocess_capture, wait_until, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 0c431fa453..8b733da0c6 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -21,7 +20,7 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): @@ -279,7 +278,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 0e8519e07b..ef0eb05612 100644 --- 
a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -869,8 +869,17 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): assert count == 10000 -@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) -@pytest.mark.parametrize("sharded", [False, True]) +@pytest.mark.parametrize( + "mode, sharded", + [ + ("delete_timeline", False), + ("delete_timeline", True), + ("delete_tenant", False), + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + # ("delete_tenant", True) + ], +) def test_timeline_detach_ancestor_interrupted_by_deletion( neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool ): @@ -885,11 +894,6 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes """ - if sharded and mode == "delete_tenant": - # the shared/exclusive lock for tenant is blocking this: - # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") - shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e224d5eb01..0676b3dd9a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -54,6 +54,8 @@ from fixtures.utils import ( PropagatingThread, get_dir_size, query_scalar, + run_only_on_default_postgres, + skip_in_debug_build, start_in_background, wait_until, ) @@ -2104,10 +2106,9 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # The only way to verify this without manipulating time is to sleep for a while. # In this test we sleep for 60 seconds, so this test takes at least 1 minute to run. # This is longer than most other tests, we run it only for v16 to save CI resources. 
+@run_only_on_default_postgres("run only on release build to save CI resources") +@skip_in_debug_build("run only on release build to save CI resources") def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): - if os.environ.get("PYTEST_CURRENT_TEST", "").find("[debug-pg16]") == -1: - pytest.skip("run only on debug postgres v16 to save CI resources") - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index f328974264..d3e989afa8 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -14,6 +14,7 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: from typing import Optional @@ -760,10 +761,8 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat # The test takes more than default 5 minutes on Postgres 16, # see https://github.com/neondatabase/neon/issues/5305 @pytest.mark.timeout(600) +@skip_in_debug_build("times out in debug builds") def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str): - if build_type == "debug": - pytest.skip("times out in debug builds") - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() From 2d9652c434642b852ebaae6969f87ec4d93e3014 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 13:53:12 -0600 Subject: [PATCH 23/28] Clean up C.UTF-8 locale changes Removes some unnecessary initdb arguments, and fixes Neon for MacOS since it doesn't seem to ship a C.UTF-8 locale. 
Signed-off-by: Tristan Partin --- compute_tools/src/config.rs | 15 +++++++++++---- libs/pageserver_api/src/config.rs | 6 +++++- pageserver/src/tenant.rs | 6 ------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 50e2a95e9d..d4e413034e 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -74,10 +74,17 @@ pub fn write_postgres_conf( } // Locales - writeln!(file, "lc_messages='C.UTF-8'")?; - writeln!(file, "lc_monetary='C.UTF-8'")?; - writeln!(file, "lc_time='C.UTF-8'")?; - writeln!(file, "lc_numeric='C.UTF-8'")?; + if cfg!(target_os = "macos") { + writeln!(file, "lc_messages='C'")?; + writeln!(file, "lc_monetary='C'")?; + writeln!(file, "lc_time='C'")?; + writeln!(file, "lc_numeric='C'")?; + } else { + writeln!(file, "lc_messages='C.UTF-8'")?; + writeln!(file, "lc_monetary='C.UTF-8'")?; + writeln!(file, "lc_time='C.UTF-8'")?; + writeln!(file, "lc_numeric='C.UTF-8'")?; + } match spec.mode { ComputeMode::Primary => {} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 4272181954..f48c1febb5 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -277,7 +277,11 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - pub const DEFAULT_LOCALE: &str = "C.UTF-8"; + pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { + "C" + } else { + "C.UTF-8" + }; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 903174680e..774672aed6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4786,12 +4786,6 @@ async fn run_initdb( .args(["--username", &conf.superuser]) .args(["--encoding", "utf8"]) .args(["--locale", &conf.locale]) - .args(["--lc-collate", &conf.locale]) - .args(["--lc-ctype", &conf.locale]) - .args(["--lc-messages", &conf.locale]) - .args(["--lc-monetary", &conf.locale]) - .args(["--lc-numeric", &conf.locale]) - .args(["--lc-time", &conf.locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() From 5a138d08a3ab3c7cd79a81783ed1836e0a3dc14f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:30:32 -0500 Subject: [PATCH 24/28] feat(pageserver): support partial gc-compaction for delta layers (#9611) The final patch for partial compaction, part of https://github.com/neondatabase/neon/issues/9114, close https://github.com/neondatabase/neon/issues/8921 (note that we didn't implement parallel compaction or compaction scheduler for partial compaction -- currently this needs to be scheduled by using a Python script to split the keyspace, and in the future, automatically split based on the key partitioning when the pageserver wants to trigger a gc-compaction) ## Summary of changes * Update the layer selection algorithm to use the same selection as full compaction (everything intersect/below gc horizon) * Update the layer selection algorithm to also generate a list of delta layers that need to be rewritten * Add the logic to rewrite delta layers and add them back to the layer map * Update test case to do partial compaction on deltas --------- Signed-off-by: Alex Chi Z --- pageserver/compaction/src/helpers.rs | 9 + pageserver/src/tenant.rs | 235 +++++++--- .../src/tenant/storage_layer/delta_layer.rs | 4 + .../tenant/storage_layer/filter_iterator.rs | 25 +- .../tenant/storage_layer/merge_iterator.rs | 82 +++- pageserver/src/tenant/timeline/compaction.rs | 412 +++++++++++------- 6 files changed, 521 insertions(+), 246 deletions(-) diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 9dbb6ecedf..6b739d85a7 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -35,6 +35,15 @@ pub fn overlaps_with(a: &Range, b: &Range) -> bool { !(a.end <= b.start || b.end <= a.start) } +/// Whether a fully contains b, example as below +/// ```plain +/// | a | +/// | b | +/// ``` +pub fn fully_contains(a: &Range, b: &Range) -> bool { + a.start <= b.start && a.end >= b.end +} + pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { let x = std::mem::take(a); let mut all_ranges_iter = [x.into_iter(), b.into_iter()] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 774672aed6..e7c258d829 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -9223,6 +9223,23 @@ mod tests { Ok(()) } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + } + async fn inspect_and_sort( tline: &Arc, filter: Option>, @@ -9231,25 +9248,30 @@ mod tests { if let Some(filter) = filter { all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter)); } - all_layers.sort_by(|k1, k2| { - ( - k1.is_delta, - k1.key_range.start, - k1.key_range.end, - k1.lsn_range.start, - k1.lsn_range.end, - ) - .cmp(&( - k2.is_delta, - k2.key_range.start, - k2.key_range.end, - k2.lsn_range.start, - k2.lsn_range.end, - )) - }); + all_layers.sort_by(sort_layer_key); all_layers } + #[cfg(feature = "testing")] + fn check_layer_map_key_eq( + mut left: Vec, + mut right: Vec, + ) { + left.sort_by(sort_layer_key); + right.sort_by(sort_layer_key); + if left != right { + eprintln!("---LEFT---"); + for left in left.iter() { + eprintln!("{}", left); + } + eprintln!("---RIGHT---"); + for right in right.iter() { + eprintln!("{}", right); + } + assert_eq!(left, right); + } + } + #[cfg(feature = 
"testing")] #[tokio::test] async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> { @@ -9342,127 +9364,206 @@ mod tests { let cancel = CancellationToken::new(); - // Do a partial compaction on key range 0..4, we should generate a image layer; no other layers - // can be removed because they might be used for other key ranges. + // Do a partial compaction on key range 0..2 tline - .partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ + // newly-generated image layer for the partial compaction range 0-2 PersistentLayerKey { - key_range: get_key(0)..get_key(4), + key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), - is_delta: false + is_delta: false, }, + // delta1 is split and the second part is rewritten PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], ); - // Do a partial compaction on key range 4..10 + // Do a partial compaction on key range 2..4 tline - .partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ PersistentLayerKey { - key_range: get_key(0)..get_key(4), + key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { - // if (in the future) GC kicks in, this layer will be removed key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), - is_delta: false + is_delta: false, }, + // image layer generated for the compaction range 2-4 PersistentLayerKey { - key_range: get_key(4)..get_key(10), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, + // we have key2/key3 above the retain_lsn, so we still need this delta layer PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], + ); + + // Do a partial compaction on key range 4..9 + tline + .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey 
{ + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true, + }, + // image layer generated for this compaction range + PersistentLayerKey { + key_range: get_key(4)..get_key(9), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // Do a partial compaction on key range 9..10 + tline + .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true, + }, + PersistentLayerKey { + key_range: get_key(4)..get_key(9), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + // image layer generated for the compaction range + PersistentLayerKey { + key_range: get_key(9)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true, + }, + ], ); // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. tline - .partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ - PersistentLayerKey { - key_range: get_key(0)..get_key(4), - lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false - }, + // aha, we removed all unnecessary image/delta layers and got a very clean layer map! 
PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { - key_range: get_key(4)..get_key(10), - lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false - }, - PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true - }, - PersistentLayerKey { - key_range: get_key(5)..get_key(7), - lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], ); Ok(()) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 664c00a6b1..fec8a0a16c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -653,6 +653,10 @@ impl DeltaLayerWriter { }) } + pub fn is_empty(&self) -> bool { + self.inner.as_ref().unwrap().num_keys == 0 + } + /// /// Append a key-value pair to the file. /// diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index ccfcf68e8f..8660be1fcc 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,4 +1,4 @@ -use std::ops::Range; +use std::{ops::Range, sync::Arc}; use anyhow::bail; use pageserver_api::{ @@ -9,7 +9,10 @@ use utils::lsn::Lsn; use pageserver_api::value::Value; -use super::merge_iterator::MergeIterator; +use super::{ + merge_iterator::{MergeIterator, MergeIteratorItem}, + PersistentLayerKey, +}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -48,10 +51,10 @@ impl<'a> FilterIterator<'a> { }) } - pub async fn next(&mut self) -> anyhow::Result> { - while let Some(item) = self.inner.next().await? { + async fn next_inner(&mut self) -> anyhow::Result> { + while let Some(item) = self.inner.next_inner::().await? 
{ while self.current_filter_idx < self.retain_key_filters.len() - && item.0 >= self.retain_key_filters[self.current_filter_idx].end + && item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end { // [filter region] [filter region] [filter region] // ^ item @@ -68,7 +71,7 @@ impl<'a> FilterIterator<'a> { // ^ current filter (nothing) return Ok(None); } - if self.retain_key_filters[self.current_filter_idx].contains(&item.0) { + if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) { // [filter region] [filter region] [filter region] // ^ item // ^ current filter @@ -81,6 +84,16 @@ impl<'a> FilterIterator<'a> { } Ok(None) } + + pub async fn next(&mut self) -> anyhow::Result> { + self.next_inner().await + } + + pub async fn next_with_trace( + &mut self, + ) -> anyhow::Result)>> { + self.next_inner().await + } } #[cfg(test)] diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 980202f12c..2667d130f5 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,6 +1,7 @@ use std::{ cmp::Ordering, collections::{binary_heap, BinaryHeap}, + sync::Arc, }; use anyhow::bail; @@ -13,10 +14,11 @@ use pageserver_api::value::Value; use super::{ delta_layer::{DeltaLayerInner, DeltaLayerIterator}, image_layer::{ImageLayerInner, ImageLayerIterator}, + PersistentLayerDesc, PersistentLayerKey, }; #[derive(Clone, Copy)] -enum LayerRef<'a> { +pub(crate) enum LayerRef<'a> { Image(&'a ImageLayerInner), Delta(&'a DeltaLayerInner), } @@ -62,18 +64,20 @@ impl LayerIterRef<'_> { /// 1. Unified iterator for image and delta layers. /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). /// 3. Lazy creation of the real delta/image iterator. -enum IteratorWrapper<'a> { +pub(crate) enum IteratorWrapper<'a> { NotLoaded { ctx: &'a RequestContext, first_key_lower_bound: (Key, Lsn), layer: LayerRef<'a>, + source_desc: Arc, }, Loaded { iter: PeekableLayerIterRef<'a>, + source_desc: Arc, }, } -struct PeekableLayerIterRef<'a> { +pub(crate) struct PeekableLayerIterRef<'a> { iter: LayerIterRef<'a>, peeked: Option<(Key, Lsn, Value)>, // None == end } @@ -151,6 +155,12 @@ impl<'a> IteratorWrapper<'a> { layer: LayerRef::Image(image_layer), first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), ctx, + source_desc: PersistentLayerKey { + key_range: image_layer.key_range().clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()), + is_delta: false, + } + .into(), } } @@ -162,12 +172,18 @@ impl<'a> IteratorWrapper<'a> { layer: LayerRef::Delta(delta_layer), first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), ctx, + source_desc: PersistentLayerKey { + key_range: delta_layer.key_range().clone(), + lsn_range: delta_layer.lsn_range().clone(), + is_delta: true, + } + .into(), } } fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { match self { - Self::Loaded { iter } => iter + Self::Loaded { iter, .. 
} => iter .peek() .as_ref() .map(|(key, lsn, val)| (key, *lsn, Some(val))), @@ -191,6 +207,7 @@ impl<'a> IteratorWrapper<'a> { ctx, first_key_lower_bound, layer, + source_desc, } = self else { unreachable!() @@ -206,7 +223,10 @@ impl<'a> IteratorWrapper<'a> { ); } } - *self = Self::Loaded { iter }; + *self = Self::Loaded { + iter, + source_desc: source_desc.clone(), + }; Ok(()) } @@ -220,11 +240,19 @@ impl<'a> IteratorWrapper<'a> { /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. async fn next(&mut self) -> anyhow::Result> { - let Self::Loaded { iter } = self else { + let Self::Loaded { iter, .. } = self else { panic!("must load the iterator before using") }; iter.next().await } + + /// Get the persistent layer key corresponding to this iterator + fn trace_source(&self) -> Arc { + match self { + Self::Loaded { source_desc, .. } => source_desc.clone(), + Self::NotLoaded { source_desc, .. } => source_desc.clone(), + } + } } /// A merge iterator over delta/image layer iterators. @@ -242,6 +270,32 @@ pub struct MergeIterator<'a> { heap: BinaryHeap>, } +pub(crate) trait MergeIteratorItem { + fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self; + + fn key_lsn_value(&self) -> &(Key, Lsn, Value); +} + +impl MergeIteratorItem for (Key, Lsn, Value) { + fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self { + item + } + + fn key_lsn_value(&self) -> &(Key, Lsn, Value) { + self + } +} + +impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { + fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self { + (item, iter.trace_source().clone()) + } + + fn key_lsn_value(&self) -> &(Key, Lsn, Value) { + &self.0 + } +} + impl<'a> MergeIterator<'a> { pub fn create( deltas: &[&'a DeltaLayerInner], @@ -260,7 +314,7 @@ impl<'a> MergeIterator<'a> { } } - pub async fn next(&mut self) -> anyhow::Result> { + pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { // Once we load the iterator, we can know the real first key-value pair in the iterator. @@ -275,10 +329,22 @@ impl<'a> MergeIterator<'a> { binary_heap::PeekMut::pop(iter); continue; }; - return Ok(Some(item)); + return Ok(Some(R::new(item, &iter))); } Ok(None) } + + /// Get the next key-value pair from the iterator. + pub async fn next(&mut self) -> anyhow::Result> { + self.next_inner().await + } + + /// Get the next key-value pair from the iterator, and trace where the key comes from. + pub async fn next_with_trace( + &mut self, + ) -> anyhow::Result)>> { + self.next_inner().await + } } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 01c2803881..e6ef1aae2b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. 
-use std::collections::{BinaryHeap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -56,7 +56,7 @@ use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_compaction::helpers::overlaps_with; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; use pageserver_compaction::interface::*; use super::CompactionError; @@ -64,6 +64,23 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; +pub struct GcCompactionJobDescription { + /// All layers to read in the compaction job + selected_layers: Vec, + /// GC cutoff of the job + gc_cutoff: Lsn, + /// LSNs to retain for the job + retain_lsns_below_horizon: Vec, + /// Maximum layer LSN processed in this compaction + max_layer_lsn: Lsn, + /// Only compact layers overlapping with this range + compaction_key_range: Range, + /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap. + /// This field is here solely for debugging. The field will not be read once the compaction + /// description is generated. + rewrite_layers: Vec>, +} + /// The result of bottom-most compaction for a single key at each LSN. #[derive(Debug)] #[cfg_attr(test, derive(PartialEq))] @@ -1722,7 +1739,8 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.partial_compact_with_gc(None, cancel, flags, ctx).await + self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx) + .await } /// An experimental compaction building block that combines compaction with garbage collection. @@ -1732,12 +1750,15 @@ impl Timeline { /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. /// - /// If `key_range`, it will only compact the keys within the range, aka partial compaction. This functionality - /// is not complete yet, and if it is set, only image layers will be generated. - /// + /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction. + /// Partial compaction will read and process all layers overlapping with the key range, even if it might + /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained + /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing + /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not + /// part of the range. pub(crate) async fn partial_compact_with_gc( self: &Arc, - compaction_key_range: Option>, + compaction_key_range: Range, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, @@ -1762,9 +1783,8 @@ impl Timeline { .await?; let dry_run = flags.contains(CompactFlags::DryRun); - let partial_compaction = compaction_key_range.is_some(); - if let Some(ref compaction_key_range) = compaction_key_range { + if compaction_key_range == (Key::MIN..Key::MAX) { info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end); } else { info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); @@ -1780,7 +1800,7 @@ impl Timeline { // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. 
Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. - let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction { + let job_desc = { let guard = self.layers.read().await; let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); @@ -1810,9 +1830,21 @@ impl Timeline { }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. + let mut rewrite_layers = Vec::new(); for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().end <= max_layer_lsn { + if desc.get_lsn_range().end <= max_layer_lsn + && overlaps_with(&desc.get_key_range(), &compaction_key_range) + { + // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range, + // even if it might contain extra keys selected_layers.push(guard.get_from_desc(&desc)); + // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine + // to overlap image layers) + if desc.is_delta() + && !fully_contains(&compaction_key_range, &desc.get_key_range()) + { + rewrite_layers.push(desc); + } } } if selected_layers.is_empty() { @@ -1820,82 +1852,59 @@ impl Timeline { return Ok(()); } retain_lsns_below_horizon.sort(); - (selected_layers, gc_cutoff, retain_lsns_below_horizon) - } else { - // In case of partial compaction, we currently only support generating image layers, and therefore, - // we pick all layers that are below the lowest retain_lsn and does not intersect with any of the layers. - let guard = self.layers.read().await; - let layers = guard.layer_map()?; - let gc_info = self.gc_info.read().unwrap(); - let mut min_lsn = gc_info.cutoffs.select_min(); - for (lsn, _, _) in &gc_info.retain_lsns { - if lsn < &min_lsn { - min_lsn = *lsn; - } + GcCompactionJobDescription { + selected_layers, + gc_cutoff, + retain_lsns_below_horizon, + max_layer_lsn, + compaction_key_range, + rewrite_layers, } - for lsn in gc_info.leases.keys() { - if lsn < &min_lsn { - min_lsn = *lsn; - } - } - let mut selected_layers = Vec::new(); - drop(gc_info); - // |-------| |-------| |-------| - // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers - // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that - // | Delta | | Delta | | Delta | ...we can remove them after compaction - // |-------| |-------| |-------| - // Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers. 
- let Some(compaction_key_range) = compaction_key_range.as_ref() else { - unreachable!() - }; - for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().end <= min_lsn - && overlaps_with(&desc.key_range, compaction_key_range) - { - selected_layers.push(guard.get_from_desc(&desc)); - } - } - if selected_layers.is_empty() { - info!("no layers to compact with gc"); - return Ok(()); - } - (selected_layers, min_lsn, Vec::new()) }; let lowest_retain_lsn = if self.ancestor_timeline.is_some() { - if partial_compaction { - warn!("partial compaction cannot run on child branches (for now)"); - return Ok(()); - } Lsn(self.ancestor_lsn.0 + 1) } else { - let res = retain_lsns_below_horizon + let res = job_desc + .retain_lsns_below_horizon .first() .copied() - .unwrap_or(gc_cutoff); + .unwrap_or(job_desc.gc_cutoff); if cfg!(debug_assertions) { assert_eq!( res, - retain_lsns_below_horizon + job_desc + .retain_lsns_below_horizon .iter() .min() .copied() - .unwrap_or(gc_cutoff) + .unwrap_or(job_desc.gc_cutoff) ); } res }; info!( - "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", - layer_selection.len(), - gc_cutoff, - lowest_retain_lsn + "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}", + job_desc.selected_layers.len(), + job_desc.rewrite_layers.len(), + job_desc.max_layer_lsn, + job_desc.gc_cutoff, + lowest_retain_lsn, + job_desc.compaction_key_range.start, + job_desc.compaction_key_range.end ); - self.check_compaction_space(&layer_selection).await?; + for layer in &job_desc.selected_layers { + debug!("read layer: {}", layer.layer_desc().key()); + } + for layer in &job_desc.rewrite_layers { + debug!("rewrite layer: {}", layer.key()); + } + + self.check_compaction_space(&job_desc.selected_layers) + .await?; // Generate statistics for the compaction - for layer in &layer_selection { + for layer in &job_desc.selected_layers { let desc = layer.layer_desc(); if desc.is_delta() { stat.visit_delta_layer(desc.file_size()); @@ -1906,25 +1915,25 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - let layer_names: Vec = layer_selection + let layer_names = job_desc + .selected_layers .iter() .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!("cannot run gc-compaction because {}", err); + warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); } // The maximum LSN we are processing in this compaction loop - let end_lsn = layer_selection + let end_lsn = job_desc + .selected_layers .iter() .map(|l| l.layer_desc().lsn_range.end) .max() .unwrap(); - // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized - // as an L0 layer. let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); - for layer in &layer_selection { + for layer in &job_desc.selected_layers { let resident_layer = layer.download_and_keep_resident().await?; downloaded_layers.push(resident_layer); } @@ -1943,8 +1952,8 @@ impl Timeline { dense_ks, sparse_ks, )?; - // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. - // Data of the same key. + + // Step 2: Produce images+deltas. 
let mut accumulated_values = Vec::new(); let mut last_key: Option = None; @@ -1956,10 +1965,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - compaction_key_range - .as_ref() - .map(|x| x.start) - .unwrap_or(Key::MIN), + job_desc.compaction_key_range.start, lowest_retain_lsn, self.get_compaction_target_size(), ctx, @@ -1979,6 +1985,13 @@ impl Timeline { ) .await?; + #[derive(Default)] + struct RewritingLayers { + before: Option, + after: Option, + } + let mut delta_layer_rewriters = HashMap::, RewritingLayers>::new(); + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. /// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image @@ -2004,10 +2017,51 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - while let Some((key, lsn, val)) = merge_iter.next().await? { + while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? { if cancel.is_cancelled() { return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error } + if !job_desc.compaction_key_range.contains(&key) { + if !desc.is_delta { + continue; + } + let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default(); + let rewriter = if key < job_desc.compaction_key_range.start { + if rewriter.before.is_none() { + rewriter.before = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + desc.key_range.start, + desc.lsn_range.clone(), + ctx, + ) + .await?, + ); + } + rewriter.before.as_mut().unwrap() + } else if key >= job_desc.compaction_key_range.end { + if rewriter.after.is_none() { + rewriter.after = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + job_desc.compaction_key_range.end, + desc.lsn_range.clone(), + ctx, + ) + .await?, + ); + } + rewriter.after.as_mut().unwrap() + } else { + unreachable!() + }; + rewriter.put_value(key, lsn, val, ctx).await?; + continue; + } match val { Value::Image(_) => stat.visit_image_key(&val), Value::WalRecord(_) => stat.visit_wal_key(&val), @@ -2018,35 +2072,27 @@ impl Timeline { } accumulated_values.push((key, lsn, val)); } else { - let last_key = last_key.as_mut().unwrap(); - stat.on_unique_key_visited(); - let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { - !compaction_key_range.contains(last_key) - } else { - false - }; - if !skip_adding_key { - let retention = self - .generate_key_retention( - *last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, *last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- retention - .pipe_to( - *last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; - } + let last_key: &mut Key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + job_desc.gc_cutoff, + &job_desc.retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, + ) + .await?; + retention + .pipe_to( + *last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -2057,35 +2103,43 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); stat.on_unique_key_visited(); - let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { - !compaction_key_range.contains(&last_key) - } else { - false - }; - if !skip_adding_key { - let retention = self - .generate_key_retention( - last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. - retention - .pipe_to( - last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; - } + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + job_desc.gc_cutoff, + &job_desc.retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, + ) + .await?; + retention + .pipe_to( + last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; // end: move the above part to the loop body + let mut rewrote_delta_layers = Vec::new(); + for (key, writers) in delta_layer_rewriters { + if let Some(delta_writer_before) = writers.before { + let (desc, path) = delta_writer_before + .finish(job_desc.compaction_key_range.start, ctx) + .await?; + let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + rewrote_delta_layers.push(layer); + } + if let Some(delta_writer_after) = writers.after { + let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?; + let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + rewrote_delta_layers.push(layer); + } + } + let discard = |key: &PersistentLayerKey| { let key = key.clone(); async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await } @@ -2093,10 +2147,7 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { - let end_key = compaction_key_range - .as_ref() - .map(|x| x.end) - .unwrap_or(Key::MAX); + let end_key = job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) .await? @@ -2117,10 +2168,8 @@ impl Timeline { Vec::new() }; - if partial_compaction && !produced_delta_layers.is_empty() { - bail!("implementation error: partial compaction should not be producing delta layers (for now)"); - } - + // TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if + // compaction is cancelled at this point, we might have some layers that are not cleaned up. 
let mut compact_to = Vec::new(); let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); @@ -2128,52 +2177,84 @@ impl Timeline { for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { + if cfg!(debug_assertions) { + info!("produced delta layer: {}", layer.layer_desc().key()); + } stat.produce_delta_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { + if cfg!(debug_assertions) { + info!("discarded delta layer: {}", l); + } keep_layers.insert(l); stat.discard_delta_layer(); } } } + for layer in &rewrote_delta_layers { + debug!( + "produced rewritten delta layer: {}", + layer.layer_desc().key() + ); + } + compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { match action { BatchWriterResult::Produced(layer) => { + debug!("produced image layer: {}", layer.layer_desc().key()); stat.produce_image_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { + debug!("discarded image layer: {}", l); keep_layers.insert(l); stat.discard_image_layer(); } } } - let mut layer_selection = layer_selection; - layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); - if let Some(ref compaction_key_range) = compaction_key_range { - // Partial compaction might select more data than it processes, e.g., if - // the compaction_key_range only partially overlaps: - // - // [---compaction_key_range---] - // [---A----][----B----][----C----][----D----] - // - // A,B,C,D are all in the `layer_selection`. The created image layers contain - // whatever is needed from B, C, and from `----]` of A, and from `[--` of D. - // - // In contrast, `[--A-` and `--D----]` have not been processed, so, we must - // keep that data. - // - // The solution for now is to keep A and D completely. - // (layer_selection is what we'll remove from the layer map, so, - // retain what is _not_ fully covered by compaction_key_range). - layer_selection.retain(|x| { - let key_range = &x.layer_desc().key_range; - key_range.start >= compaction_key_range.start - && key_range.end <= compaction_key_range.end - }); + + let mut layer_selection = job_desc.selected_layers; + + // Partial compaction might select more data than it processes, e.g., if + // the compaction_key_range only partially overlaps: + // + // [---compaction_key_range---] + // [---A----][----B----][----C----][----D----] + // + // For delta layers, we will rewrite the layers so that it is cut exactly at + // the compaction key range, so we can always discard them. However, for image + // layers, as we do not rewrite them for now, we need to handle them differently. + // Assume image layers A, B, C, D are all in the `layer_selection`. + // + // The created image layers contain whatever is needed from B, C, and from + // `----]` of A, and from `[---` of D. + // + // In contrast, `[---A` and `D----]` have not been processed, so, we must + // keep that data. + // + // The solution for now is to keep A and D completely if they are image layers. + // (layer_selection is what we'll remove from the layer map, so, retain what + // is _not_ fully covered by compaction_key_range). 
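+        //
+        // A hypothetical illustration (key values invented for this comment): with a
+        // compaction key range of [20, 60), an image layer covering [30, 50) is fully
+        // contained and can be dropped in favour of the newly produced image layers,
+        // while one covering [10, 30) is only partially covered and is therefore kept;
+        // an image layer entirely outside the range, e.g. [70, 90), would violate the
+        // selection constraint checked below.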
+ for layer in &layer_selection { + if !layer.layer_desc().is_delta() { + if !overlaps_with( + &layer.layer_desc().key_range, + &job_desc.compaction_key_range, + ) { + bail!("violated constraint: image layer outside of compaction key range"); + } + if !fully_contains( + &job_desc.compaction_key_range, + &layer.layer_desc().key_range, + ) { + keep_layers.insert(layer.layer_desc().key()); + } + } } + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + info!( "gc-compaction statistics: {}", serde_json::to_string(&stat)? @@ -2192,6 +2273,7 @@ impl Timeline { // Step 3: Place back to the layer map. { + // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) let mut guard = self.layers.write().await; guard .open_mut()? From fde16f86140deeefd300cf8bf3fc17dd93cfa22d Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Mon, 11 Nov 2024 21:33:29 +0100 Subject: [PATCH 25/28] use batch gh-workflow-stats-action with separate table (#9722) We found that exporting GH Workflow Runs in batch is more efficient due to - better utilisation of Github API - and gh runners usage is rounded to minutes, so even when ad-hoc export is done in 5-10 seconds, we billed for one minute usage So now we introduce batch exporting, with version v0.2.x of github workflow stats exporter. How it's expected to work now: - every 15 minutes we query for the workflow runs, created in last 2 hours - to avoid missing workflows that ran for more than 2 hours, every night (00:25) we will query workflows created in past 24 hours and export them as well - should we have query for even longer periods? - lets see how it works with current schedule - for longer periods like for days or weeks, it may require to adjust logic and concurrency of querying data, so lets for now use simpler version --- .../workflows/report-workflow-stats-batch.yml | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/report-workflow-stats-batch.yml diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml new file mode 100644 index 0000000000..98e394a3c2 --- /dev/null +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -0,0 +1,29 @@ +name: Report Workflow Stats Batch + +on: + schedule: + - cron: '*/15 * * * *' + - cron: '25 0 * * *' + +jobs: + gh-workflow-stats-batch: + name: GitHub Workflow Stats Batch + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 2 hours + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_batch_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '2h' + - name: Export Workflow Run for the past 24 hours + if: github.event.schedule == '25 0 * * *' + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_batch_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '24h' From 4b075db7ea69ebd666d65a80d49c5178c37e9607 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 14:49:37 -0600 Subject: [PATCH 26/28] Add a postgres_exporter config file This exporter logs an ERROR if a file called `postgres_exporter.yml` is not located in its current working directory. We can silence it by adding an empty config file and pointing the exporter at it. 
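The exporter is then started with its `--config.file` flag pointing at the new (empty) file, e.g. `/bin/postgres_exporter --config.file=/etc/postgres_exporter.yml`, as shown in the vm-image-spec changes below.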
Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 2 ++ compute/etc/postgres_exporter.yml | 0 compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 compute/etc/postgres_exporter.yml diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6efef9e969..a3e80223eb 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1475,6 +1475,8 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter +COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml + COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/postgres_exporter.yml b/compute/etc/postgres_exporter.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 79f894c289..ac9f5c6904 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index ff04b9e4c6..0d178e1c24 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: sql-exporter user: nobody sysvInitAction: respawn From b018bc7da89c9adf889829e2ef684fae34012fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 11 Nov 2024 23:29:21 +0100 Subject: [PATCH 27/28] Add a retain_lsn test (#9599) Add a test that ensures the `retain_lsn` functionality works. Right now, there is not a single test that is broken if offloaded or non-offloaded timelines don't get registered at their parents, preventing gc from discarding the ancestor_lsns of the children. This PR fills that gap. The test has four modes: * `offloaded`: offload the child timeline, run compaction on the parent timeline, unarchive the child timeline, then try reading from it. hopefully the data is still there. * `offloaded-corrupted`: offload the child timeline, corrupts the manifest in a way that the pageserver believes the timeline was flattened. 
This is the closest we can get to pretend the `retain_lsn` mechanism doesn't exist for offloaded timelines, so we can avoid adding endpoints to the pageserver that do this manually for tests. The test then checks that indeed data is corrupted and the endpoint can't be started. That way we know that the test is actually working, and actually tests the `retain_lsn` mechanism, instead of say the lsn lease mechanism, or one of the many other mechanisms that impede gc. * `archived`: the child timeline gets archived but doesn't get offloaded. this currently matches the `None` case but we might have refactors in the future that make archived timelines sufficiently different from non-archived ones. * `None`: the child timeline doesn't even get archived. this tests that normal timelines participate in `retain_lsn`. I've made them locally not participate in `retain_lsn` (via commenting out the respective `ancestor_children.push` statement in tenant.rs) and ran the testsuite, and not a single test failed. So this test is first of its kind. Part of #8088. --- test_runner/regress/test_timeline_archive.py | 154 ++++++++++++++++++- 1 file changed, 152 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 3e9812c38a..d3839e3d2c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -1,15 +1,22 @@ from __future__ import annotations +import json +from typing import Optional + import pytest from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException -from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty -from fixtures.remote_storage import s3_storage +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty, list_prefix +from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.parametrize("shard_count", [0, 4]) @@ -369,3 +376,146 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(tenant_id)}/", ) + + +@pytest.mark.parametrize("offload_child", ["offload", "offload-corrupt", "archive", None]) +def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Optional[str]): + """ + Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones + """ + if offload_child == "offload-corrupt": + # Our corruption code only works with S3 compatible storage + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # set small image creation thresholds so that gc deletes data + "image_creation_threshold": 2, + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # Disable pitr, we only want the latest lsn + "pitr_interval": "0s", + # Don't rely on endpoint lsn leases + "lsn_lease_length": "0s", + } + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(v int, key serial primary key, t text default 'data_content')", + "SELECT setseed(0.4321)", + "INSERT INTO foo SELECT v FROM (SELECT generate_series(1,2048), (random() * 409600)::int as v) as random_numbers", + ] + ) + pre_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + log.info(f"Pre branch sum: {pre_branch_sum}") + last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id) + + # Create a branch and write some additional data to the parent + child_timeline_id = env.create_branch("test_archived_branch", tenant_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + # Do some churn of the data. This is important so that we can overwrite image layers. + for i in range(10): + endpoint.safe_psql_many( + [ + f"SELECT setseed(0.23{i})", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 2", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 1", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 0", + ] + ) + post_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + log.info(f"Post branch sum: {post_branch_sum}") + last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id) + + if offload_child is not None: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert leaf_detail["is_archived"] is True + if "offload" in offload_child: + ps_http.timeline_offload(tenant_id, child_timeline_id) + + # Do a restart to get rid of any in-memory objects (we only init gc info once, at attach) + env.pageserver.stop() + if offload_child == "offload-corrupt": + assert isinstance(env.pageserver_remote_storage, S3Storage) + listing = list_prefix( + env.pageserver_remote_storage, f"tenants/{str(tenant_id)}/tenant-manifest" + ) + objects: list[ObjectTypeDef] = listing.get("Contents", []) + assert len(objects) > 0 + remote_key: str = str(objects[0].get("Key", [])) + local_path = str(env.repo_dir / "tenant-manifest.json") + + log.info(f"Downloading {remote_key} -> {local_path}") + env.pageserver_remote_storage.client.download_file( + env.pageserver_remote_storage.bucket_name, remote_key, local_path + ) + + log.info(f"Corrupting {local_path}") + with open(local_path) as manifest_json_file: + manifest_json = json.load(manifest_json_file) + for offloaded_timeline in manifest_json["offloaded_timelines"]: + offloaded_timeline["ancestor_retain_lsn"] = None + with open(local_path, "w") as manifest_json_file: + json.dump(manifest_json, manifest_json_file) + + log.info(f"Uploading {local_path} -> {remote_key}") + env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + # The point of our earlier efforts was to provoke these + env.pageserver.allowed_errors.extend( + [ + ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", + ".*page_service_conn_main.*could not find data for key.*", + ] + ) + env.pageserver.start() + + # Do an agressive gc and compaction of the parent branch + ps_http.timeline_gc(tenant_id=tenant_id, timeline_id=root_timeline_id, 
gc_horizon=0) + ps_http.timeline_checkpoint( + tenant_id, + root_timeline_id, + force_l0_compaction=True, + force_repartition=True, + wait_until_uploaded=True, + compact=True, + ) + + if offload_child is not None: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + # Now, after unarchival, the child timeline should still have its data accessible (or corrupted) + if offload_child == "offload-corrupt": + with pytest.raises(RuntimeError, match=".*failed to get basebackup.*"): + env.endpoints.create_start( + "test_archived_branch", tenant_id=tenant_id, basebackup_request_tries=1 + ) + else: + with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint: + sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + assert sum == pre_branch_sum From 5be6b07cf169665bb99548c16af084971ccd7ec5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 17:36:45 -0600 Subject: [PATCH 28/28] Improve typing related to regress/test_logical_replication.py (#9725) Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_fixtures.py | 4 +- .../regress/test_logical_replication.py | 50 ++++++++++++------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 79baa8a32d..0728a33a63 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -286,7 +286,7 @@ class PgProtocol: return self.safe_psql_many([query], **kwargs)[0] def safe_psql_many( - self, queries: Iterable[str], log_query=True, **kwargs: Any + self, queries: Iterable[str], log_query: bool = True, **kwargs: Any ) -> list[list[tuple[Any, ...]]]: """ Execute queries against the node and return all rows. @@ -306,7 +306,7 @@ class PgProtocol: result.append(cur.fetchall()) return result - def safe_psql_scalar(self, query, log_query=True) -> Any: + def safe_psql_scalar(self, query: str, log_query: bool = True) -> Any: """ Execute query returning single row with single column. 
""" diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 30027463df..df83ca1c44 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,24 +4,31 @@ import time from functools import partial from random import choice from string import ascii_lowercase +from typing import TYPE_CHECKING, cast -from fixtures.common_types import Lsn +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + PgProtocol, + VanillaPostgres, + ) + def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) -def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): +def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env tenant_id = env.initial_tenant @@ -160,10 +167,10 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): - def slot_removed(ep): +def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): + def slot_removed(ep: Endpoint): assert ( - endpoint.safe_psql( + ep.safe_psql( "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" )[0][0] == 0 @@ -254,7 +261,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # Tests that walsender correctly blocks until WAL is downloaded from safekeepers -def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -336,13 +343,13 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, so we don't do anything special # to ensure that. -def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env env.create_branch("init") endpoint = env.endpoints.create_start("init") - tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] - timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + tenant_id = TenantId(cast("str", endpoint.safe_psql("show neon.tenant_id")[0][0])) + timeline_id = TimelineId(cast("str", endpoint.safe_psql("show neon.timeline_id")[0][0])) cur = endpoint.connect().cursor() cur.execute("create table t(key int, value text)") @@ -380,7 +387,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. -def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): +def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env env.create_branch("init") @@ -522,15 +529,20 @@ def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: because for some WAL records like vacuum subscriber won't get any data at all. 
""" - publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + publisher_flush_lsn = Lsn( + cast("str", publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + ) def check_caughtup(): - res = publisher.safe_psql( - """ + res = cast( + "tuple[str, str, str]", + publisher.safe_psql( + """ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s where s.active_pid = sr.pid and s.slot_type = 'logical'; """ - )[0] + )[0], + ) sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) log.info( f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" @@ -545,7 +557,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication # flush_lsn reporting to publisher. Without this, subscriber may ack too far, # losing data on restart because publisher implicitly advances positition given # in START_REPLICATION to the confirmed_flush_lsn of the slot. -def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): +def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env # use vanilla as publisher to allow writes on it when safekeeper is down vanilla_pg.configure( @@ -593,7 +605,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): # logical_replication_wait_flush_lsn_sync is expected to hang while # safekeeper is down. vanilla_pg.safe_psql("checkpoint;") - assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + assert cast("int", sub.safe_psql_scalar("SELECT count(*) FROM t")) == 1000 # restart subscriber and ensure it can catch up lost tail again sub.stop(mode="immediate")