Compare commits


16 Commits

Author SHA1 Message Date
Folke Behrens
0896dcf3d4 proxy: Add temporary stream_events tool
neondatabase/cloud#19600
2024-11-14 11:49:07 +01:00
Arseny Sher
d06bf4b0fe safekeeper: fix atomicity of WAL truncation (#9685)
If WAL truncation fails in the middle, it might leave some data on disk
above the write/flush LSN. In theory, concatenated with previous records,
this could form bogus WAL (though very unlikely in practice because the
CRC would protect against it). To protect against it, set the
pending_wal_truncation flag: before any WAL writes, truncation must be
retried until it succeeds. We already did that for safekeeper restarts;
now extend this mechanism to failures without a restart. Also,
importantly, reset LSNs at the beginning of the operation, not at the
end, because once on-disk deletion starts the previous pointers are wrong.

All this most likely hasn't created any problems in practice because the
CRC protects against the consequences.

Tests for this are hard; simulation infrastructure might be useful here
in the future, but not yet.
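For illustration, the retry-flag mechanism described above can be sketched in Python (the real implementation is the Rust change to safekeeper's wal_storage.rs shown further down in this diff; all names and types here are illustrative):

```python
# Sketch only; the real logic lives in safekeeper's wal_storage.rs.
class WalStorageSketch:
    def __init__(self) -> None:
        # After a restart we don't know whether the last truncation finished,
        # so assume one is pending until truncate_wal() succeeds once.
        self.pending_wal_truncation = True
        self.write_lsn = 0
        self.segments: dict[int, bytes] = {}  # start LSN -> data

    def truncate_wal(self, end_pos: int) -> None:
        # Set the flag and reset LSNs *before* deleting anything:
        # once on-disk deletion starts, the old pointers are wrong anyway.
        self.pending_wal_truncation = True
        self.write_lsn = end_pos
        for lsn in [l for l in self.segments if l >= end_pos]:
            del self.segments[lsn]  # in reality this may fail part-way
        # Only a fully successful truncation clears the flag.
        self.pending_wal_truncation = False

    def write_wal(self, startpos: int, buf: bytes) -> None:
        if self.pending_wal_truncation:
            raise RuntimeError("WAL truncation pending; retry truncate_wal() first")
        self.segments[startpos] = buf
        self.write_lsn = startpos + len(buf)
```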
2024-11-14 13:06:42 +03:00
Tristan Partin
1280b708f1 Improve error handling for NeonAPI fixture
Move error handling to the common request function and add a debug log.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-11-13 20:35:48 -06:00
John Spray
b4e00b8b22 pageserver: refuse to load tenants with suspiciously old indices in old generations (#9719)
## Problem

Historically, if a control component passed a pageserver "generation: 1",
this could be a quick way to corrupt a tenant by loading a historic
index.

Follows https://github.com/neondatabase/neon/pull/9383
Closes #6951 

## Summary of changes

- Introduce a Fatal variant to DownloadError, to enable index downloads
to signal when they have encountered a scary enough situation that we
shouldn't proceed to load the tenant.
- Handle this variant by putting the tenant into a broken state (no
matter which timeline within the tenant reported it)
- Add a test for this case

In the event that this behavior fires when we don't want it to, we have
ways to intervene:
- "Touch" an affected index to update its mtime (download+upload S3
object)
- If this behavior is triggered, it indicates we're attaching in some
old generation, so we should be able to fix that by manually bumping
generation numbers in the storage controller database (this should never
happen, but it's an option if it does)
2024-11-13 18:07:39 +00:00
Heikki Linnakangas
10aaa3677d PostgreSQL minor version updates (17.1, 16.5, 15.9, 14.14) (#9727)
This includes a patch to temporarily disable one test in the pg_anon
test suite. It is an upstream issue: the test started failing with the
new PostgreSQL minor versions because of a change in the default
timezone used in tests. We don't want to block the release for this,
so just disable the test for now. See
199f0a392b (note_2148017485)

Corresponding postgres repository PRs:
https://github.com/neondatabase/postgres/pull/524
https://github.com/neondatabase/postgres/pull/525
https://github.com/neondatabase/postgres/pull/526
https://github.com/neondatabase/postgres/pull/527
2024-11-13 15:08:58 +02:00
Heikki Linnakangas
d5435b1a81 tests: Increase timeout in test_create_churn_during_restart (#9736)
This test was seen to be flaky, e.g. at:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9457/11804246485/index.html#suites/ec4311502db344eee91f1354e9dc839b/982bd121ea698414/.
If I _reduce_ the timeout from 10s to 8s on my laptop, it reliably hits
that timeout and fails. That suggests that the test is pretty close to
the edge even when it passes. Let's bump up the timeout to 30 s to make
it more robust.

See also https://github.com/neondatabase/neon/issues/9730, although the
error message is different there.
2024-11-13 12:20:32 +02:00
Anastasia Lubennikova
080d585b22 Add installed_extensions prometheus metric (#9608)
and add a /metrics endpoint to compute_ctl to expose these metrics.

Metric format example for the extension pg_rag, with versions 1.2.3 and
1.4.2 installed in 3 and 1 databases respectively:

neon_extensions_installed{extension="pg_rag", version="1.2.3"} = 3
neon_extensions_installed{extension="pg_rag", version="1.4.2"} = 1

Infra part: https://github.com/neondatabase/flux-fleet/pull/251

Co-authored-by: Tristan Partin <tristan@neon.tech>
2024-11-13 09:36:48 +00:00
John Spray
7595d3afe6 pageserver: add no_sync for use in regression tests (2/2) (#9678)
## Problem

Follow-up to https://github.com/neondatabase/neon/pull/9677; this enables
`no_sync` in tests. It can be merged once the next release has
happened.

## Summary of changes

- Always run pageserver with `no_sync = true` in tests.
2024-11-13 09:17:26 +00:00
Konstantin Knizhnik
1ff5333a1b Do not wallog AUX files at replica (#9457)
## Problem

Attempting to persist LR stuff at a replica causes a `cannot make new
WAL entries during recovery` error.
See https://neondb.slack.com/archives/C07S7RBFVRA/p1729280401283389

## Summary of changes

Do not WAL-log AUX files at the replica.
Related Postgres PRs:

https://github.com/neondatabase/postgres/pull/517
https://github.com/neondatabase/postgres/pull/516
https://github.com/neondatabase/postgres/pull/515
https://github.com/neondatabase/postgres/pull/514


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-11-13 08:50:01 +02:00
Tristan Partin
d8f5d43549 Fix autocommit footguns in performance tests
psycopg2 has the following warning related to autocommit:

> By default, any query execution, including a simple SELECT will start
> a transaction: for long-running programs, if no further action is
> taken, the session will remain “idle in transaction”, an undesirable
> condition for several reasons (locks are held by the session, tables
> bloat…). For long lived scripts, either ensure to terminate a
> transaction as soon as possible or use an autocommit connection.

In the 2.9 release notes, psycopg2 also made the following change:

> `with connection` starts a transaction on autocommit transactions too

Some of these connections are indeed long-lived, so we were retaining
tons of WAL on the endpoints because we had a transaction pinned in the
past.
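As a rough illustration of the fix (the DSN and query are placeholders, not taken from the test suite), switching long-lived connections to autocommit avoids pinning a transaction:

```python
# Sketch only: an autocommit psycopg2 connection never sits "idle in
# transaction", so it does not pin old WAL on the endpoint.
import psycopg2

conn = psycopg2.connect("dbname=neondb")  # placeholder DSN
conn.autocommit = True  # each execute() commits immediately

with conn.cursor() as cur:
    cur.execute("SELECT 1")
    print(cur.fetchone())

# Caveat from the 2.9 release notes quoted above: `with conn:` starts a
# transaction even on autocommit connections, so long-lived loops should
# avoid wrapping work in `with conn:` blocks.
conn.close()
```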

Link: https://www.psycopg.org/docs/news.html#what-s-new-in-psycopg-2-9
Link: https://github.com/psycopg/psycopg2/issues/941
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-11-12 15:48:19 -06:00
Erik Grinaker
2256a5727a safekeeper: use WAL_SEGMENT_SIZE for empty timeline state (#9734)
## Problem

`TimelinePersistentState::empty()`, used for tests and benchmarks, had a
hardcoded 16 MB WAL segment size. This caused confusion when attempting
to change the global segment size.

## Summary of changes

Inherit from `WAL_SEGMENT_SIZE` in `TimelinePersistentState::empty()`.
2024-11-12 20:35:44 +00:00
Tristan Partin
3f80af8b1d Add neon.logical_replication_max_logicalsnapdir_size
This GUC will drop replication slots if the size of the
pg_logical/snapshots directory (not including temp snapshot files)
becomes larger than the specified size. Keeping this directory small
helps reduce the basebackup size served by the pageserver.
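For illustration, the new GUC could be inspected from a client session like this (a sketch with a placeholder DSN; the 8000 kB default and the -1 "disabled" sentinel come from the GUC definition later in this diff):

```python
# Sketch only: read back the new GUC via SHOW. This assumes the neon
# extension library defining the GUC is loaded in the server.
import psycopg2

with psycopg2.connect("dbname=neondb") as conn:  # placeholder DSN
    with conn.cursor() as cur:
        cur.execute("SHOW neon.logical_replication_max_logicalsnapdir_size")
        print(cur.fetchone()[0])  # default 8000 kB; -1 disables the limit
```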

Part-of: https://github.com/neondatabase/neon/issues/8619
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-11-12 13:13:28 -06:00
Tristan Partin
a61d81bbc7 Calculate compute_backpressure_throttling_seconds correctly
The original value that we get is measured in microseconds. It comes
from a calculation using Postgres' GetCurrentTimestamp(), which is
implemented in terms of gettimeofday(2).
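A tiny sanity check of the unit conversion (the sample value is made up; the old and new divisors are the ones visible in the SQL change below):

```python
# backpressure_throttling_time() returns microseconds, so dividing by
# 1_000_000 (new query) yields seconds; the old query divided by 1_000
# and over-reported throttling by a factor of 1000.
raw_us = 2_500_000  # 2.5 seconds of throttling, expressed in microseconds
assert raw_us / 1_000_000 == 2.5
assert raw_us / 1_000 == 2_500.0
```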

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-11-12 13:12:08 -06:00
Erik Grinaker
05381a48f0 utils: remove unnecessary fsync in durable_rename() (#9686)
## Problem

WAL segment fsyncs significantly affect WAL ingestion throughput.
`durable_rename()` is used when initializing every 16 MB segment, and
issues 3 fsyncs of which 1 was unnecessary.

## Summary of changes

Remove an fsync in `durable_rename` which is unnecessary with Linux and
ext4 (which we currently use). This improves WAL ingestion throughput by
up to 23% with large appends on my MacBook.
2024-11-12 18:57:31 +01:00
Alex Chi Z.
cef165818c test(pageserver): add gc-compaction tests with delta will_init (#9724)
I had the impression that gc-compaction didn't test the case where the
first record of the key history is will_init, because there are some
code paths that would panic in this case. Luckily this got fixed in
https://github.com/neondatabase/neon/pull/9026, so we can now implement
such tests.

Part of https://github.com/neondatabase/neon/issues/9114

## Summary of changes

* Randomly changed some images into will_init Neon WAL records
* Split `test_simple_bottom_most_compaction_deltas` into two test cases:
one of them has the bottom layer as a delta layer with will_init flags,
while the other is the original one with image layers.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-11-12 10:37:31 -05:00
Erik Grinaker
6b19867410 safekeeper: don't flush control file on WAL ingest path (#9698)
## Problem

The control file is flushed on the WAL ingest path when the commit LSN
advances by one segment, to bound the amount of recovery work in case of
a crash. This involves 3 additional fsyncs, which can have a significant
impact on WAL ingest throughput. This is to some extent mitigated by
`AppendResponse` not being emitted on segment bound flushes, since this
will prevent commit LSN advancement, which will be addressed separately.

## Summary of changes

Don't flush the control file on the WAL ingest path at all. Instead,
leave that responsibility to the timeline manager, but ask it to flush
eagerly if the control file lags the in-memory commit LSN by more than
one segment. This should not cause more than `REFRESH_INTERVAL` (300 ms)
additional latency before flushing the control file, which is
negligible.
2024-11-12 15:17:03 +00:00
38 changed files with 1154 additions and 235 deletions

View File

@@ -133,38 +133,38 @@ jobs:
- name: Check vendor/postgres-v14 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v14"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v15 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v15"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v16 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v16"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v17 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v17"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
check-codestyle-rust:

Cargo.lock generated
View File

@@ -1229,12 +1229,15 @@ dependencies = [
"flate2",
"futures",
"hyper 0.14.30",
"metrics",
"nix 0.27.1",
"notify",
"num_cpus",
"once_cell",
"opentelemetry",
"opentelemetry_sdk",
"postgres",
"prometheus",
"regex",
"remote_storage",
"reqwest 0.12.4",

View File

@@ -55,6 +55,7 @@ RUN set -e \
--bin proxy \
--bin neon_local \
--bin storage_scrubber \
--bin stream_events \
--locked --release
# Build final image
@@ -82,6 +83,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_control
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/stream_events /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/

View File

@@ -1 +1 @@
SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;

View File

@@ -1,3 +1,45 @@
commit 00aa659afc9c7336ab81036edec3017168aabf40
Author: Heikki Linnakangas <heikki@neon.tech>
Date: Tue Nov 12 16:59:19 2024 +0200
Temporarily disable test that depends on timezone
diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
(1 row)
-SELECT anon.generalize_tstzrange('19041107','millennium');
- generalize_tstzrange
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
generalize_daterange
diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
SELECT anon.generalize_tstzrange('19041107','year');
SELECT anon.generalize_tstzrange('19041107','decade');
SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri May 31 06:34:26 2024 +0000

View File

@@ -18,9 +18,11 @@ clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper0 = { workspace = true, features = ["full"] }
metrics.workspace = true
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
postgres.workspace = true
@@ -39,6 +41,7 @@ tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
prometheus.workspace = true
compute_api.workspace = true
utils.workspace = true

View File

@@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use crate::installed_extensions;
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
use compute_api::responses::{
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
@@ -19,6 +20,8 @@ use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use metrics::Encoder;
use metrics::TextEncoder;
use tokio::task;
use tracing::{debug, error, info, warn};
use tracing_utils::http::OtelName;
@@ -65,6 +68,28 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
}
// Prometheus metrics
(&Method::GET, "/metrics") => {
debug!("serving /metrics GET request");
let mut buffer = vec![];
let metrics = installed_extensions::collect();
let encoder = TextEncoder::new();
encoder.encode(&metrics, &mut buffer).unwrap();
match Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
{
Ok(response) => response,
Err(err) => {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// Collect Postgres current usage insights
(&Method::GET, "/insights") => {
info!("serving /insights GET request");

View File

@@ -37,6 +37,21 @@ paths:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/metrics:
get:
tags:
- Info
summary: Get compute node metrics in text format.
description: ""
operationId: getComputeMetrics
responses:
200:
description: ComputeMetrics
content:
text/plain:
schema:
type: string
description: Metrics in text format.
/insights:
get:
tags:

View File

@@ -1,4 +1,5 @@
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use metrics::proto::MetricFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use tracing::info;
@@ -8,6 +9,10 @@ use anyhow::Result;
use postgres::{Client, NoTls};
use tokio::task;
use metrics::core::Collector;
use metrics::{register_uint_gauge_vec, UIntGaugeVec};
use once_cell::sync::Lazy;
/// We don't reuse get_existing_dbs() just for code clarity
/// and to make database listing query here more explicit.
///
@@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
for (extname, v) in extensions.iter() {
let version = v.to_string();
// increment the number of databases where the version of extension is installed
INSTALLED_EXTENSIONS
.with_label_values(&[extname, &version])
.inc();
extensions_map
.entry(extname.to_string())
.and_modify(|e| {
@@ -74,9 +85,11 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
}
}
Ok(InstalledExtensions {
let res = InstalledExtensions {
extensions: extensions_map.values().cloned().collect(),
})
};
Ok(res)
})
.await?
}
@@ -97,6 +110,18 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
"[NEON_EXT_STAT] {}",
serde_json::to_string(&result).expect("failed to serialize extensions list")
);
Ok(())
}
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"installed_extensions",
"Number of databases where the version of extension is installed",
&["extension_name", "version"]
)
.expect("failed to define a metric")
});
pub fn collect() -> Vec<MetricFamily> {
INSTALLED_EXTENSIONS.collect()
}

View File

@@ -80,18 +80,18 @@ impl NeonWalRecord {
}
#[cfg(feature = "testing")]
pub fn wal_clear() -> Self {
pub fn wal_clear(s: impl AsRef<str>) -> Self {
Self::Test {
append: "".to_string(),
append: s.as_ref().to_string(),
clear: true,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_init() -> Self {
pub fn wal_init(s: impl AsRef<str>) -> Self {
Self::Test {
append: "".to_string(),
append: s.as_ref().to_string(),
clear: true,
will_init: true,
}

View File

@@ -15,6 +15,9 @@ pub enum DownloadError {
///
/// Concurrency control is not timed within timeout.
Timeout,
/// Some integrity/consistency check failed during download. This is used during
/// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption.
Fatal(String),
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
@@ -29,6 +32,7 @@ impl std::fmt::Display for DownloadError {
DownloadError::Unmodified => write!(f, "File was not modified"),
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::Timeout => write!(f, "timeout"),
DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}
}
@@ -41,7 +45,7 @@ impl DownloadError {
pub fn is_permanent(&self) -> bool {
use DownloadError::*;
match self {
BadInput(_) | NotFound | Unmodified | Cancelled => true,
BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true,
Timeout | Other(_) => false,
}
}

View File

@@ -123,15 +123,27 @@ pub async fn fsync_async_opt(
Ok(())
}
/// Like postgres' durable_rename, renames file issuing fsyncs do make it
/// durable. After return, file and rename are guaranteed to be persisted.
/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After
/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the
/// same file system.
///
/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
/// contents durable; 2) its directory entry to make rename durable 3) again to
/// already renamed file, which is not required by standards but postgres does
/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
/// rename if it exists to ensure that at least one of the files survives, but
/// current callers don't need that.
/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to
/// make the rename durable. This sequence ensures the target file will never be incomplete.
///
/// Postgres also:
///
/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing
/// file survives a crash. Current callers don't need this as it should already be fsynced if
/// durability is needed.
///
/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g.
/// NFS), but not on Linux with most common file systems like ext4 (which we currently use).
///
/// An audit of 8 other databases found that none fsynced the file after a rename:
/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837180535>
///
/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs:
/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837926218>
///
/// virtual_file.rs has similar code, but it doesn't use vfs.
///
@@ -149,9 +161,6 @@ pub async fn durable_rename(
// Time to do the real deal.
tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
// Postgres'ish fsync of renamed file.
fsync_async_opt(new_path.as_ref(), do_fsync).await?;
// Now fsync the parent
let parent = match new_path.as_ref().parent() {
Some(p) => p,

View File

@@ -138,6 +138,11 @@ impl Lsn {
self.0.checked_sub(other).map(Lsn)
}
/// Subtract a number, saturating at numeric bounds instead of overflowing.
pub fn saturating_sub<T: Into<u64>>(self, other: T) -> Lsn {
Lsn(self.0.saturating_sub(other.into()))
}
/// Subtract a number, returning the difference as i128 to avoid overflow.
pub fn widening_sub<T: Into<u64>>(self, other: T) -> i128 {
let other: u64 = other.into();

View File

@@ -1433,6 +1433,12 @@ impl Tenant {
info!(%timeline_id, "index_part not found on remote");
continue;
}
Err(DownloadError::Fatal(why)) => {
// If, while loading one remote timeline, we saw an indication that our generation
// number is likely invalid, then we should not load the whole tenant.
error!(%timeline_id, "Fatal error loading timeline: {why}");
anyhow::bail!(why.to_string());
}
Err(e) => {
// Some (possibly ephemeral) error happened during index_part download.
// Pretend the timeline exists to not delete the timeline directory,
@@ -7757,13 +7763,13 @@ mod tests {
(
get_key(3),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_clear()),
Value::WalRecord(NeonWalRecord::wal_clear("c")),
),
(get_key(4), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(4),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_init()),
Value::WalRecord(NeonWalRecord::wal_init("i")),
),
];
let image1 = vec![(get_key(1), "0x10".into())];
@@ -7912,8 +7918,30 @@ mod tests {
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
test_simple_bottom_most_compaction_deltas_helper(
"test_simple_bottom_most_compaction_deltas_1",
false,
)
.await
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> {
test_simple_bottom_most_compaction_deltas_helper(
"test_simple_bottom_most_compaction_deltas_2",
true,
)
.await
}
#[cfg(feature = "testing")]
async fn test_simple_bottom_most_compaction_deltas_helper(
test_name: &'static str,
use_delta_bottom_layer: bool,
) -> anyhow::Result<()> {
let harness = TenantHarness::create(test_name).await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -7944,6 +7972,16 @@ mod tests {
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
// or, delta layer at 0x10 if `use_delta_bottom_layer` is true
let delta4 = (0..10)
.map(|id| {
(
get_key(id),
Lsn(0x08),
Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))),
)
})
.collect_vec();
let delta1 = vec![
(
@@ -7997,21 +8035,61 @@ mod tests {
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
let tline = if use_delta_bottom_layer {
tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x08),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x08)..Lsn(0x10),
delta4,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x48),
delta1,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x48),
delta2,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x48)..Lsn(0x50),
delta3,
),
], // delta layers
vec![], // image layers
Lsn(0x50),
)
.await?
} else {
tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x48),
delta1,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x48),
delta2,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x48)..Lsn(0x50),
delta3,
),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?
};
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
@@ -8121,7 +8199,7 @@ mod tests {
(
key,
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"0x10")),
Value::WalRecord(NeonWalRecord::wal_init("0x10")),
),
(
key,
@@ -8183,7 +8261,7 @@ mod tests {
Lsn(0x20),
KeyLogAtLsn(vec![(
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
Value::Image(Bytes::from_static(b"0x10;0x20")),
)]),
),
(
@@ -9165,7 +9243,7 @@ mod tests {
let will_init = will_init_keys.contains(&i);
if will_init {
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
expected_key_values.insert(key, "".to_string());
} else {

View File

@@ -574,12 +574,18 @@ impl RemoteTimelineClient {
if latest_index_generation > index_generation {
// Unexpected! Why are we loading such an old index if a more recent one exists?
tracing::warn!(
// We will refuse to proceed, as there is no reasonable scenario where this should happen, but
// there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation
// backwards).
tracing::error!(
?index_generation,
?latest_index_generation,
?latest_index_mtime,
"Found a newer index while loading an old one"
);
return Err(DownloadError::Fatal(
"Index age exceeds threshold and a newer index exists".into(),
));
}
}

View File

@@ -562,7 +562,7 @@ mod tests {
(
get_key(0),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
Value::WalRecord(NeonWalRecord::wal_init("")),
),
(
get_key(0),
@@ -572,7 +572,7 @@ mod tests {
(
get_key(5),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
Value::WalRecord(NeonWalRecord::wal_init("")),
),
(
get_key(5),

View File

@@ -253,6 +253,10 @@ pub(crate) fn apply_in_neon(
use bytes::BufMut;
if *will_init {
assert!(*clear, "init record must be clear to ensure correctness");
assert!(
page.is_empty(),
"init record must be the first entry to ensure correctness"
);
}
if *clear {
page.clear();

View File

@@ -1,7 +1,8 @@
#include <dirent.h>
#include <limits.h>
#include <string.h>
#include <dirent.h>
#include <signal.h>
#include <sys/stat.h>
#include "postgres.h"
@@ -21,17 +22,35 @@
static int logical_replication_max_snap_files = 300;
/*
* According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
* snapshot files. Let's use 8 MB since 8 is a power of 2.
*/
static int logical_replication_max_logicalsnapdir_size = 8000;
/*
* A primitive description of a logical snapshot file including the LSN of the
* file and its size.
*/
typedef struct SnapDesc {
XLogRecPtr lsn;
off_t sz;
} SnapDesc;
PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
/*
* Sorts an array of snapshot descriptors by their LSN.
*/
static int
LsnDescComparator(const void *a, const void *b)
SnapDescComparator(const void *a, const void *b)
{
XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
const SnapDesc *desc1 = a;
const SnapDesc *desc2 = b;
if (lsn1 < lsn2)
if (desc1->lsn < desc2->lsn)
return 1;
else if (lsn1 == lsn2)
else if (desc1->lsn == desc2->lsn)
return 0;
else
return -1;
@@ -43,28 +62,39 @@ LsnDescComparator(const void *a, const void *b)
* slots having lower restart_lsn should be dropped.
*/
static XLogRecPtr
get_num_snap_files_lsn_threshold(void)
get_snapshots_cutoff_lsn(void)
{
DIR *dirdesc;
struct dirent *de;
char *snap_path = "pg_logical/snapshots/";
int lsns_allocated = 1024;
int lsns_num = 0;
XLogRecPtr *lsns;
XLogRecPtr cutoff;
/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */
#define SNAPDIR "pg_logical/snapshots"
if (logical_replication_max_snap_files < 0)
DIR *dirdesc;
int dirdesc_fd;
struct dirent *de;
size_t snapshot_index = 0;
SnapDesc *snapshot_descriptors;
size_t descriptors_allocated = 1024;
XLogRecPtr cutoff = 0;
off_t logicalsnapdir_size = 0;
const int logical_replication_max_logicalsnapdir_size_bytes = logical_replication_max_logicalsnapdir_size * 1000;
if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0)
return 0;
lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated);
dirdesc = AllocateDir(SNAPDIR);
dirdesc_fd = dirfd(dirdesc);
if (dirdesc_fd == -1)
ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m"));
/* find all .snap files and get their lsns */
dirdesc = AllocateDir(snap_path);
while ((de = ReadDir(dirdesc, snap_path)) != NULL)
while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL)
{
XLogRecPtr lsn;
uint32 hi;
uint32 lo;
struct stat st;
XLogRecPtr lsn;
SnapDesc *desc;
if (strcmp(de->d_name, ".") == 0 ||
strcmp(de->d_name, "..") == 0)
@@ -79,28 +109,69 @@ get_num_snap_files_lsn_threshold(void)
lsn = ((uint64) hi) << 32 | lo;
elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
if (lsns_allocated == lsns_num)
if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1)
ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name));
if (descriptors_allocated == snapshot_index)
{
lsns_allocated *= 2;
lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
descriptors_allocated *= 2;
snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated);
}
lsns[lsns_num++] = lsn;
desc = &snapshot_descriptors[snapshot_index++];
desc->lsn = lsn;
desc->sz = st.st_size;
}
/* sort by lsn desc */
qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
/* and take cutoff at logical_replication_max_snap_files */
if (logical_replication_max_snap_files > lsns_num)
cutoff = 0;
/* have less files than cutoff */
else
qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator);
/* Are there more snapshot files than specified? */
if (logical_replication_max_snap_files <= snapshot_index)
{
cutoff = lsns[logical_replication_max_snap_files - 1];
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
elog(LOG,
"ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
}
pfree(lsns);
/* Is the size of the logical snapshots directory larger than specified?
*
* It's possible we could hit both thresholds, so remove any extra files
* first, and then truncate based on size of the remaining files.
*/
if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes)
{
/* Unfortunately, iterating the directory does not guarantee any order
* so we can't cache an index in the preceding loop.
*/
off_t sz;
const XLogRecPtr original = cutoff;
sz = snapshot_descriptors[0].sz;
for (size_t i = 1; i < logical_replication_max_snap_files; ++i)
{
if (sz > logical_replication_max_logicalsnapdir_size_bytes)
{
cutoff = snapshot_descriptors[i - 1].lsn;
break;
}
sz += snapshot_descriptors[i].sz;
}
if (cutoff != original)
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
}
pfree(snapshot_descriptors);
FreeDir(dirdesc);
return cutoff;
#undef SNAPDIR
}
void
@@ -118,6 +189,16 @@ InitLogicalReplicationMonitor(void)
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"neon.logical_replication_max_logicalsnapdir_size",
"Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
NULL,
&logical_replication_max_logicalsnapdir_size,
8000, -1, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_KB,
NULL, NULL, NULL);
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
@@ -162,7 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
* If there are too many .snap files, just drop all logical slots to
* prevent aux files bloat.
*/
cutoff_lsn = get_num_snap_files_lsn_threshold();
cutoff_lsn = get_snapshots_cutoff_lsn();
if (cutoff_lsn > 0)
{
for (int i = 0; i < max_replication_slots; i++)

View File

@@ -0,0 +1,450 @@
use std::sync::Arc;
use anyhow::bail;
use aws_config::environment::EnvironmentVariableCredentialsProvider;
use aws_config::imds::credentials::ImdsCredentialsProvider;
use aws_config::meta::credentials::CredentialsProviderChain;
use aws_config::meta::region::RegionProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use aws_config::Region;
use clap::{Parser, ValueEnum};
use proxy::config::{self, remote_storage_from_toml, ProxyProtocolV2};
use proxy::context::parquet::ParquetUploadArgs;
use proxy::rate_limiter::RateBucketInfo;
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::elasticache;
use redis::streams::{StreamReadOptions, StreamReadReply};
use redis::{AsyncCommands, FromRedisValue, Value};
use remote_storage::RemoteStorageConfig;
use serde::{Deserialize, Serialize};
use tracing::warn;
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackendType {
#[value(name("console"), alias("cplane"))]
ControlPlane,
#[value(name("link"), alias("control-redirect"))]
ConsoleRedirect,
#[cfg(feature = "testing")]
Postgres,
}
/// Neon proxy/router
#[derive(Parser)]
struct ProxyCliArgs {
/// Name of the region this proxy is deployed in
#[clap(long, default_value_t = String::new())]
region: String,
/// listen for incoming client connections on ip:port
#[clap(short, long, default_value = "127.0.0.1:4432")]
proxy: String,
#[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
auth_backend: AuthBackendType,
/// listen for management callback connection on ip:port
#[clap(short, long, default_value = "127.0.0.1:7000")]
mgmt: String,
/// listen for incoming http connections (metrics, etc) on ip:port
#[clap(long, default_value = "127.0.0.1:7001")]
http: String,
/// listen for incoming wss connections on ip:port
#[clap(long)]
wss: Option<String>,
/// redirect unauthenticated users to the given uri in case of console redirect auth
#[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
uri: String,
/// cloud API endpoint for authenticating users
#[clap(
short,
long,
default_value = "http://localhost:3000/authenticate_proxy_request/"
)]
auth_endpoint: String,
/// JWT used to connect to control plane.
#[clap(
long,
value_name = "JWT",
default_value = "",
env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN"
)]
control_plane_token: Arc<str>,
/// if this is not local proxy, this toggles whether we accept jwt or passwords for http
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
is_auth_broker: bool,
/// path to TLS key for client postgres connections
///
/// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
#[clap(short = 'k', long, alias = "ssl-key")]
tls_key: Option<String>,
/// path to TLS cert for client postgres connections
///
/// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
#[clap(short = 'c', long, alias = "ssl-cert")]
tls_cert: Option<String>,
/// path to directory with TLS certificates for client postgres connections
#[clap(long)]
certs_dir: Option<String>,
/// timeout for the TLS handshake
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
handshake_timeout: tokio::time::Duration,
/// http endpoint to receive periodic metric updates
#[clap(long)]
metric_collection_endpoint: Option<String>,
/// how often metrics should be sent to a collection endpoint
#[clap(long)]
metric_collection_interval: Option<String>,
/// cache for `wake_compute` api method (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
wake_compute_cache: String,
/// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
wake_compute_lock: String,
/// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
connect_compute_lock: String,
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
#[clap(flatten)]
sql_over_http: SqlOverHttpArgs,
/// timeout for scram authentication protocol
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
scram_protocol_timeout: tokio::time::Duration,
/// size of the threadpool for password hashing
#[clap(long, default_value_t = 4)]
scram_thread_pool_size: u8,
/// Endpoint rate limiter max number of requests per second.
///
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
/// Wake compute rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
wake_compute_limit: Vec<RateBucketInfo>,
/// Whether the auth rate limiter actually takes effect (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
auth_rate_limit_enabled: bool,
/// Authentication rate limiter max number of hashes per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
auth_rate_limit: Vec<RateBucketInfo>,
/// The IP subnet to use when considering whether two IP addresses are considered the same.
#[clap(long, default_value_t = 64)]
auth_rate_limit_ip_subnet: u8,
/// Redis rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
redis_rps_limit: Vec<RateBucketInfo>,
/// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
allowed_ips_cache: String,
/// cache for `role_secret` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
role_secret_cache: String,
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
#[clap(long)]
redis_notifications: Option<String>,
/// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain".
#[clap(long, default_value = "irsa")]
redis_auth_type: String,
/// redis host for streaming connections (might be different from the notifications host)
#[clap(long)]
redis_host: Option<String>,
/// redis port for streaming connections (might be different from the notifications host)
#[clap(long)]
redis_port: Option<u16>,
/// redis cluster name, used in aws elasticache
#[clap(long)]
redis_cluster_name: Option<String>,
/// redis user_id, used in aws elasticache
#[clap(long)]
redis_user_id: Option<String>,
/// aws region to retrieve credentials
#[clap(long, default_value_t = String::new())]
aws_region: String,
/// cache for `project_info` (use `size=0` to disable)
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
project_info_cache: String,
/// cache for all valid endpoints
#[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
endpoint_cache_config: String,
#[clap(flatten)]
parquet_upload: ParquetUploadArgs,
/// interval for backup metric collection
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
metric_backup_collection_interval: std::time::Duration,
/// remote storage configuration for backup metric collection
/// Encoded as toml (same format as pageservers), eg
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
#[clap(long, value_parser = remote_storage_from_toml)]
metric_backup_collection_remote_storage: Option<RemoteStorageConfig>,
/// chunk size for backup metric collection
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
#[clap(long, default_value = "4194304")]
metric_backup_collection_chunk_size: usize,
/// Whether to retry the connection to the compute node
#[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
connect_to_compute_retry: String,
/// Whether to retry the wake_compute request
#[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
wake_compute_retry: String,
/// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
is_private_access_proxy: bool,
/// Configure whether all incoming requests have a Proxy Protocol V2 packet.
// TODO(conradludgate): switch default to rejected or required once we've updated all deployments
#[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)]
proxy_protocol_v2: ProxyProtocolV2,
/// Time the proxy waits for the webauth session to be confirmed by the control plane.
// TODO: rename to `console_redirect_confirmation_timeout`.
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
webauth_confirmation_timeout: std::time::Duration,
}
#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
/// timeout for http connection requests
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
sql_over_http_timeout: tokio::time::Duration,
/// Whether the SQL over http pool is opt-in
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
sql_over_http_pool_opt_in: bool,
/// How many connections to pool for each endpoint. Excess connections are discarded
#[clap(long, default_value_t = 20)]
sql_over_http_pool_max_conns_per_endpoint: usize,
/// How many connections to pool for each endpoint. Excess connections are discarded
#[clap(long, default_value_t = 20000)]
sql_over_http_pool_max_total_conns: usize,
/// How long pooled connections should remain idle for before closing
#[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
sql_over_http_idle_timeout: tokio::time::Duration,
/// Duration each shard will wait on average before a GC sweep.
/// A longer time will cause sweeps to take longer but will interfere less frequently.
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
sql_over_http_pool_gc_epoch: tokio::time::Duration,
/// How many shards should the global pool have. Must be a power of two.
/// More shards will introduce less contention for pool operations, but can
/// increase memory used by the pool
#[clap(long, default_value_t = 128)]
sql_over_http_pool_shards: usize,
#[clap(long, default_value_t = 10000)]
sql_over_http_client_conn_threshold: u64,
#[clap(long, default_value_t = 64)]
sql_over_http_cancel_set_shards: usize,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_request_size_bytes: u64,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_response_size_bytes: usize,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy::logging::init().await?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let args = ProxyCliArgs::parse();
let region_provider =
RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
let provider_conf =
ProviderConfig::without_region().with_region(region_provider.region().await);
let aws_credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else(
"token",
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
elasticache::AWSIRSAConfig::new(
args.aws_region.clone(),
args.redis_cluster_name,
args.redis_user_id,
),
aws_credentials_provider,
));
let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
("plain", redis_url) => match redis_url {
None => {
bail!("plain auth requires redis_notifications to be set");
}
Some(url) => Some(
ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()),
),
},
("irsa", _) => match (&args.redis_host, args.redis_port) {
(Some(host), Some(port)) => Some(
ConnectionWithCredentialsProvider::new_with_credentials_provider(
host.to_string(),
port,
elasticache_credentials_provider.clone(),
),
),
(None, None) => {
warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client");
None
}
_ => {
bail!("redis-host and redis-port must be specified together");
}
},
_ => {
bail!("unknown auth type given");
}
};
let endpoint_cache_config: config::EndpointCacheConfig = args.endpoint_cache_config.parse()?;
let Some(mut regional_redis_client) = regional_redis_client else {
bail!("no regional_redis_client");
};
if let Err(e) = regional_redis_client.connect().await {
bail!("error connecting to redis: {:?}", e);
}
let mut last_id = "0-0".to_string();
batch_read(
&mut regional_redis_client,
endpoint_cache_config.stream_name,
StreamReadOptions::default().count(endpoint_cache_config.default_batch_size),
&mut last_id,
true,
|event| {
let json = serde_json::to_string(&event)?;
println!("{}", json);
Ok(())
},
)
.await?;
Ok(())
}
// TODO: this could be an enum, but events in Redis need to be fixed first.
// ProjectCreated was sent with type:branch_created. So we ignore type.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct CPlaneEvent {
id: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
endpoint_created: Option<EndpointCreated>,
#[serde(skip_serializing_if = "Option::is_none")]
branch_created: Option<BranchCreated>,
#[serde(skip_serializing_if = "Option::is_none")]
project_created: Option<ProjectCreated>,
#[serde(rename = "type")]
_type: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct EndpointCreated {
endpoint_id: String,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct BranchCreated {
branch_id: String,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct ProjectCreated {
project_id: String,
}
impl TryFrom<&Value> for CPlaneEvent {
type Error = anyhow::Error;
fn try_from(value: &Value) -> Result<Self, Self::Error> {
let json = String::from_redis_value(value)?;
Ok(serde_json::from_str(&json)?)
}
}
async fn batch_read(
conn: &mut ConnectionWithCredentialsProvider,
stream_name: String,
opts: StreamReadOptions,
last_id: &mut String,
return_when_finish: bool,
mut insert_event: impl FnMut(CPlaneEvent) -> anyhow::Result<()>,
) -> anyhow::Result<()> {
let mut total: usize = 0;
loop {
let mut res: StreamReadReply = conn
.xread_options(&[&stream_name], &[last_id.as_str()], &opts)
.await?;
if res.keys.is_empty() {
if return_when_finish {
if total != 0 {
break;
}
anyhow::bail!(
"Redis stream {} is empty, cannot be used to filter endpoints",
stream_name
);
}
// If we are not returning when finish, we should wait for more data.
continue;
}
if res.keys.len() != 1 {
anyhow::bail!("Cannot read from redis stream {}", stream_name);
}
let key = res.keys.pop().expect("Checked length above");
for stream_id in key.ids {
total += 1;
for value in stream_id.map.values() {
match value.try_into() {
Ok::<CPlaneEvent, _>(mut event) => {
event.id = Some(stream_id.id.clone());
insert_event(event)?;
}
Err(err) => {
tracing::error!("error parsing value {value:?}: {err:?}");
}
};
}
if total.is_power_of_two() {
tracing::debug!("endpoints read {}", total);
}
*last_id = stream_id.id;
}
}
tracing::info!("read {} endpoints/branches/projects from redis", total);
Ok(())
}

View File

@@ -80,7 +80,7 @@ impl ConnectionWithCredentialsProvider {
redis::cmd("PING").query_async(con).await
}
pub(crate) async fn connect(&mut self) -> anyhow::Result<()> {
pub async fn connect(&mut self) -> anyhow::Result<()> {
let _guard = self.mutex.lock().await;
if let Some(con) = self.con.as_mut() {
match Self::ping(con).await {

View File

@@ -979,7 +979,8 @@ where
self.wal_store.flush_wal().await?;
}
// Update commit_lsn.
// Update commit_lsn. It will be flushed to the control file regularly by the timeline
// manager, off of the WAL ingest hot path.
if msg.h.commit_lsn != Lsn(0) {
self.update_commit_lsn(msg.h.commit_lsn).await?;
}
@@ -992,15 +993,6 @@ where
self.state.inmem.peer_horizon_lsn =
max(self.state.inmem.peer_horizon_lsn, msg.h.truncate_lsn);
// Update truncate and commit LSN in control file.
// To avoid negative impact on performance of extra fsync, do it only
// when commit_lsn delta exceeds WAL segment size.
if self.state.commit_lsn + (self.state.server.wal_seg_size as u64)
< self.state.inmem.commit_lsn
{
self.state.flush().await?;
}
trace!(
"processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
msg.wal_data.len(),

View File

@@ -4,6 +4,7 @@
use std::{cmp::max, ops::Deref};
use anyhow::{bail, Result};
use postgres_ffi::WAL_SEGMENT_SIZE;
use safekeeper_api::models::TimelineTermBumpResponse;
use serde::{Deserialize, Serialize};
use utils::{
@@ -144,7 +145,7 @@ impl TimelinePersistentState {
ServerInfo {
pg_version: 170000, /* Postgres server version (major * 10000) */
system_id: 0, /* Postgres system identifier */
wal_seg_size: 16 * 1024 * 1024,
wal_seg_size: WAL_SEGMENT_SIZE as u32,
},
vec![],
Lsn::INVALID,

View File

@@ -515,7 +515,12 @@ impl Manager {
return;
}
if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval {
if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval
// If the control file's commit_lsn lags more than one segment behind the current
// commit_lsn, flush immediately to limit recovery time in case of a crash. We don't do
// this on the WAL ingest hot path since it incurs fsync latency.
|| state.commit_lsn.saturating_sub(state.cfile_commit_lsn).0 >= self.wal_seg_size as u64
{
let mut write_guard = self.tli.write_shared_state().await;
// it should be done in the background because it blocks manager task, but flush() should
// be fast enough not to be a problem now

View File

@@ -127,23 +127,29 @@ pub struct PhysicalStorage {
/// - doesn't point to the end of the segment
file: Option<File>,
/// When false, we have just initialized storage using the LSN from find_end_of_wal().
/// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
/// there can be a case with unexpected .partial file.
/// When true, WAL truncation potentially has been interrupted and we need
/// to finish it before allowing WAL writes; see truncate_wal for details.
/// In this case [`write_lsn`] can be less than actually written WAL on
/// disk. In particular, there can be a case with unexpected .partial file.
///
/// Imagine the following:
/// - 000000010000000000000001
/// - it was fully written, but the last record is split between 2 segments
/// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in the end of this segment
/// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were initialized to 0/1FFFFF0
/// - it was fully written, but the last record is split between 2
/// segments
/// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in
/// the end of this segment
/// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were
/// initialized to 0/1FFFFF0
/// - 000000010000000000000002.partial
/// - it has only 1 byte written, which is not enough to make a full WAL record
/// - it has only 1 byte written, which is not enough to make a full WAL
/// record
///
/// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
/// This flag will be set to true after the first truncate_wal() call.
/// Partial segment 002 has no WAL records, and it will be removed by the
/// next truncate_wal(). This flag will be set to true after the first
/// truncate_wal() call.
///
/// [`write_lsn`]: Self::write_lsn
is_truncated_after_restart: bool,
pending_wal_truncation: bool,
}
impl PhysicalStorage {
@@ -208,7 +214,7 @@ impl PhysicalStorage {
flush_record_lsn: flush_lsn,
decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
file: None,
is_truncated_after_restart: false,
pending_wal_truncation: true,
})
}
@@ -405,6 +411,13 @@ impl Storage for PhysicalStorage {
startpos
);
}
if self.pending_wal_truncation {
bail!(
"write_wal called with pending WAL truncation, write_lsn={}, startpos={}",
self.write_lsn,
startpos
);
}
let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
// WAL is written, updating write metrics
@@ -479,15 +492,34 @@ impl Storage for PhysicalStorage {
);
}
// Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
// disk (this happens on each connect).
if self.is_truncated_after_restart
// Quick exit if nothing to do and we know that the state is clean to
// avoid writing up to 16 MiB of zeros on disk (this happens on each
// connect).
if !self.pending_wal_truncation
&& end_pos == self.write_lsn
&& end_pos == self.flush_record_lsn
{
return Ok(());
}
// Atomicity: we start with LSNs reset because once on-disk deletion has
// started it can't be reversed. However, we might crash or error in the
// middle, leaving garbage above the truncation point. In theory,
// concatenated with previous records it might form bogus WAL (though
// very unlikely in practice because the CRC would guard against that).
// To protect against this, set the pending_wal_truncation flag before
// beginning: it means truncation must be retried and WAL writes are
// prohibited until it succeeds. The flag is also set on boot because we
// don't know if the last state was clean.
//
// The protocol (HandleElected before the first AppendRequest) ensures we
// always attempt a clean truncation before any writes.
self.pending_wal_truncation = true;
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;
// Close previously opened file, if any
if let Some(unflushed_file) = self.file.take() {
self.fdatasync_file(&unflushed_file).await?;
@@ -513,11 +545,7 @@ impl Storage for PhysicalStorage {
fs::rename(wal_file_path, wal_file_partial_path).await?;
}
// Update LSNs
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;
self.is_truncated_after_restart = true;
self.pending_wal_truncation = false;
Ok(())
}
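Taken together, the hunks above establish a simple invariant: the flag goes up before any on-disk truncation work (and on boot), and only comes down after truncation has fully succeeded; while it is up, write_wal refuses to append. An illustrative Python sketch of that lifecycle (not the safekeeper's actual API):

class WalStoreSketch:
    def __init__(self, end_lsn):
        # Raised on boot: we don't know whether the last truncation finished cleanly.
        self.pending_wal_truncation = True
        self.write_lsn = self.write_record_lsn = self.flush_record_lsn = end_lsn

    def write_wal(self, startpos, buf):
        if self.pending_wal_truncation:
            raise RuntimeError("write_wal called with pending WAL truncation")
        # ... append and fsync as usual ...

    def truncate_wal(self, end_pos):
        # Raise the flag and reset LSNs *before* touching disk: deletion cannot be
        # rolled back, so until it completes, writes are refused and the truncation
        # must be retried.
        self.pending_wal_truncation = True
        self.write_lsn = self.write_record_lsn = self.flush_record_lsn = end_pos
        # ... remove/rename segments above end_pos, fsync the directory ...
        self.pending_wal_truncation = False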

View File

@@ -46,3 +46,8 @@ class EndpointHttpClient(requests.Session):
)
res.raise_for_status()
return res.json()
def metrics(self) -> str:
res = self.get(f"http://localhost:{self.port}/metrics")
res.raise_for_status()
return res.text
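A minimal usage sketch for the new helper, assuming an Endpoint fixture exposing http_client() and reusing the parse_metrics helper that test_installed_extensions below relies on:

from fixtures.metrics import parse_metrics

raw = endpoint.http_client().metrics()   # Prometheus text exposition format
neon_12 = parse_metrics(raw).query_all(
    "installed_extensions", {"extension_name": "neon", "version": "1.2"}
)
assert len(neon_12) == 1 and neon_12[0].value >= 1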

View File

@@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, cast, final
import requests
from fixtures.log_helper import log
if TYPE_CHECKING:
from typing import Any, Literal, Optional
@@ -30,7 +32,11 @@ class NeonAPI:
kwargs["headers"] = {}
kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text)
resp.raise_for_status()
return resp
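With logging and raise_for_status() centralized in the shared request wrapper, the per-endpoint `assert resp.status_code == ...` checks below become redundant, which is why they are removed. The behavioral consequence for callers, sketched as a hypothetical test (the `neon_api` fixture name is an assumption):

import pytest
import requests

def test_missing_project_raises(neon_api):   # fixture name assumed for illustration
    # A non-2xx response now surfaces as requests.HTTPError at the call site,
    # instead of tripping a status-code assert inside the fixture method.
    with pytest.raises(requests.HTTPError):
        neon_api.get_project_details("this-project-does-not-exist")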
def create_project(
self,
@@ -66,8 +72,6 @@ class NeonAPI:
json=data,
)
assert resp.status_code == 201
return cast("dict[str, Any]", resp.json())
def get_project_details(self, project_id: str) -> dict[str, Any]:
@@ -79,7 +83,7 @@ class NeonAPI:
"Content-Type": "application/json",
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def delete_project(
@@ -95,8 +99,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def start_endpoint(
@@ -112,8 +114,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def suspend_endpoint(
@@ -129,8 +129,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def restart_endpoint(
@@ -146,8 +144,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def create_endpoint(
@@ -178,8 +174,6 @@ class NeonAPI:
json=data,
)
assert resp.status_code == 201
return cast("dict[str, Any]", resp.json())
def get_connection_uri(
@@ -206,8 +200,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def get_branches(self, project_id: str) -> dict[str, Any]:
@@ -219,8 +211,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def get_endpoints(self, project_id: str) -> dict[str, Any]:
@@ -232,8 +222,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def get_operations(self, project_id: str) -> dict[str, Any]:
@@ -246,8 +234,6 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def wait_for_operation_to_finish(self, project_id: str):

View File

@@ -1065,6 +1065,9 @@ class NeonEnv:
"http_auth_type": http_auth_type,
# Default which can be overridden with `NeonEnvBuilder.pageserver_config_override`
"availability_zone": "us-east-2a",
# Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
# the pageserver taking a long time to start up due to syncfs flushing other tests' data
"no_sync": True,
}
if self.pageserver_virtual_file_io_engine is not None:
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine

View File

@@ -149,12 +149,16 @@ def test_subscriber_lag(
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
with (
psycopg2.connect(pub_connstr) as pub_conn,
psycopg2.connect(sub_connstr) as sub_conn,
):
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
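The rewrites in this file and the ones below replace `with psycopg2.connect(...)` blocks with explicit connections because psycopg2's connection context manager only wraps a transaction (committing or rolling back on exit); it neither closes the connection nor enables autocommit, and some statements issued in these benchmarks (for example ALTER SYSTEM further down) refuse to run inside a transaction block. A standalone sketch of the explicit pattern (the try/finally is an addition here for illustration):

import psycopg2

def run_with_autocommit(connstr: str) -> None:
    conn = psycopg2.connect(connstr)
    conn.autocommit = True          # needed for statements that cannot run in a transaction block
    try:
        with conn.cursor() as cur:  # cursor context managers are still fine
            cur.execute("SELECT 1")
    finally:
        conn.close()                # the connection must be closed explicitly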
@@ -206,6 +210,7 @@ def test_publisher_restart(
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
pub_exists = len(pub_cur.fetchall()) != 0
@@ -222,6 +227,7 @@ def test_publisher_restart(
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
@@ -248,12 +254,17 @@ def test_publisher_restart(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with (
psycopg2.connect(pub_connstr) as pub_conn,
psycopg2.connect(sub_connstr) as sub_conn,
):
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
@@ -288,58 +299,56 @@ def test_snap_files(
env = benchmark_project_pub.pgbench_env
connstr = benchmark_project_pub.connstr
with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cast("bool", cur.fetchall()[0][0])
assert is_super, "This benchmark won't work if we don't have superuser"
conn = psycopg2.connect(connstr)
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cast("bool", cur.fetchall()[0][0])
assert is_super, "This benchmark won't work if we don't have superuser"
conn.close()
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)
conn = psycopg2.connect(connstr)
conn.autocommit = True
cur = conn.cursor()
cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")
with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT pg_reload_conf()")
with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute(
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = 'slotter'
) THEN
PERFORM pg_drop_replication_slot('slotter');
END IF;
END $$;
with conn.cursor() as cur:
cur.execute(
"""
)
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = 'slotter'
) THEN
PERFORM pg_drop_replication_slot('slotter');
END IF;
END $$;
"""
)
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
conn.close()
workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
try:
start = time.time()
prev_measurement = time.time()
while time.time() - start < test_duration_min * 60:
with psycopg2.connect(connstr) as conn:
with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute(
"SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
)
conn = psycopg2.connect(connstr)
conn.autocommit = True
with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())")
conn.close()
# Measure storage
if time.time() - prev_measurement > test_interval_min * 60:

View File

@@ -102,15 +102,21 @@ def test_ro_replica_lag(
check_pgbench_still_running(master_workload)
check_pgbench_still_running(replica_workload)
time.sleep(sync_interval_min * 60)
conn_master = psycopg2.connect(master_connstr)
conn_replica = psycopg2.connect(replica_connstr)
conn_master.autocommit = True
conn_replica.autocommit = True
with (
psycopg2.connect(master_connstr) as conn_master,
psycopg2.connect(replica_connstr) as conn_replica,
conn_master.cursor() as cur_master,
conn_replica.cursor() as cur_replica,
):
with (
conn_master.cursor() as cur_master,
conn_replica.cursor() as cur_replica,
):
lag = measure_replication_lag(cur_master, cur_replica)
lag = measure_replication_lag(cur_master, cur_replica)
conn_master.close()
conn_replica.close()
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
finally:
@@ -219,11 +225,15 @@ def test_replication_start_stop(
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)
# Sync replicas
with psycopg2.connect(master_connstr) as conn_master:
with conn_master.cursor() as cur_master:
for i in range(num_replicas):
conn_replica = psycopg2.connect(replica_connstr[i])
measure_replication_lag(cur_master, conn_replica.cursor())
conn_master = psycopg2.connect(master_connstr)
conn_master.autocommit = True
with conn_master.cursor() as cur_master:
for i in range(num_replicas):
conn_replica = psycopg2.connect(replica_connstr[i])
measure_replication_lag(cur_master, conn_replica.cursor())
conn_master.close()
master_pgbench = pg_bin.run_nonblocking(
[
@@ -277,17 +287,22 @@ def test_replication_start_stop(
time.sleep(configuration_test_time_sec)
with psycopg2.connect(master_connstr) as conn_master:
with conn_master.cursor() as cur_master:
for ireplica in range(num_replicas):
replica_conn = psycopg2.connect(replica_connstr[ireplica])
lag = measure_replication_lag(cur_master, replica_conn.cursor())
zenbenchmark.record(
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
)
log.info(
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
)
conn_master = psycopg2.connect(master_connstr)
conn_master.autocommit = True
with conn_master.cursor() as cur_master:
for ireplica in range(num_replicas):
replica_conn = psycopg2.connect(replica_connstr[ireplica])
lag = measure_replication_lag(cur_master, replica_conn.cursor())
zenbenchmark.record(
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
)
log.info(
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
)
conn_master.close()
master_pgbench.terminate()
except Exception as e:
error_occurred = True

View File

@@ -1,6 +1,14 @@
from logging import info
from __future__ import annotations
from fixtures.neon_fixtures import NeonEnv
import time
from logging import info
from typing import TYPE_CHECKING
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
def test_installed_extensions(neon_simple_env: NeonEnv):
@@ -85,3 +93,52 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
assert ext["n_databases"] == 2
ext["versions"].sort()
assert ext["versions"] == ["1.2", "1.3"]
# check that /metrics endpoint is available
# ensure that we see the metric before and after restart
res = client.metrics()
info("Metrics: %s", res)
m = parse_metrics(res)
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 2
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 1
endpoint.stop()
endpoint.start()
timeout = 10
while timeout > 0:
try:
res = client.metrics()
timeout = -1
if len(parse_metrics(res).query_all("installed_extensions")) < 4:
# Assume that not all metrics are collected yet
time.sleep(1)
timeout -= 1
continue
except Exception:
log.exception("failed to get metrics, assume they are not collected yet")
time.sleep(1)
timeout -= 1
continue
assert (
len(parse_metrics(res).query_all("installed_extensions")) >= 4
), "Not all metrics are collected"
info("After restart metrics: %s", res)
m = parse_metrics(res)
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 1
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 1
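The loop after the restart is polling /metrics until the installed_extensions samples have been re-collected. A compact sketch of the same idea, reusing the test's client and parse_metrics and the wait_until helper from fixtures.utils (helper signature assumed; the real test keeps its hand-rolled loop above):

from fixtures.utils import wait_until

def extensions_metrics_collected():
    samples = parse_metrics(client.metrics()).query_all("installed_extensions")
    assert len(samples) >= 4, "Not all metrics are collected yet"

wait_until(10, 1, extensions_metrics_collected)   # ~10 attempts, 1 second apart (signature assumed)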

View File

@@ -35,9 +35,10 @@ from fixtures.pageserver.utils import (
wait_for_upload,
)
from fixtures.remote_storage import (
LocalFsStorage,
RemoteStorageKind,
)
from fixtures.utils import wait_until
from fixtures.utils import run_only_on_default_postgres, wait_until
from fixtures.workload import Workload
if TYPE_CHECKING:
@@ -728,3 +729,68 @@ def test_upgrade_generationless_local_file_paths(
)
# We should download into the same local path we started with
assert os.path.exists(victim_path)
@run_only_on_default_postgres("Only tests index logic")
def test_old_index_time_threshold(
neon_env_builder: NeonEnvBuilder,
):
"""
Exercise pageserver's detection of trying to load an ancient non-latest index.
(see https://github.com/neondatabase/neon/issues/6951)
"""
# Run with local_fs because we will interfere with mtimes by local filesystem access
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(32)
# Remember generation 1's index path
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
index_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id)
# Increment generation by detaching+attaching, and write+flush some data to get a new remote index
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}})
env.storage_controller.reconcile_until_idle()
workload.churn_rows(32)
# A new index should have been written
assert env.pageserver_remote_storage.index_path(tenant_id, timeline_id) != index_path
# Hack the mtime on the generation 1 index
log.info(f"Setting old mtime on {index_path}")
os.utime(index_path, times=(time.time(), time.time() - 30 * 24 * 3600))
env.pageserver.allowed_errors.extend(
[
".*Found a newer index while loading an old one.*",
".*Index age exceeds threshold and a newer index exists.*",
]
)
# Detach from storage controller + attach in an old generation directly on the pageserver.
workload.stop()
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
env.storage_controller.reconcile_until_idle()
env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy")
# The controller would not do this (attach in an old generation): we are doing it to simulate
# a hypothetical profound bug in the controller.
env.pageserver.http_client().tenant_location_conf(
tenant_id, {"generation": 1, "mode": "AttachedSingle", "tenant_conf": {}}
)
# The pageserver should react to this situation by refusing to attach the tenant and putting
# it into Broken state
env.pageserver.allowed_errors.append(".*tenant is broken.*")
with pytest.raises(
PageserverApiException,
match="tenant is broken: Index age exceeds threshold and a newer index exists",
):
env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)
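Conceptually, the detection this test exercises compares the age of the index the pageserver was asked to load against a threshold and checks whether a newer index exists; if both hold, the attach is refused and the tenant is put into the Broken state instead of being loaded. An illustrative sketch with assumed names and threshold value (not the pageserver's actual code):

from datetime import datetime, timedelta

INDEX_AGE_THRESHOLD = timedelta(days=14)   # assumed value, for illustration only

def should_refuse_old_index(old_index_mtime: float, newer_index_exists: bool) -> bool:
    # Refuse to attach when asked to load a suspiciously old index in an old
    # generation while a newer index is visible in remote storage.
    age = datetime.now() - datetime.fromtimestamp(old_index_mtime)
    return newer_index_exists and age > INDEX_AGE_THRESHOLD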

View File

@@ -53,7 +53,7 @@ def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg):
"""Test that AUX files are not saved at replica"""
env = neon_simple_env
n_records = 200000
n_records = 20000
primary = env.endpoints.create_start(
branch_name="main",
@@ -91,7 +91,7 @@ def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg):
s_cur.execute("select count(*) from t")
assert s_cur.fetchall()[0][0] == n_records
primary.stop()
time.sleep(1)
vanilla_pg.stop()
secondary.stop()
primary.stop()
assert not secondary.log_contains("cannot make new WAL entries during recovery")

View File

@@ -427,7 +427,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
env.pageserver.start()
for f in futs:
f.result(timeout=10)
f.result(timeout=30)
# The tenant should end up active
wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)

vendor/revisions.json
View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.0",
"ae4cc30dba24f3910533e5a48e8103c3f2fff300"
"17.1",
"aa2e29f2b6952140dfe51876bbd11054acae776f"
],
"v16": [
"16.4",
"03b43900edc5d8d6eecec460bfc89aec7174bd84"
"16.5",
"b0b693ea298454e95e6b154780d1fd586a244dfd"
],
"v15": [
"15.8",
"fd631a959049dfe2b82f67409c8b8b0d3e0016d1"
"15.9",
"1feff6b60f07cb71b665d0f5ead71a4320a71743"
],
"v14": [
"14.13",
"de0a000dafc2e66ce2e39282d3aa1c704fe0390e"
"14.14",
"c5e0d642efb02e4bfedc283b0a7707fe6c79cc89"
]
}