safekeeper: batch AppendRequest writes

tests: Increase timeout in test_create_churn_during_restart (#9736)
This test was seen to be flaky, e.g. at: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9457/11804246485/index.html#suites/ec4311502db344eee91f1354e9dc839b/982bd121ea698414/. If I _reduce_ the timeout from 10s to 8s on my laptop, it reliably hits that timeout and fails. That suggests that the test is pretty close to the edge even when it passes. Let's bump up the timeout to 30 s to make it more robust. See also https://github.com/neondatabase/neon/issues/9730, although the error message is different there.
2026-06-03 13:30:38 +00:00 · 2024-11-13 15:09:57 +01:00 · 2024-11-13 12:20:32 +02:00 · 2024-11-13 09:36:48 +00:00 · 2024-11-13 09:17:26 +00:00 · 2024-11-13 08:50:01 +02:00
29 changed files with 717 additions and 188 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -994,9 +994,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"

 [[package]]
 name = "bytes"
-version = "1.5.0"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
+checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da"
 dependencies = [
 "serde",
 ]
@@ -1229,12 +1229,15 @@ dependencies = [
 "flate2",
 "futures",
 "hyper 0.14.30",
+ "metrics",
 "nix 0.27.1",
 "notify",
 "num_cpus",
+ "once_cell",
 "opentelemetry",
 "opentelemetry_sdk",
 "postgres",
+ "prometheus",
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql
@@ -1 +1 @@
-SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;
+SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -18,9 +18,11 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper0 = { workspace = true, features = ["full"] }
+metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
+once_cell.workspace = true
 opentelemetry.workspace = true
 opentelemetry_sdk.workspace = true
 postgres.workspace = true
@@ -39,6 +41,7 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 thiserror.workspace = true
 url.workspace = true
+prometheus.workspace = true

 compute_api.workspace = true
 utils.workspace = true
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError;
 use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
+use crate::installed_extensions;
 use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
 use compute_api::responses::{
    ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
@@ -19,6 +20,8 @@ use anyhow::Result;
 use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use metrics::Encoder;
+use metrics::TextEncoder;
 use tokio::task;
 use tracing::{debug, error, info, warn};
 use tracing_utils::http::OtelName;
@@ -65,6 +68,28 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
        }

+        // Prometheus metrics
+        (&Method::GET, "/metrics") => {
+            debug!("serving /metrics GET request");
+
+            let mut buffer = vec![];
+            let metrics = installed_extensions::collect();
+            let encoder = TextEncoder::new();
+            encoder.encode(&metrics, &mut buffer).unwrap();
+
+            match Response::builder()
+                .status(StatusCode::OK)
+                .header(CONTENT_TYPE, encoder.format_type())
+                .body(Body::from(buffer))
+            {
+                Ok(response) => response,
+                Err(err) => {
+                    let msg = format!("error handling /metrics request: {err}");
+                    error!(msg);
+                    render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
        // Collect Postgres current usage insights
        (&Method::GET, "/insights") => {
            info!("serving /insights GET request");
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -37,6 +37,21 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeMetrics"

+  /metrics:
+    get:
+      tags:
+      - Info
+      summary: Get compute node metrics in text format.
+      description: ""
+      operationId: getComputeMetrics
+      responses:
+        200:
+          description: ComputeMetrics
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Metrics in text format.
  /insights:
    get:
      tags:
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,4 +1,5 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
+use metrics::proto::MetricFamily;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use tracing::info;
@@ -8,6 +9,10 @@ use anyhow::Result;
 use postgres::{Client, NoTls};
 use tokio::task;

+use metrics::core::Collector;
+use metrics::{register_uint_gauge_vec, UIntGaugeVec};
+use once_cell::sync::Lazy;
+
 /// We don't reuse get_existing_dbs() just for code clarity
 /// and to make database listing query here more explicit.
 ///
@@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension

            for (extname, v) in extensions.iter() {
                let version = v.to_string();
+
+                // increment the number of databases where the version of extension is installed
+                INSTALLED_EXTENSIONS
+                    .with_label_values(&[extname, &version])
+                    .inc();
+
                extensions_map
                    .entry(extname.to_string())
                    .and_modify(|e| {
@@ -74,9 +85,11 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
            }
        }

-        Ok(InstalledExtensions {
+        let res = InstalledExtensions {
            extensions: extensions_map.values().cloned().collect(),
-        })
+        };
+
+        Ok(res)
    })
    .await?
 }
@@ -97,6 +110,18 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
        "[NEON_EXT_STAT] {}",
        serde_json::to_string(&result).expect("failed to serialize extensions list")
    );
-
    Ok(())
 }
+
+static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "installed_extensions",
+        "Number of databases where the version of extension is installed",
+        &["extension_name", "version"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub fn collect() -> Vec<MetricFamily> {
+    INSTALLED_EXTENSIONS.collect()
+}
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -80,18 +80,18 @@ impl NeonWalRecord {
    }

    #[cfg(feature = "testing")]
-    pub fn wal_clear() -> Self {
+    pub fn wal_clear(s: impl AsRef<str>) -> Self {
        Self::Test {
-            append: "".to_string(),
+            append: s.as_ref().to_string(),
            clear: true,
            will_init: false,
        }
    }

    #[cfg(feature = "testing")]
-    pub fn wal_init() -> Self {
+    pub fn wal_init(s: impl AsRef<str>) -> Self {
        Self::Test {
-            append: "".to_string(),
+            append: s.as_ref().to_string(),
            clear: true,
            will_init: true,
        }
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -123,15 +123,27 @@ pub async fn fsync_async_opt(
    Ok(())
 }

-/// Like postgres' durable_rename, renames file issuing fsyncs do make it
-/// durable. After return, file and rename are guaranteed to be persisted.
+/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After
+/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the
+/// same file system.
 ///
-/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
-/// contents durable; 2) its directory entry to make rename durable 3) again to
-/// already renamed file, which is not required by standards but postgres does
-/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
-/// rename if it exists to ensure that at least one of the files survives, but
-/// current callers don't need that.
+/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to
+/// make the rename durable. This sequence ensures the target file will never be incomplete.
+///
+/// Postgres also:
+///
+/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing
+///   file survives a crash. Current callers don't need this as it should already be fsynced if
+///   durability is needed.
+///
+/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g.
+///   NFS), but not on Linux with most common file systems like ext4 (which we currently use).
+///
+/// An audit of 8 other databases found that none fsynced the file after a rename:
+/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837180535>
+///
+/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs:
+/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837926218>
 ///
 /// virtual_file.rs has similar code, but it doesn't use vfs.
 ///
@@ -149,9 +161,6 @@ pub async fn durable_rename(
    // Time to do the real deal.
    tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;

-    // Postgres'ish fsync of renamed file.
-    fsync_async_opt(new_path.as_ref(), do_fsync).await?;
-
    // Now fsync the parent
    let parent = match new_path.as_ref().parent() {
        Some(p) => p,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -7757,13 +7757,13 @@ mod tests {
            (
                get_key(3),
                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_clear()),
+                Value::WalRecord(NeonWalRecord::wal_clear("c")),
            ),
            (get_key(4), Lsn(0x10), Value::Image("0x10".into())),
            (
                get_key(4),
                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_init()),
+                Value::WalRecord(NeonWalRecord::wal_init("i")),
            ),
        ];
        let image1 = vec![(get_key(1), "0x10".into())];
@@ -7912,8 +7912,30 @@ mod tests {

    #[cfg(feature = "testing")]
    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
+    async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
+        test_simple_bottom_most_compaction_deltas_helper(
+            "test_simple_bottom_most_compaction_deltas_1",
+            false,
+        )
+        .await
+    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> {
+        test_simple_bottom_most_compaction_deltas_helper(
+            "test_simple_bottom_most_compaction_deltas_2",
+            true,
+        )
+        .await
+    }
+
+    #[cfg(feature = "testing")]
+    async fn test_simple_bottom_most_compaction_deltas_helper(
+        test_name: &'static str,
+        use_delta_bottom_layer: bool,
+    ) -> anyhow::Result<()> {
+        let harness = TenantHarness::create(test_name).await?;
        let (tenant, ctx) = harness.load().await;

        fn get_key(id: u32) -> Key {
@@ -7944,6 +7966,16 @@ mod tests {
        let img_layer = (0..10)
            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
            .collect_vec();
+        // or, a delta layer covering 0x08..0x10 if `use_delta_bottom_layer` is true
+        let delta4 = (0..10)
+            .map(|id| {
+                (
+                    get_key(id),
+                    Lsn(0x08),
+                    Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))),
+                )
+            })
+            .collect_vec();

        let delta1 = vec![
            (
@@ -7997,21 +8029,61 @@ mod tests {
            ),
        ];

-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![(Lsn(0x10), img_layer)], // image layers
-                Lsn(0x50),
-            )
-            .await?;
+        let tline = if use_delta_bottom_layer {
+            tenant
+                .create_test_timeline_with_layers(
+                    TIMELINE_ID,
+                    Lsn(0x08),
+                    DEFAULT_PG_VERSION,
+                    &ctx,
+                    vec![
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x08)..Lsn(0x10),
+                            delta4,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x20)..Lsn(0x48),
+                            delta1,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x20)..Lsn(0x48),
+                            delta2,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x48)..Lsn(0x50),
+                            delta3,
+                        ),
+                    ], // delta layers
+                    vec![], // image layers
+                    Lsn(0x50),
+                )
+                .await?
+        } else {
+            tenant
+                .create_test_timeline_with_layers(
+                    TIMELINE_ID,
+                    Lsn(0x10),
+                    DEFAULT_PG_VERSION,
+                    &ctx,
+                    vec![
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x10)..Lsn(0x48),
+                            delta1,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x10)..Lsn(0x48),
+                            delta2,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x48)..Lsn(0x50),
+                            delta3,
+                        ),
+                    ], // delta layers
+                    vec![(Lsn(0x10), img_layer)], // image layers
+                    Lsn(0x50),
+                )
+                .await?
+        };
        {
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
@@ -8121,7 +8193,7 @@ mod tests {
            (
                key,
                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"0x10")),
+                Value::WalRecord(NeonWalRecord::wal_init("0x10")),
            ),
            (
                key,
@@ -8183,7 +8255,7 @@ mod tests {
                    Lsn(0x20),
                    KeyLogAtLsn(vec![(
                        Lsn(0x20),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
+                        Value::Image(Bytes::from_static(b"0x10;0x20")),
                    )]),
                ),
                (
@@ -9165,7 +9237,7 @@ mod tests {

            let will_init = will_init_keys.contains(&i);
            if will_init {
-                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
+                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));

                expected_key_values.insert(key, "".to_string());
            } else {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -562,7 +562,7 @@ mod tests {
            (
                get_key(0),
                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init()),
+                Value::WalRecord(NeonWalRecord::wal_init("")),
            ),
            (
                get_key(0),
@@ -572,7 +572,7 @@ mod tests {
            (
                get_key(5),
                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init()),
+                Value::WalRecord(NeonWalRecord::wal_init("")),
            ),
            (
                get_key(5),
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -253,6 +253,10 @@ pub(crate) fn apply_in_neon(
            use bytes::BufMut;
            if *will_init {
                assert!(*clear, "init record must be clear to ensure correctness");
+                assert!(
+                    page.is_empty(),
+                    "init record must be the first entry to ensure correctness"
+                );
            }
            if *clear {
                page.clear();
--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -1,7 +1,8 @@
+#include <dirent.h>
 #include <limits.h>
 #include <string.h>
-#include <dirent.h>
 #include <signal.h>
+#include <sys/stat.h>

 #include "postgres.h"

@@ -21,17 +22,35 @@

 static int	logical_replication_max_snap_files = 300;

+/*
+ * According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
+ * snapshot files. Let's use 8 MB since 8 is a power of 2. The value is in KB.
+ */
+static int	logical_replication_max_logicalsnapdir_size = 8000;
+
+/*
+ * A primitive description of a logical snapshot file including the LSN of the
+ * file and its size.
+ */
+typedef struct SnapDesc {
+	XLogRecPtr	lsn;
+	off_t		sz;
+} SnapDesc;
+
 PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);

+/*
+ * qsort() comparator ordering snapshot descriptors by descending LSN.
+ */
 static int
-LsnDescComparator(const void *a, const void *b)
+SnapDescComparator(const void *a, const void *b)
 {
-	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
-	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
+	const SnapDesc	*desc1 = a;
+	const SnapDesc	*desc2 = b;

-	if (lsn1 < lsn2)
+	if (desc1->lsn < desc2->lsn)
 		return 1;
-	else if (lsn1 == lsn2)
+	else if (desc1->lsn == desc2->lsn)
 		return 0;
 	else
 		return -1;
@@ -43,28 +62,39 @@ LsnDescComparator(const void *a, const void *b)
 * slots having lower restart_lsn should be dropped.
 */
 static XLogRecPtr
-get_num_snap_files_lsn_threshold(void)
+get_snapshots_cutoff_lsn(void)
 {
-	DIR		   *dirdesc;
-	struct dirent *de;
-	char	   *snap_path = "pg_logical/snapshots/";
-	int			lsns_allocated = 1024;
-	int			lsns_num = 0;
-	XLogRecPtr *lsns;
-	XLogRecPtr	cutoff;
+/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */
+#define SNAPDIR "pg_logical/snapshots"

-	if (logical_replication_max_snap_files < 0)
+	DIR		   *dirdesc;
+	int			dirdesc_fd;
+	struct dirent *de;
+	size_t		snapshot_index = 0;
+	SnapDesc   *snapshot_descriptors;
+	size_t		descriptors_allocated = 1024;
+	XLogRecPtr	cutoff = 0;
+	off_t		logicalsnapdir_size = 0;
+	const int64	logical_replication_max_logicalsnapdir_size_bytes = (int64) logical_replication_max_logicalsnapdir_size * 1000;
+
+	if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0)
 		return 0;
 
-	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
+	snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated);
+	/* AllocateDir() returns NULL (with errno set) on failure; never pass that to dirfd(). */
+	dirdesc = AllocateDir(SNAPDIR);
+	dirdesc_fd = dirdesc ? dirfd(dirdesc) : -1;
+	if (dirdesc_fd == -1)
+		ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m"));

 	/* find all .snap files and get their lsns */
-	dirdesc = AllocateDir(snap_path);
-	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
+	while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL)
 	{
-		XLogRecPtr	lsn;
 		uint32		hi;
 		uint32		lo;
+		struct stat	st;
+		XLogRecPtr	lsn;
+		SnapDesc   *desc;

 		if (strcmp(de->d_name, ".") == 0 ||
 			strcmp(de->d_name, "..") == 0)
@@ -79,28 +109,69 @@ get_num_snap_files_lsn_threshold(void)

 		lsn = ((uint64) hi) << 32 | lo;
 		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
-		if (lsns_allocated == lsns_num)
+
+		if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1)
+			ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name));
+
+		if (descriptors_allocated == snapshot_index)
 		{
-			lsns_allocated *= 2;
-			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
+			descriptors_allocated *= 2;
+			snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated);
 		}
-		lsns[lsns_num++] = lsn;
+
+		desc = &snapshot_descriptors[snapshot_index++];
+		desc->lsn = lsn;
+		desc->sz = st.st_size;
+		logicalsnapdir_size += st.st_size;
 	}
-	/* sort by lsn desc */
-	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
-	/* and take cutoff at logical_replication_max_snap_files */
-	if (logical_replication_max_snap_files > lsns_num)
-		cutoff = 0;
-	/* have less files than cutoff */
-	else
+
+	qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator);
+
+	/* Are there more snapshot files than specified? */
+	if (logical_replication_max_snap_files <= snapshot_index)
 	{
-		cutoff = lsns[logical_replication_max_snap_files - 1];
-		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
-			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
+		cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
+		elog(LOG,
+			"ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
+			LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
 	}
-	pfree(lsns);
+
+	/* Is the size of the logical snapshots directory larger than specified?
+	 * A negative limit disables this check.
+	 * It's possible we could hit both thresholds, so remove any extra files
+	 * first, and then truncate based on size of the remaining files.
+	 */
+	if (logical_replication_max_logicalsnapdir_size >= 0 &&
+		logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes)
+	{
+		/* Unfortunately, iterating the directory does not guarantee any order
+		 * so we can't cache an index in the preceding loop.
+		 */
+		off_t		sz = 0;
+		const XLogRecPtr original = cutoff;
+
+		for (size_t i = 0; i < snapshot_index; ++i)
+		{
+			sz += snapshot_descriptors[i].sz;
+			if (sz > logical_replication_max_logicalsnapdir_size_bytes)
+			{
+				/* Only ever tighten a cutoff already set by the file-count limit. */
+				cutoff = Max(cutoff, snapshot_descriptors[i].lsn);
+				break;
+			}
+		}
+
+		if (cutoff != original)
+			elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
+					LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
+	}
+
+	pfree(snapshot_descriptors);
 	FreeDir(dirdesc);
+
 	return cutoff;
+
+#undef SNAPDIR
 }

 void
@@ -118,6 +189,16 @@ InitLogicalReplicationMonitor(void)
 							0,
 							NULL, NULL, NULL);

+	DefineCustomIntVariable(
+							"neon.logical_replication_max_logicalsnapdir_size",
+							"Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
+							NULL,
+							&logical_replication_max_logicalsnapdir_size,
+							8000, -1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_KB,
+							NULL, NULL, NULL);
+
 	memset(&bgw, 0, sizeof(bgw));
 	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
 	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
@@ -162,7 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
 		 * If there are too many .snap files, just drop all logical slots to
 		 * prevent aux files bloat.
 		 */
-		cutoff_lsn = get_num_snap_files_lsn_threshold();
+		cutoff_lsn = get_snapshots_cutoff_lsn();
 		if (cutoff_lsn > 0)
 		{
 			for (int i = 0; i < max_replication_slots; i++)
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -262,7 +262,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) {

                // Send requests.
                for req in reqgen {
-                    _ = reply_rx.try_recv(); // discard any replies, to avoid blocking
+                    while reply_rx.try_recv().is_ok() {} // discard replies, to avoid blocking
                    let msg = ProposerAcceptorMessage::AppendRequest(req);
                    msg_tx.send(msg).await.expect("send failed");
                }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -217,7 +217,8 @@ pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy<Histogram> = Lazy::new(|| {
    let mut buckets = pow2_buckets(1, MSG_QUEUE_SIZE);
    buckets.insert(0, 0.0);
    buckets.insert(buckets.len() - 1, (MSG_QUEUE_SIZE - 1) as f64);
-    assert!(buckets.len() <= 12, "too many histogram buckets");
+    // TODO: tweak this.
+    assert!(buckets.len() <= 16, "too many histogram buckets");

    register_histogram!(
        "safekeeper_wal_receiver_queue_depth",
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -7,14 +7,15 @@ use crate::metrics::{
    WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL,
    WAL_RECEIVER_QUEUE_SIZE_TOTAL,
 };
-use crate::safekeeper::AcceptorProposerMessage;
-use crate::safekeeper::ProposerAcceptorMessage;
-use crate::safekeeper::ServerInfo;
+use crate::safekeeper::{
+    AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
+    ServerInfo,
+};
 use crate::timeline::WalResidentTimeline;
 use crate::wal_service::ConnectionId;
 use crate::GlobalTimelines;
 use anyhow::{anyhow, Context};
-use bytes::BytesMut;
+use bytes::{BufMut as _, Bytes, BytesMut};
 use parking_lot::MappedMutexGuard;
 use parking_lot::Mutex;
 use parking_lot::MutexGuard;
@@ -206,7 +207,8 @@ impl Drop for WalReceiverGuard {
    }
 }

-pub const MSG_QUEUE_SIZE: usize = 256;
+// TODO: reconsider this.
+pub const MSG_QUEUE_SIZE: usize = 4096;
 pub const REPLY_QUEUE_SIZE: usize = 16;

 impl SafekeeperPostgresHandler {
@@ -484,6 +486,9 @@ const FLUSH_INTERVAL: Duration = Duration::from_secs(1);
 /// every 5 seconds, for 12 samples per poll. This will give a count of up to 12x active timelines.
 const METRICS_INTERVAL: Duration = Duration::from_secs(5);

+/// The AppendRequest buffer capacity, in bytes of WAL data.
+const APPEND_BUFFER_SIZE: usize = 1024 * 1024;
+
 /// Encapsulates a task which takes messages from msg_rx, processes and pushes
 /// replies to reply_tx.
 ///
@@ -530,6 +535,9 @@ impl WalAcceptor {
    async fn run(&mut self) -> anyhow::Result<()> {
        let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);

+        // Buffer AppendRequests to submit them as a single large write.
+        let mut append_buf = BufferedAppendRequest::new(APPEND_BUFFER_SIZE);
+
        // Periodically flush the WAL and compute metrics.
        let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL);
        flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
@@ -546,7 +554,7 @@ impl WalAcceptor {
                // Process inbound message.
                msg = self.msg_rx.recv() => {
                    // If disconnected, break to flush WAL and return.
-                    let Some(mut msg) = msg else {
+                    let Some(msg) = msg else {
                        break;
                    };

@@ -563,11 +571,44 @@ impl WalAcceptor {
                    // This batches multiple appends per fsync. If the channel is empty after
                    // sending the reply, we'll schedule an immediate flush.
                    if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
-                        msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
-                        dirty = true;
-                    }
+                        // Try to batch multiple messages into a single large write.
+                        if !append_buf.is_empty() || !self.msg_rx.is_empty() {
+                            if append_buf.add(&append_request) {
+                                continue; // message buffered, go get next message
+                            }

-                    self.tli.process_msg(&msg).await?
+                            // Full buffer, write it and buffer this message for next iteration.
+                            dirty = true;
+                            let buf_req = append_buf.take().expect("empty buffer");
+                            let buf_msg = ProposerAcceptorMessage::NoFlushAppendRequest(buf_req);
+                            let reply = self.tli.process_msg(&buf_msg).await?;
+                            drop(buf_msg); // allow reusing buffer for add
+                            assert!(append_buf.add(&append_request), "empty buffer rejected msg");
+                            reply
+                        } else {
+                            dirty = true;
+                            let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
+                            self.tli.process_msg(&msg).await?
+                        }
+                    } else {
+                        self.tli.process_msg(&msg).await?
+                    }
+                }
+
+                // If there are no pending messages, write the append buffer.
+                //
+                // NB: we don't also flush the WAL here. Otherwise we can get into a regime where we
+                // quickly drain msg_rx and fsync before the sender is able to repopulate msg_rx.
+                // This happens consistently due to Tokio scheduling, leading to overeager fsyncing.
+                // Instead, we perform the write without fsyncing and give the sender a chance to
+                // get scheduled and populate msg_rx for the next iteration. If there are no further
+                // messages, the next iteration will flush the WAL.
+                _ = future::ready(()), if self.msg_rx.is_empty() && !append_buf.is_empty() => {
+                    dirty = true;
+                    let buf_req = append_buf.take().expect("empty buffer");
+                    self.tli
+                        .process_msg(&ProposerAcceptorMessage::NoFlushAppendRequest(buf_req))
+                        .await?
                }

                // While receiving AppendRequests, flush the WAL periodically and respond with an
@@ -579,11 +620,11 @@ impl WalAcceptor {
                        .await?
                }

-                // If there are no pending messages, flush the WAL immediately.
+                // If there are no pending messages, flush the WAL and append buffer immediately.
                //
                // TODO: this should be done via flush_ticker.reset_immediately(), but that's always
                // delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866.
-                _ = future::ready(()), if dirty && self.msg_rx.is_empty() => {
+                _ = future::ready(()), if self.msg_rx.is_empty() && dirty => {
                    dirty = false;
                    flush_ticker.reset();
                    self.tli
@@ -627,3 +668,115 @@ impl Drop for WalAcceptor {
        }
    }
 }
+
+/// Buffers WAL data for multiple AppendRequests, to submit them as a single write.
+struct BufferedAppendRequest {
+    /// The buffer capacity.
+    capacity: usize,
+    /// The buffered header and WAL data.
+    buf: Option<(AppendRequestHeader, BytesMut)>,
+    /// A previous buffer that can be reused when the returned message is dropped.
+    reuse_buf: Option<Bytes>,
+    /// If an AppendRequest is larger than the buffer capacity (when empty), just stash it here to
+    /// avoid growing the buffer and copying it. This will be returned as-is.
+    large: Option<AppendRequest>,
+}
+
+impl BufferedAppendRequest {
+    /// Creates a new append request buffer with the given capacity.
+    fn new(capacity: usize) -> Self {
+        Self {
+            capacity,
+            buf: None,
+            reuse_buf: None,
+            large: None,
+        }
+    }
+
+    /// Adds the given append request to the buffer, if possible. Returns `false` if the message
+    /// can't be buffered, leaving self unmodified. An empty buffer will always accept a message.
+    ///
+    /// If the buffer is not empty, the message must have the same term and proposer and contiguous
+    /// `begin_lsn` and `end_lsn`. The buffer must have available capacity for the entire
+    /// `wal_data`. If the message is greater than an empty buffer's capacity, it is accepted but
+    /// simply stashed away in `large` without growing the buffer.
+    pub fn add(&mut self, msg: &AppendRequest) -> bool {
+        // If there is a stashed large message, reject further messages.
+        if self.large.is_some() {
+            return false;
+        }
+
+        // If there is no existing buffer, initialize one with the message.
+        let Some((ref mut h, ref mut wal_data)) = self.buf else {
+            // If the message is larger than the buffer capacity, just stash it instead of growing.
+            if msg.wal_data.len() > self.capacity {
+                assert!(self.large.is_none());
+                self.large = Some(msg.clone()); // clone is cheap with Bytes
+                return true;
+            }
+
+            // Reuse a previous buffer, if any, or allocate a new one.
+            //
+            // TODO: try_into_mut() is essentially runtime borrow checking. If AppendRequest used a
+            // normal Vec<u8> we could do compile-time borrow checking instead and avoid panic.
+            let mut wal_data = match self.reuse_buf.take() {
+                Some(reuse_buf) => match reuse_buf.try_into_mut() {
+                    Ok(mut reuse_buf) => {
+                        assert_eq!(reuse_buf.capacity(), self.capacity);
+                        reuse_buf.clear();
+                        reuse_buf
+                    }
+                    Err(_) => panic!("couldn't reuse buffer, still in use"),
+                },
+                None => BytesMut::with_capacity(self.capacity),
+            };
+            // Copy the append request into the buffer.
+            wal_data.put_slice(&msg.wal_data);
+            self.buf = Some((msg.h, wal_data));
+            return true;
+        };
+
+        // The messages must have the same term and proposer.
+        if h.term != msg.h.term || h.proposer_uuid != msg.h.proposer_uuid {
+            return false;
+        }
+        // The messages must be contiguous.
+        if h.end_lsn != msg.h.begin_lsn {
+            return false;
+        }
+        // The message must fit in the buffer.
+        if wal_data.len() + msg.wal_data.len() > self.capacity {
+            return false;
+        }
+
+        // Add the message to the buffer, bumping the commit and truncate LSNs. We assume that later
+        // messages have later commit/truncate LSNs.
+        h.end_lsn = msg.h.end_lsn;
+        h.commit_lsn = msg.h.commit_lsn;
+        h.truncate_lsn = msg.h.truncate_lsn;
+        wal_data.put_slice(&msg.wal_data);
+        true
+    }
+
+    /// Returns true if there is no buffered message.
+    fn is_empty(&self) -> bool {
+        self.buf.is_none() && self.large.is_none()
+    }
+
+    /// Takes the buffered AppendRequest (if any), leaving a None in its place.
+    ///
+    /// NB: The returned `wal_data` Bytes must be dropped before the next call to `add()`, in order
+    /// to reuse the buffer. This is basically runtime borrow checking, because of Bytes.
+    fn take(&mut self) -> Option<AppendRequest> {
+        // If there is a stashed large message, return it.
+        if let Some(large) = self.large.take() {
+            assert!(self.buf.is_none(), "both buf and large are set");
+            return Some(large);
+        }
+
+        let (h, wal_data) = self.buf.take()?;
+        let wal_data = wal_data.freeze();
+        self.reuse_buf = Some(wal_data.clone()); // keep a reference to the buffer
+        Some(AppendRequest { h, wal_data })
+    }
+}
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -296,12 +296,13 @@ pub struct ProposerElected {

 /// Request with WAL message sent from proposer to safekeeper. Along the way it
 /// communicates commit_lsn.
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub struct AppendRequest {
    pub h: AppendRequestHeader,
    pub wal_data: Bytes,
 }
-#[derive(Debug, Clone, Deserialize)]
+
+#[derive(Debug, Clone, Copy, Deserialize)]
 pub struct AppendRequestHeader {
    // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
    pub term: Term,
@@ -1166,7 +1167,7 @@ mod tests {
            proposer_uuid: [0; 16],
        };
        let mut append_request = AppendRequest {
-            h: ar_hdr.clone(),
+            h: ar_hdr,
            wal_data: Bytes::from_static(b"b"),
        };

@@ -1240,7 +1241,7 @@ mod tests {
            proposer_uuid: [0; 16],
        };
        let append_request = AppendRequest {
-            h: ar_hdr.clone(),
+            h: ar_hdr,
            wal_data: Bytes::from_static(b"b"),
        };

@@ -1248,7 +1249,7 @@ mod tests {
        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
            .await
            .unwrap();
-        let mut ar_hrd2 = ar_hdr.clone();
+        let mut ar_hrd2 = ar_hdr;
        ar_hrd2.begin_lsn = Lsn(4);
        ar_hrd2.end_lsn = Lsn(5);
        let append_request = AppendRequest {
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -4,6 +4,7 @@
 use std::{cmp::max, ops::Deref};

 use anyhow::{bail, Result};
+use postgres_ffi::WAL_SEGMENT_SIZE;
 use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
@@ -144,7 +145,7 @@ impl TimelinePersistentState {
            ServerInfo {
                pg_version: 170000, /* Postgres server version (major * 10000) */
                system_id: 0,       /* Postgres system identifier */
-                wal_seg_size: 16 * 1024 * 1024,
+                wal_seg_size: WAL_SEGMENT_SIZE as u32,
            },
            vec![],
            Lsn::INVALID,
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -46,3 +46,8 @@ class EndpointHttpClient(requests.Session):
        )
        res.raise_for_status()
        return res.json()
+
+    def metrics(self) -> str:
+        res = self.get(f"http://localhost:{self.port}/metrics")
+        res.raise_for_status()
+        return res.text
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1065,6 +1065,9 @@ class NeonEnv:
                "http_auth_type": http_auth_type,
                # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
                "availability_zone": "us-east-2a",
+                # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
+                # the pageserver taking a long time to start up due to syncfs flushing other tests' data
+                "no_sync": True,
            }
            if self.pageserver_virtual_file_io_engine is not None:
                ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -149,12 +149,16 @@ def test_subscriber_lag(
                check_pgbench_still_running(pub_workload, "pub")
                check_pgbench_still_running(sub_workload, "sub")

-                with (
-                    psycopg2.connect(pub_connstr) as pub_conn,
-                    psycopg2.connect(sub_connstr) as sub_conn,
-                ):
-                    with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-                        lag = measure_logical_replication_lag(sub_cur, pub_cur)
+                pub_conn = psycopg2.connect(pub_connstr)
+                sub_conn = psycopg2.connect(sub_connstr)
+                pub_conn.autocommit = True
+                sub_conn.autocommit = True
+
+                with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
+                    lag = measure_logical_replication_lag(sub_cur, pub_cur)
+
+                pub_conn.close()
+                sub_conn.close()

                log.info(f"Replica lagged behind master by {lag} seconds")
                zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
@@ -206,6 +210,7 @@ def test_publisher_restart(
    sub_conn = psycopg2.connect(sub_connstr)
    pub_conn.autocommit = True
    sub_conn.autocommit = True
+
    with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
        pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
        pub_exists = len(pub_cur.fetchall()) != 0
@@ -222,6 +227,7 @@ def test_publisher_restart(
            sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")

        initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
+
    pub_conn.close()
    sub_conn.close()

@@ -248,12 +254,17 @@ def test_publisher_restart(
                    ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
                    env=pub_env,
                )
-                with (
-                    psycopg2.connect(pub_connstr) as pub_conn,
-                    psycopg2.connect(sub_connstr) as sub_conn,
-                ):
-                    with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-                        lag = measure_logical_replication_lag(sub_cur, pub_cur)
+
+                pub_conn = psycopg2.connect(pub_connstr)
+                sub_conn = psycopg2.connect(sub_connstr)
+                pub_conn.autocommit = True
+                sub_conn.autocommit = True
+
+                with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
+                    lag = measure_logical_replication_lag(sub_cur, pub_cur)
+
+                pub_conn.close()
+                sub_conn.close()

                log.info(f"Replica lagged behind master by {lag} seconds")
                zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
@@ -288,58 +299,56 @@ def test_snap_files(
    env = benchmark_project_pub.pgbench_env
    connstr = benchmark_project_pub.connstr

-    with psycopg2.connect(connstr) as conn:
-        conn.autocommit = True
-        with conn.cursor() as cur:
-            cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
-            is_super = cast("bool", cur.fetchall()[0][0])
-            assert is_super, "This benchmark won't work if we don't have superuser"
+    conn = psycopg2.connect(connstr)
+    conn.autocommit = True
+
+    with conn.cursor() as cur:
+        cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
+        is_super = cast("bool", cur.fetchall()[0][0])
+        assert is_super, "This benchmark won't work if we don't have superuser"
+
+    conn.close()

    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)

    conn = psycopg2.connect(connstr)
    conn.autocommit = True
-    cur = conn.cursor()
-    cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")

-    with psycopg2.connect(connstr) as conn:
-        conn.autocommit = True
-        with conn.cursor() as cur:
-            cur.execute("SELECT pg_reload_conf()")
-
-    with psycopg2.connect(connstr) as conn:
-        conn.autocommit = True
-        with conn.cursor() as cur:
-            cur.execute(
-                """
-                DO $$
-                    BEGIN
-                    IF EXISTS (
-                        SELECT 1
-                        FROM pg_replication_slots
-                        WHERE slot_name = 'slotter'
-                    ) THEN
-                        PERFORM pg_drop_replication_slot('slotter');
-                    END IF;
-                END $$;
+    with conn.cursor() as cur:
+        cur.execute(
            """
-            )
-            cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
+            DO $$
+                BEGIN
+                IF EXISTS (
+                    SELECT 1
+                    FROM pg_replication_slots
+                    WHERE slot_name = 'slotter'
+                ) THEN
+                    PERFORM pg_drop_replication_slot('slotter');
+                END IF;
+            END $$;
+        """
+        )
+        cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
+
+    conn.close()

    workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
    try:
        start = time.time()
        prev_measurement = time.time()
        while time.time() - start < test_duration_min * 60:
-            with psycopg2.connect(connstr) as conn:
-                with conn.cursor() as cur:
-                    cur.execute(
-                        "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
-                    )
-                    check_pgbench_still_running(workload)
-                    cur.execute(
-                        "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
-                    )
+            conn = psycopg2.connect(connstr)
+            conn.autocommit = True
+
+            with conn.cursor() as cur:
+                cur.execute(
+                    "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
+                )
+                check_pgbench_still_running(workload)
+                cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())")
+
+            conn.close()

            # Measure storage
            if time.time() - prev_measurement > test_interval_min * 60:
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -102,15 +102,21 @@ def test_ro_replica_lag(
                    check_pgbench_still_running(master_workload)
                    check_pgbench_still_running(replica_workload)
                    time.sleep(sync_interval_min * 60)
+
+                    conn_master = psycopg2.connect(master_connstr)
+                    conn_replica = psycopg2.connect(replica_connstr)
+                    conn_master.autocommit = True
+                    conn_replica.autocommit = True
+
                    with (
-                        psycopg2.connect(master_connstr) as conn_master,
-                        psycopg2.connect(replica_connstr) as conn_replica,
+                        conn_master.cursor() as cur_master,
+                        conn_replica.cursor() as cur_replica,
                    ):
-                        with (
-                            conn_master.cursor() as cur_master,
-                            conn_replica.cursor() as cur_replica,
-                        ):
-                            lag = measure_replication_lag(cur_master, cur_replica)
+                        lag = measure_replication_lag(cur_master, cur_replica)
+
+                    conn_master.close()
+                    conn_replica.close()
+
                    log.info(f"Replica lagged behind master by {lag} seconds")
                    zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
            finally:
@@ -219,11 +225,15 @@ def test_replication_start_stop(
        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)

        # Sync replicas
-        with psycopg2.connect(master_connstr) as conn_master:
-            with conn_master.cursor() as cur_master:
-                for i in range(num_replicas):
-                    conn_replica = psycopg2.connect(replica_connstr[i])
-                    measure_replication_lag(cur_master, conn_replica.cursor())
+        conn_master = psycopg2.connect(master_connstr)
+        conn_master.autocommit = True
+
+        with conn_master.cursor() as cur_master:
+            for i in range(num_replicas):
+                conn_replica = psycopg2.connect(replica_connstr[i])
+                measure_replication_lag(cur_master, conn_replica.cursor())
+
+        conn_master.close()

        master_pgbench = pg_bin.run_nonblocking(
            [
@@ -277,17 +287,22 @@ def test_replication_start_stop(

            time.sleep(configuration_test_time_sec)

-            with psycopg2.connect(master_connstr) as conn_master:
-                with conn_master.cursor() as cur_master:
-                    for ireplica in range(num_replicas):
-                        replica_conn = psycopg2.connect(replica_connstr[ireplica])
-                        lag = measure_replication_lag(cur_master, replica_conn.cursor())
-                        zenbenchmark.record(
-                            f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
-                        )
-                        log.info(
-                            f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
-                        )
+            conn_master = psycopg2.connect(master_connstr)
+            conn_master.autocommit = True
+
+            with conn_master.cursor() as cur_master:
+                for ireplica in range(num_replicas):
+                    replica_conn = psycopg2.connect(replica_connstr[ireplica])
+                    lag = measure_replication_lag(cur_master, replica_conn.cursor())
+                    zenbenchmark.record(
+                        f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
+                    )
+                    log.info(
+                        f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
+                    )
+
+            conn_master.close()
+
        master_pgbench.terminate()
    except Exception as e:
        error_occurred = True
--- a/test_runner/regress/test_installed_extensions.py
+++ b/test_runner/regress/test_installed_extensions.py
@@ -1,6 +1,14 @@
-from logging import info
+from __future__ import annotations

-from fixtures.neon_fixtures import NeonEnv
+import time
+from logging import info
+from typing import TYPE_CHECKING
+
+from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
+
+if TYPE_CHECKING:
+    from fixtures.neon_fixtures import NeonEnv


 def test_installed_extensions(neon_simple_env: NeonEnv):
@@ -85,3 +93,52 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
            assert ext["n_databases"] == 2
            ext["versions"].sort()
            assert ext["versions"] == ["1.2", "1.3"]
+
+    # check that /metrics endpoint is available
+    # ensure that we see the metric before and after restart
+    res = client.metrics()
+    info("Metrics: %s", res)
+    m = parse_metrics(res)
+    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
+    assert len(neon_m) == 1
+    for sample in neon_m:
+        assert sample.value == 2
+    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
+    assert len(neon_m) == 1
+    for sample in neon_m:
+        assert sample.value == 1
+
+    endpoint.stop()
+    endpoint.start()
+
+    timeout = 10
+    while timeout > 0:
+        try:
+            res = client.metrics()
+            timeout = -1
+            if len(parse_metrics(res).query_all("installed_extensions")) < 4:
+                # Assume that not all metrics are collected yet
+                time.sleep(1)
+                timeout -= 1
+                continue
+        except Exception:
+            log.exception("failed to get metrics, assume they are not collected yet")
+            time.sleep(1)
+            timeout -= 1
+            continue
+
+        assert (
+            len(parse_metrics(res).query_all("installed_extensions")) >= 4
+        ), "Not all metrics are collected"
+
+        info("After restart metrics: %s", res)
+        m = parse_metrics(res)
+        neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
+        assert len(neon_m) == 1
+        for sample in neon_m:
+            assert sample.value == 1
+
+        neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
+        assert len(neon_m) == 1
+        for sample in neon_m:
+            assert sample.value == 1
--- a/test_runner/regress/test_physical_and_logical_replicaiton.py
+++ b/test_runner/regress/test_physical_and_logical_replicaiton.py
@@ -5,7 +5,8 @@ import time
 from fixtures.neon_fixtures import NeonEnv, logical_replication_sync


-def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
+def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg):
+    """Test read replica of a primary which has a logical replication publication"""
    env = neon_simple_env

    n_records = 100000
@@ -13,7 +14,6 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
    primary = env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
-        config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
    )
    p_con = primary.connect()
    p_cur = p_con.cursor()
@@ -30,7 +30,6 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
    secondary = env.endpoints.new_replica_start(
        origin=primary,
        endpoint_id="secondary",
-        config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
    )

    s_con = secondary.connect()
@@ -48,3 +47,51 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
    # Check that LR slot is not copied to replica
    s_cur.execute("select count(*) from pg_replication_slots")
    assert s_cur.fetchall()[0][0] == 0
+
+
+def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg):
+    """Test that AUX files are not saved at replica"""
+    env = neon_simple_env
+
+    n_records = 20000
+
+    primary = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    )
+    p_con = primary.connect()
+    p_cur = p_con.cursor()
+    p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))")
+    p_cur.execute("create publication pub1 for table t")
+
+    # start subscriber
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)")
+    connstr = primary.connstr().replace("'", "''")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    for pk in range(n_records):
+        p_cur.execute("insert into t (pk) values (%s)", (pk,))
+
+    # LR snapshot is stored every 15 seconds
+    time.sleep(16)
+
+    # start replica
+    secondary = env.endpoints.new_replica_start(
+        origin=primary,
+        endpoint_id="secondary",
+    )
+
+    s_con = secondary.connect()
+    s_cur = s_con.cursor()
+
+    logical_replication_sync(vanilla_pg, primary)
+
+    assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records
+    s_cur.execute("select count(*) from t")
+    assert s_cur.fetchall()[0][0] == n_records
+
+    vanilla_pg.stop()
+    secondary.stop()
+    primary.stop()
+    assert not secondary.log_contains("cannot make new WAL entries during recovery")
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -427,7 +427,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
            env.pageserver.start()

            for f in futs:
-                f.result(timeout=10)
+                f.result(timeout=30)

    # The tenant should end up active
    wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.0",
-    "9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb"
+    "ae4cc30dba24f3910533e5a48e8103c3f2fff300"
  ],
  "v16": [
    "16.4",
-    "e131a9c027b202ce92bd7b9cf2569d48a6f9948e"
+    "03b43900edc5d8d6eecec460bfc89aec7174bd84"
  ],
  "v15": [
    "15.8",
-    "22e580fe9ffcea7e02592110b1c9bf426d83cada"
+    "fd631a959049dfe2b82f67409c8b8b0d3e0016d1"
  ],
  "v14": [
    "14.13",
-    "2199b83fb72680001ce0f43bf6187a21dfb8f45d"
+    "de0a000dafc2e66ce2e39282d3aa1c704fe0390e"
  ]
 }
Author	SHA1	Message	Date
Erik Grinaker	4308ffe5c0	safekeeper: batch AppendRequest writes	2024-11-13 15:09:57 +01:00
Heikki Linnakangas	d5435b1a81	tests: Increase timeout in test_create_churn_during_restart (#9736 ) This test was seen to be flaky, e.g. at: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9457/11804246485/index.html#suites/ec4311502db344eee91f1354e9dc839b/982bd121ea698414/. If I _reduce_ the timeout from 10s to 8s on my laptop, it reliably hits that timeout and fails. That suggests that the test is pretty close to the edge even when it passes. Let's bump up the timeout to 30 s to make it more robust. See also https://github.com/neondatabase/neon/issues/9730, although the error message is different there.	2024-11-13 12:20:32 +02:00
Anastasia Lubennikova	080d585b22	Add installed_extensions prometheus metric (#9608 ) and add /metrics endpoint to compute_ctl to expose such metrics metric format example for extension pg_rag with versions 1.2.3 and 1.4.2 installed in 3 and 1 databases respectively: neon_extensions_installed{extension="pg_rag", version="1.2.3"} = 3 neon_extensions_installed{extension="pg_rag", version="1.4.2"} = 1 ------ infra part: https://github.com/neondatabase/flux-fleet/pull/251 --------- Co-authored-by: Tristan Partin <tristan@neon.tech>	2024-11-13 09:36:48 +00:00
John Spray	7595d3afe6	pageserver: add `no_sync` for use in regression tests (2/2) (#9678 ) ## Problem Followup to https://github.com/neondatabase/neon/pull/9677 which enables `no_sync` in tests. This can be merged once the next release has happened. ## Summary of changes - Always run pageserver with `no_sync = true` in tests.	2024-11-13 09:17:26 +00:00
Konstantin Knizhnik	1ff5333a1b	Do not wallog AUX files at replica (#9457 ) ## Problem Attempting to persist LR stuff at a replica causes a `cannot make new WAL entries during recovery` error. See https://neondb.slack.com/archives/C07S7RBFVRA/p1729280401283389 ## Summary of changes Do not wallog AUX files at replica. Related Postgres PRs: https://github.com/neondatabase/postgres/pull/517 https://github.com/neondatabase/postgres/pull/516 https://github.com/neondatabase/postgres/pull/515 https://github.com/neondatabase/postgres/pull/514 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech> Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2024-11-13 08:50:01 +02:00
Tristan Partin	d8f5d43549	Fix autocommit footguns in performance tests psycopg2 has the following warning related to autocommit: > By default, any query execution, including a simple SELECT will start > a transaction: for long-running programs, if no further action is > taken, the session will remain “idle in transaction”, an undesirable > condition for several reasons (locks are held by the session, tables > bloat…). For long lived scripts, either ensure to terminate a > transaction as soon as possible or use an autocommit connection. In the 2.9 release notes, psycopg2 also made the following change: > `with connection` starts a transaction on autocommit transactions too Some of these connections are indeed long-lived, so we were retaining tons of WAL on the endpoints because we had a transaction pinned in the past. Link: https://www.psycopg.org/docs/news.html#what-s-new-in-psycopg-2-9 Link: https://github.com/psycopg/psycopg2/issues/941 Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-11-12 15:48:19 -06:00
Erik Grinaker	2256a5727a	safekeeper: use `WAL_SEGMENT_SIZE` for empty timeline state (#9734 ) ## Problem `TimelinePersistentState::empty()`, used for tests and benchmarks, had a hardcoded 16 MB WAL segment size. This caused confusion when attempting to change the global segment size. ## Summary of changes Inherit from `WAL_SEGMENT_SIZE` in `TimelinePersistentState::empty()`.	2024-11-12 20:35:44 +00:00
Tristan Partin	3f80af8b1d	Add neon.logical_replication_max_logicalsnapdir_size This GUC will drop replication slots if the size of the pg_logical/snapshots directory (not including temp snapshot files) becomes larger than the specified size. Keeping the size of this directory smaller will help with basebackup size from the pageserver. Part-of: https://github.com/neondatabase/neon/issues/8619 Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-11-12 13:13:28 -06:00
Tristan Partin	a61d81bbc7	Calculate compute_backpressure_throttling_seconds correctly The original value that we get is measured in microseconds. It comes from a calculation using Postgres' GetCurrentTimestamp(), which is implemented in terms of gettimeofday(2). Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-11-12 13:12:08 -06:00
Erik Grinaker	05381a48f0	utils: remove unnecessary fsync in `durable_rename()` (#9686 ) ## Problem WAL segment fsyncs significantly affect WAL ingestion throughput. `durable_rename()` is used when initializing every 16 MB segment, and issues 3 fsyncs of which 1 was unnecessary. ## Summary of changes Remove an fsync in `durable_rename` which is unnecessary with Linux and ext4 (which we currently use). This improves WAL ingestion throughput by up to 23% with large appends on my MacBook.	2024-11-12 18:57:31 +01:00
Alex Chi Z.	cef165818c	test(pageserver): add gc-compaction tests with delta will_init (#9724 ) I had an impression that gc-compaction didn't test the case where the first record of the key history is will_init because there are some code paths that would panic in this case. Luckily it got fixed in https://github.com/neondatabase/neon/pull/9026 so we can now implement such tests. Part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes * Randomly changed some images into will_init neon wal record * Split `test_simple_bottom_most_compaction_deltas` into two test cases, one of them has the bottom layer as delta layer with will_init flags, while the other is the original one with image layers. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-11-12 10:37:31 -05:00