tests: demonstrate standby_horizon issue with multiple safekeepers

DEBUG
Multiple safekeepers in test_hot_standby_gc
2026-01-25 06:10:37 +00:00 · 2024-08-22 10:27:55 +00:00 · 2024-08-22 10:27:55 +00:00 · 2024-08-22 10:27:55 +00:00
2 changed files with 40 additions and 0 deletions
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -731,6 +731,12 @@ impl ConnectionManagerState {
            timeline_update.standby_horizon
        );
        if timeline_update.standby_horizon != 0 {
+            tracing::info!(
+                "register_timeline_update: sk={} standby_horizon={} commit_lsn={}",
+                timeline_update.safekeeper_id,
+                Lsn(timeline_update.standby_horizon),
+                Lsn(timeline_update.commit_lsn)
+            );
            // ignore reports from safekeepers not connected to replicas
            self.timeline
                .standby_horizon
@@ -739,6 +745,12 @@ impl ConnectionManagerState {
                .metrics
                .standby_horizon_gauge
                .set(timeline_update.standby_horizon as i64);
+        } else {
+            tracing::info!(
+                "register_timeline_update: sk={} standby_horizon=None commit_lsn={}",
+                timeline_update.safekeeper_id,
+                Lsn(timeline_update.commit_lsn)
+            );
        }

        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -131,6 +131,11 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
        # set PITR interval to be small, so we can do GC
        "pitr_interval": "0 s",
    }
+
+    # Make sure that standby_horizon feedback still works when the standby and
+    # the pageserver might be connected to different safekeepers
+    neon_env_builder.num_safekeepers = 3
+
    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
    timeline_id = env.initial_timeline
    tenant_id = env.initial_tenant
@@ -189,6 +194,29 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
            # generates use old not_modified_since LSNs, older than
            # the GC cutoff, but new request LSNs. (In protocol
            # version 1 there was only one LSN, and this failed.)
+            secondary.clear_shared_buffers(cursor=s_cur)
+            log_replica_lag(primary, secondary)
+            s_cur.execute("SELECT COUNT(*) FROM test")
+            log_replica_lag(primary, secondary)
+            res = s_cur.fetchone()
+            assert res[0] == 10000
+
+            env.safekeepers[0].stop()
+
+            # With one safekeeper stopped, restart the pageserver and run GC again: the standby_horizon should still be enforced
+            env.pageserver.stop()
+            env.pageserver.start()
+            for tenant_shard_id, pageserver in shards:
+                client = pageserver.http_client()
+                client.timeline_checkpoint(tenant_shard_id, timeline_id)
+                client.timeline_compact(tenant_shard_id, timeline_id)
+                client.timeline_gc(tenant_shard_id, timeline_id, 0)
+
+            # Re-execute the query after the pageserver restart. The GetPage
+            # requests that this generates use old not_modified_since LSNs,
+            # older than the GC cutoff, but new request LSNs. (In protocol
+            # version 1 there was only one LSN, and this failed.)
+            secondary.clear_shared_buffers(cursor=s_cur)
            log_replica_lag(primary, secondary)
            s_cur.execute("SELECT COUNT(*) FROM test")
            log_replica_lag(primary, secondary)
Author	SHA1	Message	Date
John Spray	c281c80f0c	tests: demonstrate standby_horizon issue with multiple safekeepers	2024-08-22 10:27:55 +00:00
John Spray	3cb39db80c	DEBUG	2024-08-22 10:27:55 +00:00
John Spray	12b8d921be	Multiple safekeepers in test_hot_standby_gc	2024-08-22 10:27:55 +00:00