Compare commits

...

2 Commits

Author SHA1 Message Date
Christian Schwarz
d2e72f119f simplify the test, failure now looks like this:
```
2025-07-10 13:00:25.198 INFO [neon_fixtures.py:5643] caughtup=True, primary_lsn=0/53F63B0, secondary_lsn=0/53F63B0
2025-07-10 13:00:25.200 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:25.239 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:26.241 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:26.269 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:27.271 INFO [neon_fixtures.py:265] Hostname: localhost
...
2025-07-10 13:00:35.542 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63E8")
...
FAILED test_runner/regress/test_hot_standby.py::test_hot_standby_gc[debug-pg16-True] - Failed: standby_horizon didn't propagate within timeout_secs=10, this is holding up gc on secondary
```
2025-07-10 13:00:43 +00:00
Christian Schwarz
45f5dfc685 extend test_hot_standby_gc to demonstrate that it doesn't work without hot_standby_feedback
The bug described in
- https://github.com/neondatabase/neon/issues/12168

is not covered by `test_hot_standby_gc[pause_apply=True]`.

The reason is that the first standby reply _does_ trigger an update of the
aggregate, but subsequent standby replies don't. Only
hot_standby_feedback messages do.

In the test, that first reply sets standby_horizon to a low value X,
thereby inhibiting gc and making the test pass.
However, the aggregate standby_horizon pushed by the safekeepers (SKs) remains
at that low value X from then on.

Actually, uh oh, this means standby_horizon is _kept_ at X!?
And thus, if the PS receives a re-push of the low value X before the next GC,
it will continue to limit GC to below X until we hit the hard-coded
`MAX_ALLOWED_STANDBY_LAG`.
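
To make the arithmetic concrete, here is a minimal sketch of that clamp
(plain u64 LSNs and an invented function name, not the actual pageserver code),
fed with the LSNs from the failure log above:

```rust
// Minimal sketch only: models the clamp described above with plain u64 LSNs.
// `clamp_gc_cutoff` is an invented name; the real logic lives in the
// pageserver GC path shown in the diff below.
fn clamp_gc_cutoff(standby_horizon: u64, new_gc_cutoff: u64) -> u64 {
    const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GiB
    let standby_lag = new_gc_cutoff.saturating_sub(standby_horizon);
    if standby_lag < MAX_ALLOWED_STANDBY_LAG {
        // A stale horizon X that keeps getting re-pushed lands here on every
        // GC iteration, pinning the cutoff at X.
        new_gc_cutoff.min(standby_horizon)
    } else {
        // Only once the standby lags by >= 10 GiB of WAL does GC stop honoring it.
        new_gc_cutoff
    }
}

fn main() {
    // LSNs from the failure log above: horizon stuck at 0/14EEC38 while the
    // standby (and, for illustration, the desired cutoff) is at 0/53F63B0.
    let stale_horizon: u64 = 0x14E_EC38;
    let desired_cutoff: u64 = 0x53F_63B0;
    // The lag is well under 10 GiB, so the cutoff stays pinned at the stale horizon.
    assert_eq!(clamp_gc_cutoff(stale_horizon, desired_cutoff), stale_horizon);
}
```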

This is worse than I thought: it means that the current code _is_ preventing
GC _even_ for standbys _without_ `hot_standby_feedback`, if those standbys
shut down before 10GiB of WAL has been written since they were launched.

Turn some debug logging into info logging to make the case.

Sigh.

refs
- https://github.com/neondatabase/neon/issues/12168
2025-07-10 12:41:54 +00:00
4 changed files with 54 additions and 4 deletions

View File

@@ -6633,7 +6633,7 @@ impl Timeline {
const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
trace!("holding off GC for standby apply LSN {}", standby_horizon);
info!("holding off GC for standby apply LSN {}", standby_horizon);
} else {
warn!(
"standby is lagging for more than {}MB, not holding gc for it",

View File

@@ -750,7 +750,7 @@ impl ConnectionManagerState {
WALRECEIVER_BROKER_UPDATES.inc();
-trace!(
+info!(
"safekeeper info update: standby_horizon(cutoff)={}",
timeline_update.standby_horizon
);

View File

@@ -220,7 +220,7 @@ impl WalSenders {
fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
let mut shared = self.mutex.lock();
let slot = shared.get_slot_mut(id);
-debug!(
+info!(
"Record standby reply: ts={} apply_lsn={}",
reply.reply_ts, reply.apply_lsn
);
@@ -400,7 +400,10 @@ impl WalSendersShared {
}
}
self.agg_standby_feedback = StandbyFeedback {
-reply: reply_agg,
+reply: {
+    info!(prev=%self.agg_standby_feedback.reply.apply_lsn, new=%reply_agg.apply_lsn, "updating agg_standby_feedback apply_lsn");
+    reply_agg
+},
hs_feedback: agg,
};
}

View File

@@ -7,6 +7,7 @@ import time
from functools import partial
import pytest
+from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
@@ -133,6 +134,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
tenant_conf = {
# set PITR interval to be small, so we can do GC
"pitr_interval": "0 s",
+# this test is largely about PS GC behavior, we control it manually
+"gc_period": "0s",
+"compaction_period": "0s",
}
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
timeline_id = env.initial_timeline
@@ -163,6 +167,11 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
res = s_cur.fetchone()
assert res is not None
s_cur.execute("SHOW hot_standby_feedback")
res = s_cur.fetchone()
assert res is not None
assert res[0] == "off"
s_cur.execute("SELECT COUNT(*) FROM test")
res = s_cur.fetchone()
assert res == (10000,)
@@ -198,6 +207,44 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
res = s_cur.fetchone()
assert res == (10000,)
if pause_apply:
s_cur.execute("SELECT pg_wal_replay_resume()")
wait_replica_caughtup(primary, secondary)
+# Wait for PS's view of standby horizon to catch up.
+# (When we switch to leases (LKB-88) we need to change this to watch the lease lsn move.)
+# (TODO: instead of checking impl details here, somehow assert that gc can delete layers now.
+#  Tricky to do that without flakiness though.)
+# We already waited for the replica to catch up, so this timeout is strictly on
+# a few in-memory-only RPCs to propagate standby_horizon.
+timeout_secs = 10
+started_at = time.time()
+shards = tenant_get_shards(env, tenant_id, None)
+for tenant_shard_id, pageserver in shards:
+    client = pageserver.http_client()
+    while True:
+        secondary_apply_lsn = Lsn(
+            secondary.safe_psql_scalar(
+                "SELECT pg_last_wal_replay_lsn()", log_query=False
+            )
+        )
+        standby_horizon_metric = client.get_metrics().query_one(
+            "pageserver_standby_horizon",
+            {
+                "tenant_id": str(tenant_shard_id.tenant_id),
+                "shard_id": str(tenant_shard_id.shard_index),
+                "timeline_id": str(timeline_id),
+            },
+        )
+        standby_horizon_at_ps = Lsn(int(standby_horizon_metric.value))
+        log.info(f"{tenant_shard_id.shard_index=}: {standby_horizon_at_ps=} {secondary_apply_lsn=}")
+        if secondary_apply_lsn == standby_horizon_at_ps:
+            break
+        if time.time() - started_at > timeout_secs:
+            pytest.fail(f"standby_horizon didn't propagate within {timeout_secs=}, this is holding up gc on secondary")
+        time.sleep(1)
def run_pgbench(connstr: str, pg_bin: PgBin):
log.info(f"Start a pgbench workload on pg {connstr}")