simplify the test; the failure now looks like this:

```
2025-07-10 13:00:25.198 INFO [neon_fixtures.py:5643] caughtup=True, primary_lsn=0/53F63B0, secondary_lsn=0/53F63B0
2025-07-10 13:00:25.200 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:25.239 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:26.241 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:26.269 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:27.271 INFO [neon_fixtures.py:265] Hostname: localhost
...
2025-07-10 13:00:35.542 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63E8")
...
FAILED test_runner/regress/test_hot_standby.py::test_hot_standby_gc[debug-pg16-True] - Failed: standby_horizon didn't propagate within timeout_secs=10, this is holding up gc on secondary
```
2026-05-26 09:30:37 +00:00 · 2025-07-10 13:00:43 +00:00
parent 45f5dfc685
commit d2e72f119f
1 changed files with 13 additions and 36 deletions
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -134,6 +134,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
    tenant_conf = {
        # set PITR interval to be small, so we can do GC
        "pitr_interval": "0 s",
+        # this test is largely about PS GC behavior, we control it manually
+        "gc_period": "0s",
+        "compaction_period": "0s",
    }
    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
    timeline_id = env.initial_timeline
@@ -209,8 +212,14 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):

            wait_replica_caughtup(primary, secondary)

-            # Wait for standby horizon to catch up, then gc again.
-            # (When we switch to leases (LKB-88) we need to wait for leases to expir.e)
+            # Wait for PS's view of standby horizon to catch up.
+            # (When we switch to leases (LKB-88) we need to change this to watch the lease lsn move.)
+            # (TODO: instead of checking impl details here, somehow assert that gc can delete layers now.
+            #        Tricky to do that without flakiness though.)
+            # We already waited for replica to catch up, so, this timeout is strictly on
+            # a few in-memory-only RPCs to propagate standby_horizon.
+            timeout_secs = 10
+            started_at = time.time()
            shards = tenant_get_shards(env, tenant_id, None)
            for tenant_shard_id, pageserver in shards:
                client = pageserver.http_client()
@@ -232,41 +241,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
                    log.info(f"{tenant_shard_id.shard_index=}: {standby_horizon_at_ps=} {secondary_apply_lsn=}")
                    if secondary_apply_lsn == standby_horizon_at_ps:
                        break
+                    if time.time() - started_at > timeout_secs:
+                        pytest.fail(f"standby_horizon didn't propagate within {timeout_secs=}, this is holding up gc on secondary")
                    time.sleep(1)
-                client.timeline_checkpoint(tenant_shard_id, timeline_id)
-                client.timeline_compact(tenant_shard_id, timeline_id)
-                client.timeline_gc(tenant_shard_id, timeline_id, 0)
-
-            # Clear the cache in the standby, so that when we
-            # re-execute the query, it will make GetPage
-            # requests. This does not clear the last-written LSN cache
-            # so we still remember the LSNs of the pages.
-            secondary.clear_buffers(cursor=s_cur)
-
-            if pause_apply:
-                s_cur.execute("SELECT pg_wal_replay_pause()")
-
-            # Do other stuff on the primary, to advance the WAL
-            p_cur.execute("CREATE TABLE test3 AS SELECT generate_series(1, 1000000) AS g")
-
-            # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN
-            # very close to the primary's current insert LSN.
-            shards = tenant_get_shards(env, tenant_id, None)
-            for tenant_shard_id, pageserver in shards:
-                client = pageserver.http_client()
-                client.timeline_checkpoint(tenant_shard_id, timeline_id)
-                client.timeline_compact(tenant_shard_id, timeline_id)
-                client.timeline_gc(tenant_shard_id, timeline_id, 0)
-
-            # Re-execute the query. The GetPage requests that this
-            # generates use old not_modified_since LSNs, older than
-            # the GC cutoff, but new request LSNs. (In protocol
-            # version 1 there was only one LSN, and this failed.)
-            log_replica_lag(primary, secondary)
-            s_cur.execute("SELECT COUNT(*) FROM test")
-            log_replica_lag(primary, secondary)
-            res = s_cur.fetchone()
-            assert res == (10000,)


 def run_pgbench(connstr: str, pg_bin: PgBin):