diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 6b120f386d..e384c57c2e 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -134,6 +134,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): tenant_conf = { # set PITR interval to be small, so we can do GC "pitr_interval": "0 s", + # this test is largely about PS GC behavior, we control it manually + "gc_period": "0s", + "compaction_period": "0s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) timeline_id = env.initial_timeline @@ -209,8 +212,14 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): wait_replica_caughtup(primary, secondary) - # Wait for standby horizon to catch up, then gc again. - # (When we switch to leases (LKB-88) we need to wait for leases to expir.e) + # Wait for PS's view of standby horizon to catch up. + # (When we switch to leases (LKB-88) we need to change this to watch the lease lsn move.) + # (TODO: instead of checking impl details here, somehow assert that gc can delete layers now. + # Tricky to do that without flakiness though.) + # We already waited for replica to catch up, so, this timeout is strictly on + # a few few in-memory only RPCs to propagate standby_horizon. + timeout_secs = 10 + started_at = time.time() shards = tenant_get_shards(env, tenant_id, None) for tenant_shard_id, pageserver in shards: client = pageserver.http_client() @@ -232,41 +241,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): log.info(f"{tenant_shard_id.shard_index=}: {standby_horizon_at_ps=} {secondary_apply_lsn=}") if secondary_apply_lsn == standby_horizon_at_ps: break + if time.time() - started_at > timeout_secs: + pytest.fail(f"standby_horizon didn't propagate within {timeout_secs=}, this is holding up gc on secondary") time.sleep(1) - client.timeline_checkpoint(tenant_shard_id, timeline_id) - client.timeline_compact(tenant_shard_id, timeline_id) - client.timeline_gc(tenant_shard_id, timeline_id, 0) - - # Clear the cache in the standby, so that when we - # re-execute the query, it will make GetPage - # requests. This does not clear the last-written LSN cache - # so we still remember the LSNs of the pages. - secondary.clear_buffers(cursor=s_cur) - - if pause_apply: - s_cur.execute("SELECT pg_wal_replay_pause()") - - # Do other stuff on the primary, to advance the WAL - p_cur.execute("CREATE TABLE test3 AS SELECT generate_series(1, 1000000) AS g") - - # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN - # very close to the primary's current insert LSN. - shards = tenant_get_shards(env, tenant_id, None) - for tenant_shard_id, pageserver in shards: - client = pageserver.http_client() - client.timeline_checkpoint(tenant_shard_id, timeline_id) - client.timeline_compact(tenant_shard_id, timeline_id) - client.timeline_gc(tenant_shard_id, timeline_id, 0) - - # Re-execute the query. The GetPage requests that this - # generates use old not_modified_since LSNs, older than - # the GC cutoff, but new request LSNs. (In protocol - # version 1 there was only one LSN, and this failed.) - log_replica_lag(primary, secondary) - s_cur.execute("SELECT COUNT(*) FROM test") - log_replica_lag(primary, secondary) - res = s_cur.fetchone() - assert res == (10000,) def run_pgbench(connstr: str, pg_bin: PgBin):