Simplify the test; the failure now looks like this:

```
2025-07-10 13:00:25.198 INFO [neon_fixtures.py:5643] caughtup=True, primary_lsn=0/53F63B0, secondary_lsn=0/53F63B0
2025-07-10 13:00:25.200 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:25.239 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:26.241 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:26.269 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:27.271 INFO [neon_fixtures.py:265] Hostname: localhost
...
2025-07-10 13:00:35.542 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63E8")
...
FAILED test_runner/regress/test_hot_standby.py::test_hot_standby_gc[debug-pg16-True] - Failed: standby_horizon didn't propagate within timeout_secs=10, this is holding up gc on secondary
```
This commit is contained in:
Christian Schwarz
2025-07-10 13:00:43 +00:00
parent 45f5dfc685
commit d2e72f119f

View File

@@ -134,6 +134,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
tenant_conf = {
# set PITR interval to be small, so we can do GC
"pitr_interval": "0 s",
# this test is largely about PS GC behavior, we control it manually
"gc_period": "0s",
"compaction_period": "0s",
}
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
timeline_id = env.initial_timeline
@@ -209,8 +212,14 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
wait_replica_caughtup(primary, secondary)
# Wait for standby horizon to catch up, then gc again.
# (When we switch to leases (LKB-88) we need to wait for leases to expire.)
# Wait for PS's view of standby horizon to catch up.
# (When we switch to leases (LKB-88) we need to change this to watch the lease lsn move.)
# (TODO: instead of checking impl details here, somehow assert that gc can delete layers now.
# Tricky to do that without flakiness though.)
# We already waited for the replica to catch up, so this timeout is strictly on
# a few in-memory-only RPCs to propagate standby_horizon.
timeout_secs = 10
started_at = time.time()
shards = tenant_get_shards(env, tenant_id, None)
for tenant_shard_id, pageserver in shards:
client = pageserver.http_client()
@@ -232,41 +241,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
log.info(f"{tenant_shard_id.shard_index=}: {standby_horizon_at_ps=} {secondary_apply_lsn=}")
if secondary_apply_lsn == standby_horizon_at_ps:
break
if time.time() - started_at > timeout_secs:
pytest.fail(f"standby_horizon didn't propagate within {timeout_secs=}, this is holding up gc on secondary")
time.sleep(1)
client.timeline_checkpoint(tenant_shard_id, timeline_id)
client.timeline_compact(tenant_shard_id, timeline_id)
client.timeline_gc(tenant_shard_id, timeline_id, 0)
# Clear the cache in the standby, so that when we
# re-execute the query, it will make GetPage
# requests. This does not clear the last-written LSN cache
# so we still remember the LSNs of the pages.
secondary.clear_buffers(cursor=s_cur)
if pause_apply:
s_cur.execute("SELECT pg_wal_replay_pause()")
# Do other stuff on the primary, to advance the WAL
p_cur.execute("CREATE TABLE test3 AS SELECT generate_series(1, 1000000) AS g")
# Run GC. The PITR interval is very small, so this advances the GC cutoff LSN
# very close to the primary's current insert LSN.
shards = tenant_get_shards(env, tenant_id, None)
for tenant_shard_id, pageserver in shards:
client = pageserver.http_client()
client.timeline_checkpoint(tenant_shard_id, timeline_id)
client.timeline_compact(tenant_shard_id, timeline_id)
client.timeline_gc(tenant_shard_id, timeline_id, 0)
# Re-execute the query. The GetPage requests that this
# generates use old not_modified_since LSNs, older than
# the GC cutoff, but new request LSNs. (In protocol
# version 1 there was only one LSN, and this failed.)
log_replica_lag(primary, secondary)
s_cur.execute("SELECT COUNT(*) FROM test")
log_replica_lag(primary, secondary)
res = s_cur.fetchone()
assert res == (10000,)
def run_pgbench(connstr: str, pg_bin: PgBin):