mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 09:30:37 +00:00
Simplify the test; the failure now looks like this:
```
2025-07-10 13:00:25.198 INFO [neon_fixtures.py:5643] caughtup=True, primary_lsn=0/53F63B0, secondary_lsn=0/53F63B0
2025-07-10 13:00:25.200 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:25.239 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:26.241 INFO [neon_fixtures.py:265] Hostname: localhost
2025-07-10 13:00:26.269 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63B0")
2025-07-10 13:00:27.271 INFO [neon_fixtures.py:265] Hostname: localhost
...
2025-07-10 13:00:35.542 INFO [test_hot_standby.py:241] tenant_shard_id.shard_index=ShardIndex(shard_number=0, shard_count=0): standby_horizon_at_ps=Lsn("0/14EEC38") secondary_apply_lsn=Lsn("0/53F63E8")
...
FAILED test_runner/regress/test_hot_standby.py::test_hot_standby_gc[debug-pg16-True] - Failed: standby_horizon didn't propagate within timeout_secs=10, this is holding up gc on secondary
```
This commit is contained in:
@@ -134,6 +134,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
|
||||
tenant_conf = {
|
||||
# set PITR interval to be small, so we can do GC
|
||||
"pitr_interval": "0 s",
|
||||
# this test is largely about PS GC behavior, we control it manually
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
}
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
|
||||
timeline_id = env.initial_timeline
|
||||
@@ -209,8 +212,14 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
|
||||
|
||||
wait_replica_caughtup(primary, secondary)
|
||||
|
||||
# Wait for standby horizon to catch up, then gc again.
|
||||
# (When we switch to leases (LKB-88) we need to wait for leases to expire.)
|
||||
# Wait for PS's view of standby horizon to catch up.
|
||||
# (When we switch to leases (LKB-88) we need to change this to watch the lease lsn move.)
|
||||
# (TODO: instead of checking impl details here, somehow assert that gc can delete layers now.
|
||||
# Tricky to do that without flakiness though.)
|
||||
# We already waited for replica to catch up, so, this timeout is strictly on
|
||||
# a few in-memory-only RPCs to propagate standby_horizon.
|
||||
timeout_secs = 10
|
||||
started_at = time.time()
|
||||
shards = tenant_get_shards(env, tenant_id, None)
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
client = pageserver.http_client()
|
||||
@@ -232,41 +241,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
|
||||
log.info(f"{tenant_shard_id.shard_index=}: {standby_horizon_at_ps=} {secondary_apply_lsn=}")
|
||||
if secondary_apply_lsn == standby_horizon_at_ps:
|
||||
break
|
||||
if time.time() - started_at > timeout_secs:
|
||||
pytest.fail(f"standby_horizon didn't propagate within {timeout_secs=}, this is holding up gc on secondary")
|
||||
time.sleep(1)
|
||||
client.timeline_checkpoint(tenant_shard_id, timeline_id)
|
||||
client.timeline_compact(tenant_shard_id, timeline_id)
|
||||
client.timeline_gc(tenant_shard_id, timeline_id, 0)
|
||||
|
||||
# Clear the cache in the standby, so that when we
|
||||
# re-execute the query, it will make GetPage
|
||||
# requests. This does not clear the last-written LSN cache
|
||||
# so we still remember the LSNs of the pages.
|
||||
secondary.clear_buffers(cursor=s_cur)
|
||||
|
||||
if pause_apply:
|
||||
s_cur.execute("SELECT pg_wal_replay_pause()")
|
||||
|
||||
# Do other stuff on the primary, to advance the WAL
|
||||
p_cur.execute("CREATE TABLE test3 AS SELECT generate_series(1, 1000000) AS g")
|
||||
|
||||
# Run GC. The PITR interval is very small, so this advances the GC cutoff LSN
|
||||
# very close to the primary's current insert LSN.
|
||||
shards = tenant_get_shards(env, tenant_id, None)
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
client = pageserver.http_client()
|
||||
client.timeline_checkpoint(tenant_shard_id, timeline_id)
|
||||
client.timeline_compact(tenant_shard_id, timeline_id)
|
||||
client.timeline_gc(tenant_shard_id, timeline_id, 0)
|
||||
|
||||
# Re-execute the query. The GetPage requests that this
|
||||
# generates use old not_modified_since LSNs, older than
|
||||
# the GC cutoff, but new request LSNs. (In protocol
|
||||
# version 1 there was only one LSN, and this failed.)
|
||||
log_replica_lag(primary, secondary)
|
||||
s_cur.execute("SELECT COUNT(*) FROM test")
|
||||
log_replica_lag(primary, secondary)
|
||||
res = s_cur.fetchone()
|
||||
assert res == (10000,)
|
||||
|
||||
|
||||
def run_pgbench(connstr: str, pg_bin: PgBin):
|
||||
|
||||
Reference in New Issue
Block a user