mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 11:40:38 +00:00
fix(test): remove too strict layers_removed==0 check in test_readonly_node_gc (#9506)
Fixes #9098 ## Problem `test_readonly_node_gc` is flaky. As shown in [Allure Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9469/11444519440/index.html#suites/3ccffb1d100105b98aed3dc19b717917/2c02073738fa2b39), we would get a `AssertionError: No layers should be removed, old layers are guarded by leases.` after the test restarts pageservers or after reconfigure pageservers. During the investigation, we found that the layers has LSN (`0/1563088`) greater than the LSN (`0x1562000`) protected by the lease. For instance, **Layers removed** <pre> 000000067F00000005000034540100000000-000000067F00000005000040050100000000__000000000<b><i>1563088</i></b>-00000001 (shard 0002) 000000068000000000000017E20000000001-010000000100000001000000000000000001__000000000<b><i>1563088</i></b>-00000001 (shard 0002) </pre> **Lsn Lease Granted** <pre> handle_make_lsn_lease{lsn=<b><i>0/1562000</i></b> shard_id=0002 shard_id=0002}: lease created, valid until 2024-10-21 </pre> This means that these layers are not guarded by the leases: they are in "future", not visible to the static endpoint. ## Summary of changes - Remove the assertion layers_removed == 0 after trigger timeline GC while holding the lease. Instead rely on the successful execution of the`SELECT` query to test lease validity. - Improve test logging Signed-off-by: Yuchen Liang <yuchen@neon.tech>
This commit is contained in:
@@ -169,23 +169,24 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
return last_flush_lsn
|
||||
|
||||
def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
|
||||
def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str):
|
||||
"""
|
||||
Trigger GC manually on all pageservers. Then run an `SELECT` query.
|
||||
"""
|
||||
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
||||
client = ps.http_client()
|
||||
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
|
||||
# Note: cannot assert on `layers_removed` here because it could be layers
|
||||
# not guarded by the lease. Rely on successful execution of the query instead.
|
||||
log.info(f"{gc_result=}")
|
||||
|
||||
assert (
|
||||
gc_result["layers_removed"] == 0
|
||||
), "No layers should be removed, old layers are guarded by leases."
|
||||
|
||||
with ep_static.cursor() as cur:
|
||||
# Following query should succeed if pages are properly guarded by leases.
|
||||
cur.execute("SELECT count(*) FROM t0")
|
||||
assert cur.fetchone() == (ROW_COUNT,)
|
||||
|
||||
log.info(f"`SELECT` query succeed after GC, {ctx=}")
|
||||
|
||||
# Insert some records on main branch
|
||||
with env.endpoints.create_start("main") as ep_main:
|
||||
with ep_main.cursor() as cur:
|
||||
@@ -210,9 +211,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
# Wait for static compute to renew lease at least once.
|
||||
time.sleep(LSN_LEASE_LENGTH / 2)
|
||||
|
||||
generate_updates_on_main(env, ep_main, i, end=100)
|
||||
generate_updates_on_main(env, ep_main, 3, end=100)
|
||||
|
||||
trigger_gc_and_select(env, ep_static)
|
||||
trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart")
|
||||
|
||||
# Trigger Pageserver restarts
|
||||
for ps in env.pageservers:
|
||||
@@ -221,7 +222,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
time.sleep(LSN_LEASE_LENGTH / 2)
|
||||
ps.start()
|
||||
|
||||
trigger_gc_and_select(env, ep_static)
|
||||
trigger_gc_and_select(env, ep_static, ctx="After pageservers restart")
|
||||
|
||||
# Reconfigure pageservers
|
||||
env.pageservers[0].stop()
|
||||
@@ -230,7 +231,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
trigger_gc_and_select(env, ep_static)
|
||||
trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline")
|
||||
|
||||
# Do some update so we can increment latest_gc_cutoff
|
||||
generate_updates_on_main(env, ep_main, i, end=100)
|
||||
|
||||
Reference in New Issue
Block a user