From db900ae9d0bc01aa38d95938325b29b30eb4b4cf Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Fri, 25 Oct 2024 07:50:47 -0400
Subject: [PATCH] fix(test): remove too strict layers_removed==0 check in
 test_readonly_node_gc (#9506)

Fixes #9098

## Problem

`test_readonly_node_gc` is flaky. As shown in the [Allure Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9469/11444519440/index.html#suites/3ccffb1d100105b98aed3dc19b717917/2c02073738fa2b39), we would get an `AssertionError: No layers should be removed, old layers are guarded by leases.` after the test restarts or reconfigures pageservers.

During the investigation, we found that the removed layers have an LSN (`0/1563088`) greater than the LSN (`0/1562000`) protected by the lease. For instance:

**Layers removed**

000000067F00000005000034540100000000-000000067F00000005000040050100000000__0000000001563088-00000001 (shard 0002)

000000068000000000000017E20000000001-010000000100000001000000000000000001__0000000001563088-00000001 (shard 0002)
**Lsn Lease Granted**
handle_make_lsn_lease{lsn=0/1562000 shard_id=0002}: lease created, valid until 2024-10-21
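
To make the mismatch concrete, here is a minimal sketch (not part of the patch) that extracts the hex LSN embedded in the first removed layer's file name and compares it with the leased LSN. The `__{lsn}-{generation}` split below is an assumption about the image-layer naming shown above:

```python
# Hypothetical sketch: compare the LSN encoded in a removed layer's file name
# against the leased LSN. Assumes "{key_start}-{key_end}__{lsn}-{generation}"
# naming for the image layers listed above.
layer_name = (
    "000000067F00000005000034540100000000"
    "-000000067F00000005000040050100000000"
    "__0000000001563088-00000001"
)
lease_lsn = 0x1562000  # lsn=0/1562000 from the lease log above

# Take the part after "__" and strip the trailing generation suffix.
layer_lsn = int(layer_name.split("__")[1].split("-")[0], 16)

assert layer_lsn == 0x1563088
assert layer_lsn > lease_lsn  # the layer sits above the leased LSN
```
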
This means that these layers are not guarded by the leases: they are in the
"future", not visible to the static endpoint.

## Summary of changes

- Remove the `layers_removed == 0` assertion after triggering timeline GC while
  holding the lease. Instead, rely on the successful execution of the `SELECT`
  query to test lease validity.
- Improve test logging

Signed-off-by: Yuchen Liang
---
 test_runner/regress/test_readonly_node.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 30c69cb883..8151160477 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -169,23 +169,24 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
         )
         return last_flush_lsn
 
-    def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
+    def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str):
         """
         Trigger GC manually on all pageservers. Then run an `SELECT` query.
         """
         for shard, ps in tenant_get_shards(env, env.initial_tenant):
             client = ps.http_client()
             gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+            # Note: cannot assert on `layers_removed` here because it could be layers
+            # not guarded by the lease. Rely on successful execution of the query instead.
             log.info(f"{gc_result=}")
 
-            assert (
-                gc_result["layers_removed"] == 0
-            ), "No layers should be removed, old layers are guarded by leases."
-
         with ep_static.cursor() as cur:
+            # Following query should succeed if pages are properly guarded by leases.
             cur.execute("SELECT count(*) FROM t0")
             assert cur.fetchone() == (ROW_COUNT,)
+        log.info(f"`SELECT` query succeed after GC, {ctx=}")
+
     # Insert some records on main branch
     with env.endpoints.create_start("main") as ep_main:
         with ep_main.cursor() as cur:
@@ -210,9 +211,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
 
             # Wait for static compute to renew lease at least once.
             time.sleep(LSN_LEASE_LENGTH / 2)
-            generate_updates_on_main(env, ep_main, i, end=100)
+            generate_updates_on_main(env, ep_main, 3, end=100)
 
-            trigger_gc_and_select(env, ep_static)
+            trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart")
 
             # Trigger Pageserver restarts
             for ps in env.pageservers:
@@ -221,7 +222,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
                 time.sleep(LSN_LEASE_LENGTH / 2)
                 ps.start()
 
-            trigger_gc_and_select(env, ep_static)
+            trigger_gc_and_select(env, ep_static, ctx="After pageservers restart")
 
             # Reconfigure pageservers
             env.pageservers[0].stop()
             env.storage_controller.node_configure(
                 env.pageservers[0].id, {"availability": "Offline"}
@@ -230,7 +231,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
             )
             env.storage_controller.reconcile_until_idle()
 
-            trigger_gc_and_select(env, ep_static)
+            trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline")
 
             # Do some update so we can increment latest_gc_cutoff
             generate_updates_on_main(env, ep_main, i, end=100)