From db900ae9d0bc01aa38d95938325b29b30eb4b4cf Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Fri, 25 Oct 2024 07:50:47 -0400
Subject: [PATCH] fix(test): remove too strict layers_removed==0 check in
test_readonly_node_gc (#9506)
Fixes #9098
## Problem
`test_readonly_node_gc` is flaky. As shown in [Allure
Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9469/11444519440/index.html#suites/3ccffb1d100105b98aed3dc19b717917/2c02073738fa2b39),
we would get a `AssertionError: No layers should be removed, old layers
are guarded by leases.` after the test restarts pageservers or after
reconfigure pageservers.
During the investigation, we found that the layers has LSN (`0/1563088`)
greater than the LSN (`0x1562000`) protected by the lease. For instance,
**Layers removed**
000000067F00000005000034540100000000-000000067F00000005000040050100000000__0000000001563088-00000001
(shard 0002)
000000068000000000000017E20000000001-010000000100000001000000000000000001__0000000001563088-00000001
(shard 0002)
**Lsn Lease Granted**
handle_make_lsn_lease{lsn=0/1562000 shard_id=0002
shard_id=0002}: lease created, valid until 2024-10-21
This means that these layers are not guarded by the leases: they are in
"future", not visible to the static endpoint.
## Summary of changes
- Remove the assertion layers_removed == 0 after trigger timeline GC
while holding the lease. Instead rely on the successful execution of
the`SELECT` query to test lease validity.
- Improve test logging
Signed-off-by: Yuchen Liang
---
test_runner/regress/test_readonly_node.py | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 30c69cb883..8151160477 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -169,23 +169,24 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
)
return last_flush_lsn
- def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
+ def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str):
"""
Trigger GC manually on all pageservers. Then run an `SELECT` query.
"""
for shard, ps in tenant_get_shards(env, env.initial_tenant):
client = ps.http_client()
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+ # Note: cannot assert on `layers_removed` here because it could be layers
+ # not guarded by the lease. Rely on successful execution of the query instead.
log.info(f"{gc_result=}")
- assert (
- gc_result["layers_removed"] == 0
- ), "No layers should be removed, old layers are guarded by leases."
-
with ep_static.cursor() as cur:
+ # Following query should succeed if pages are properly guarded by leases.
cur.execute("SELECT count(*) FROM t0")
assert cur.fetchone() == (ROW_COUNT,)
+ log.info(f"`SELECT` query succeed after GC, {ctx=}")
+
# Insert some records on main branch
with env.endpoints.create_start("main") as ep_main:
with ep_main.cursor() as cur:
@@ -210,9 +211,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
# Wait for static compute to renew lease at least once.
time.sleep(LSN_LEASE_LENGTH / 2)
- generate_updates_on_main(env, ep_main, i, end=100)
+ generate_updates_on_main(env, ep_main, 3, end=100)
- trigger_gc_and_select(env, ep_static)
+ trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart")
# Trigger Pageserver restarts
for ps in env.pageservers:
@@ -221,7 +222,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
time.sleep(LSN_LEASE_LENGTH / 2)
ps.start()
- trigger_gc_and_select(env, ep_static)
+ trigger_gc_and_select(env, ep_static, ctx="After pageservers restart")
# Reconfigure pageservers
env.pageservers[0].stop()
@@ -230,7 +231,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
)
env.storage_controller.reconcile_until_idle()
- trigger_gc_and_select(env, ep_static)
+ trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline")
# Do some update so we can increment latest_gc_cutoff
generate_updates_on_main(env, ep_main, i, end=100)