From 1bb434ab747ae500ef04e9aa671acb4f9eb9d956 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 29 Jul 2025 10:23:42 -0400 Subject: [PATCH] fix(test): test_readonly_node_gc compute needs time to acquire lease (#12747) ## Problem Part of LKB-2368. Compute fails to obtain LSN lease in this test case. There are many assumptions around how compute obtains the leases, and in this particular test case, as the LSN lease length is only 8s (which is shorter than the amount of time where pageserver can restart and compute can reconnect in terms of force stop), it sometimes causes issues. ## Summary of changes Add more sleeps around the test case to at least ensure it's stable. We need to find a more reliable way to test this in the future. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_readonly_node.py | 28 +++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 5612236250..e151b0ba13 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -129,7 +129,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): Test static endpoint is protected from GC by acquiring and renewing lsn leases. """ - LSN_LEASE_LENGTH = 8 + LSN_LEASE_LENGTH = ( + 14 # This value needs to be large enough for compute_ctl to send two lease requests. + ) + neon_env_builder.num_pageservers = 2 # GC is manual triggered. env = neon_env_builder.init_start( @@ -230,6 +233,15 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): log.info(f"`SELECT` query succeed after GC, {ctx=}") return offset + # It's not reliable to let the compute renew the lease in this test case as we have a very tight + # lease timeout. Therefore, the test case itself will renew the lease. + # + # This is a workaround to make the test case more deterministic. 
+ def renew_lease(env: NeonEnv, lease_lsn: Lsn): + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, lease_lsn + ) + # Insert some records on main branch with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main: with ep_main.cursor() as cur: @@ -242,6 +254,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): XLOG_BLCKSZ = 8192 lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ) + # We need to mock the way cplane works: it gets a lease for a branch before starting the compute. + renew_lease(env, lsn) + with env.endpoints.create_start( branch_name="main", endpoint_id="static", @@ -251,9 +266,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): cur.execute("SELECT count(*) FROM t0") assert cur.fetchone() == (ROW_COUNT,) - # Wait for static compute to renew lease at least once. - time.sleep(LSN_LEASE_LENGTH / 2) - generate_updates_on_main(env, ep_main, 3, end=100) offset = trigger_gc_and_select( @@ -263,10 +275,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Trigger Pageserver restarts for ps in env.pageservers: ps.stop() - # Static compute should have at least one lease request failure due to connection. - time.sleep(LSN_LEASE_LENGTH / 2) ps.start() + renew_lease(env, lsn) + trigger_gc_and_select( env, ep_static, @@ -282,6 +294,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) env.storage_controller.reconcile_until_idle() + # Wait for static compute to renew lease on the new pageserver. + time.sleep(LSN_LEASE_LENGTH + 3) + trigger_gc_and_select( env, ep_static, @@ -292,7 +307,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Do some update so we can increment gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) - # Wait for the existing lease to expire. time.sleep(LSN_LEASE_LENGTH + 1) # Now trigger GC again, layers should be removed.