From fe7a4e1ab68a4e1252118aa197aad497d8937bc0 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Jul 2025 12:27:55 -0400 Subject: [PATCH] fix(test): wait compaction in timeline offload test (#12673) ## Problem close LKB-753. `test_pageserver_metrics_removed_after_offload` is unstable and it sometimes leave the metrics behind after tenant offloading. It turns out that we triggered an image compaction before the offload and the job was stopped after the offload request was completed. ## Summary of changes Wait all background tasks to finish before checking the metrics. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_tenants.py | 34 ++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 7f32f34d36..49bc02a3e7 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -298,15 +298,26 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde assert post_detach_samples == set() -def test_pageserver_metrics_removed_after_offload(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("compaction", ["compaction_enabled", "compaction_disabled"]) +def test_pageserver_metrics_removed_after_offload( + neon_env_builder: NeonEnvBuilder, compaction: str +): """Tests that when a timeline is offloaded, the tenant specific metrics are not left behind""" neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_1, _ = env.create_tenant() + tenant_1, _ = env.create_tenant( + conf={ + # disable background compaction and GC so that we don't have leftover tasks + # after offloading. + "gc_period": "0s", + "compaction_period": "0s", + } + if compaction == "compaction_disabled" + else None + ) timeline_1 = env.create_timeline("test_metrics_removed_after_offload_1", tenant_id=tenant_1) timeline_2 = env.create_timeline("test_metrics_removed_after_offload_2", tenant_id=tenant_1) @@ -351,6 +362,23 @@ def test_pageserver_metrics_removed_after_offload(neon_env_builder: NeonEnvBuild state=TimelineArchivalState.ARCHIVED, ) env.pageserver.http_client().timeline_offload(tenant_1, timeline) + # We need to wait until all background jobs are finished before we can check the metrics. + # There're many of them: compaction, GC, etc. + wait_until( + lambda: all( + sample.value == 0 + for sample in env.pageserver.http_client() + .get_metrics() + .query_all("pageserver_background_loop_semaphore_waiting_tasks") + ) + and all( + sample.value == 0 + for sample in env.pageserver.http_client() + .get_metrics() + .query_all("pageserver_background_loop_semaphore_running_tasks") + ) + ) + post_offload_samples = set( [x.name for x in get_ps_metric_samples_for_timeline(tenant_1, timeline)] )