fix(test): wait compaction in timeline offload test (#12673)

## Problem Closes LKB-753. `test_pageserver_metrics_removed_after_offload` is unstable and sometimes leaves the metrics behind after tenant offloading. It turns out that we triggered an image compaction before the offload, and the job was only stopped after the offload request had completed. ## Summary of changes Wait for all background tasks to finish before checking the metrics. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-12-22 21:59:59 +00:00 · 2025-07-28 12:27:55 -04:00
parent 40cae8cc36
commit fe7a4e1ab6
1 changed files with 31 additions and 3 deletions
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -298,15 +298,26 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
        assert post_detach_samples == set()


-def test_pageserver_metrics_removed_after_offload(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("compaction", ["compaction_enabled", "compaction_disabled"])
+def test_pageserver_metrics_removed_after_offload(
+    neon_env_builder: NeonEnvBuilder, compaction: str
+):
    """Tests that when a timeline is offloaded, the tenant specific metrics are not left behind"""

    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
-
    neon_env_builder.num_safekeepers = 3

    env = neon_env_builder.init_start()
-    tenant_1, _ = env.create_tenant()
+    tenant_1, _ = env.create_tenant(
+        conf={
+            # disable background compaction and GC so that we don't have leftover tasks
+            # after offloading.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+        }
+        if compaction == "compaction_disabled"
+        else None
+    )

    timeline_1 = env.create_timeline("test_metrics_removed_after_offload_1", tenant_id=tenant_1)
    timeline_2 = env.create_timeline("test_metrics_removed_after_offload_2", tenant_id=tenant_1)
@@ -351,6 +362,23 @@ def test_pageserver_metrics_removed_after_offload(neon_env_builder: NeonEnvBuild
            state=TimelineArchivalState.ARCHIVED,
        )
        env.pageserver.http_client().timeline_offload(tenant_1, timeline)
+        # We need to wait until all background jobs are finished before we can check the metrics.
+        # There are many of them: compaction, GC, etc.
+        wait_until(
+            lambda: all(
+                sample.value == 0
+                for sample in env.pageserver.http_client()
+                .get_metrics()
+                .query_all("pageserver_background_loop_semaphore_waiting_tasks")
+            )
+            and all(
+                sample.value == 0
+                for sample in env.pageserver.http_client()
+                .get_metrics()
+                .query_all("pageserver_background_loop_semaphore_running_tasks")
+            )
+        )
+
        post_offload_samples = set(
            [x.name for x in get_ps_metric_samples_for_timeline(tenant_1, timeline)]
        )