tests: improve stability of tests using wait_for_upload_queue_empty (#6856)

## Problem

PR #6834 introduced an assertion that the set of metric labels on finished operations should equal the set on started operations. This is not true if no operations have finished yet for a particular set of labels.

## Summary of changes

- Instead of failing the assertion, wait and re-check when the finished metrics don't match the started metrics.
2026-01-08 14:02:55 +00:00 · 2024-02-21 16:00:17 +00:00
parent 532b0fa52b
commit ce1673a8c4
1 changed file with 3 additions and 2 deletions
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -219,6 +219,7 @@ def wait_for_last_record_lsn(
 def wait_for_upload_queue_empty(
    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
+    wait_period_secs = 0.2
    while True:
        all_metrics = pageserver_http.get_metrics()
        started = all_metrics.query_all(
@@ -235,7 +236,7 @@ def wait_for_upload_queue_empty(
                "timeline_id": str(timeline_id),
            },
        )
-        assert len(started) == len(finished)
+
        # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth
        remaining_labels = ["shard_id", "file_kind", "op_kind"]
        tl: List[Tuple[Any, float]] = []
@@ -256,7 +257,7 @@ def wait_for_upload_queue_empty(
            log.info(f"  {labels}: {queue_count}")
        if all(queue_count == 0 for (_, queue_count) in tl):
            return
-        time.sleep(0.2)
+        time.sleep(wait_period_secs)


 def wait_timeline_detail_404(