From ce1673a8c46c2e61a7d5e8509ccc563c7fbd2a30 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 21 Feb 2024 16:00:17 +0000
Subject: [PATCH] tests: improve stability of  tests using
 `wait_for_upload_queue_empty` (#6856)

## Problem

PR #6834 introduced an assertion that the sets of metric labels on
finished operations should equal those on started operations, which is
not true if no operations have finished yet for a particular set of
labels.

## Summary of changes

- Instead of asserting out, wait and re-check in the case that finished
metrics don't match started
---
 test_runner/fixtures/pageserver/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 225cfcd143..1415038f69 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -219,6 +219,7 @@ def wait_for_last_record_lsn(
 def wait_for_upload_queue_empty(
     pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
+    wait_period_secs = 0.2
     while True:
         all_metrics = pageserver_http.get_metrics()
         started = all_metrics.query_all(
@@ -235,7 +236,7 @@ def wait_for_upload_queue_empty(
                 "timeline_id": str(timeline_id),
             },
         )
-        assert len(started) == len(finished)
+
         # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth
         remaining_labels = ["shard_id", "file_kind", "op_kind"]
         tl: List[Tuple[Any, float]] = []
@@ -256,7 +257,7 @@ def wait_for_upload_queue_empty(
             log.info(f"  {labels}: {queue_count}")
         if all(queue_count == 0 for (_, queue_count) in tl):
             return
-        time.sleep(0.2)
+        time.sleep(wait_period_secs)
 
 
 def wait_timeline_detail_404(