From ce1673a8c46c2e61a7d5e8509ccc563c7fbd2a30 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 21 Feb 2024 16:00:17 +0000 Subject: [PATCH] tests: improve stability of tests using `wait_for_upload_queue_empty` (#6856) ## Problem PR #6834 introduced an assertion that the sets of metric labels on finished operations should equal those on started operations, which is not true if no operations have finished yet for a particular set of labels. ## Summary of changes - Instead of failing the assertion, wait and re-check when the finished metrics do not yet match the started metrics --- test_runner/fixtures/pageserver/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 225cfcd143..1415038f69 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -219,6 +219,7 @@ def wait_for_last_record_lsn( def wait_for_upload_queue_empty( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): + wait_period_secs = 0.2 while True: all_metrics = pageserver_http.get_metrics() started = all_metrics.query_all( @@ -235,7 +236,7 @@ def wait_for_upload_queue_empty( "timeline_id": str(timeline_id), }, ) - assert len(started) == len(finished) + # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth remaining_labels = ["shard_id", "file_kind", "op_kind"] tl: List[Tuple[Any, float]] = [] @@ -256,7 +257,7 @@ def wait_for_upload_queue_empty( log.info(f" {labels}: {queue_count}") if all(queue_count == 0 for (_, queue_count) in tl): return - time.sleep(0.2) + time.sleep(wait_period_secs) def wait_timeline_detail_404(