tests: improve stability of tests using wait_for_upload_queue_empty (#6856)

## Problem

PR #6834 introduced an assertion that the sets of metric labels on
finished operations should equal those on started operations, which is
not true if no operations have finished yet for a particular set of
labels.

## Summary of changes

- Instead of failing an assertion, wait and re-check when the label sets of
finished metrics do not yet match those of started metrics
This commit is contained in:
John Spray
2024-02-21 16:00:17 +00:00
committed by GitHub
parent 532b0fa52b
commit ce1673a8c4

View File

@@ -219,6 +219,7 @@ def wait_for_last_record_lsn(
def wait_for_upload_queue_empty(
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
wait_period_secs = 0.2
while True:
all_metrics = pageserver_http.get_metrics()
started = all_metrics.query_all(
@@ -235,7 +236,7 @@ def wait_for_upload_queue_empty(
"timeline_id": str(timeline_id),
},
)
assert len(started) == len(finished)
# this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth
remaining_labels = ["shard_id", "file_kind", "op_kind"]
tl: List[Tuple[Any, float]] = []
@@ -256,7 +257,7 @@ def wait_for_upload_queue_empty(
log.info(f" {labels}: {queue_count}")
if all(queue_count == 0 for (_, queue_count) in tl):
return
time.sleep(0.2)
time.sleep(wait_period_secs)
def wait_timeline_detail_404(