From 7672e49ab530eed265b39bf62c3d44e7750f8303 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Jul 2024 10:14:56 +0100 Subject: [PATCH] tests: fix metrics check in test_s3_eviction (#8419) ## Problem This test would occasionally fail its metric check. This could happen in the rare case that the nodes had all been restarted before their most recent eviction. The metric check was added in https://github.com/neondatabase/neon/pull/8348 ## Summary of changes - Check metrics before each restart, accumulate into a bool that we assert on at the end of the test --- test_runner/regress/test_wal_acceptor.py | 43 +++++++++++++----------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2e906e6160..f02f19c588 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2242,6 +2242,8 @@ def test_s3_eviction( check_values = [0] * n_timelines + event_metrics_seen = False + n_iters = 20 for _ in range(n_iters): if log.isEnabledFor(logging.DEBUG): @@ -2266,6 +2268,27 @@ def test_s3_eviction( # update remote_consistent_lsn on pageserver ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + # Do metrics check before restarts, since these will reset to zero across a restart + event_metrics_seen |= any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) + # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: @@ -2280,22 +2303,4 @@ def test_s3_eviction( for sk in env.safekeepers ) - assert any( - sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "evict"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "evict"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "restore"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "restore"} - ) - or 0 > 0 - for sk in env.safekeepers - ) + assert event_metrics_seen