tests: fix metrics check in test_s3_eviction (#8419)

## Problem

This test would occasionally fail its metric check. The eviction metrics
reset to zero when a safekeeper restarts, so the check at the end of the
test could fail in the rare case that every node had been restarted since
its most recent eviction.

The metric check was added in
https://github.com/neondatabase/neon/pull/8348

## Summary of changes

- Check the metrics before each restart and accumulate the result into a
  bool that we assert on at the end of the test.
This commit is contained in:
John Spray
2024-07-18 10:14:56 +01:00
committed by GitHub
parent a2d170b6d0
commit 7672e49ab5

View File

@@ -2242,6 +2242,8 @@ def test_s3_eviction(
check_values = [0] * n_timelines
event_metrics_seen = False
n_iters = 20
for _ in range(n_iters):
if log.isEnabledFor(logging.DEBUG):
@@ -2266,6 +2268,27 @@ def test_s3_eviction(
# update remote_consistent_lsn on pageserver
ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True)
# Do metrics check before restarts, since these will reset to zero across a restart
event_metrics_seen |= any(
sk.http_client().get_metric_value(
"safekeeper_eviction_events_started_total", {"kind": "evict"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "evict"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_started_total", {"kind": "restore"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "restore"}
)
or 0 > 0
for sk in env.safekeepers
)
# restarting random safekeepers
for sk in env.safekeepers:
if random.random() < restart_chance:
@@ -2280,22 +2303,4 @@ def test_s3_eviction(
for sk in env.safekeepers
)
assert any(
sk.http_client().get_metric_value(
"safekeeper_eviction_events_started_total", {"kind": "evict"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "evict"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_started_total", {"kind": "restore"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "restore"}
)
or 0 > 0
for sk in env.safekeepers
)
assert event_metrics_seen