safekeeper: eviction metrics (#8348)

## Problem

Follow-up to https://github.com/neondatabase/neon/pull/8335, to improve
observability of how many evictions/restores we are doing.

## Summary of changes

- Add `safekeeper_eviction_events_started_total` and
`safekeeper_eviction_events_completed_total`, with a "kind" label of
`evict` or `restore`. This gives us rates, and also the ability to calculate
how many are in progress (started minus completed).
- Generalize SafekeeperMetrics test type to use the same helpers as
pageserver, and enable querying any metric.
- Read the new metrics at the end of the eviction test.
This commit is contained in:
John Spray
2024-07-11 17:05:35 +01:00
committed by GitHub
parent d9a82468e2
commit 0159ae9536
6 changed files with 92 additions and 29 deletions

View File

@@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
last_record_lsn=Lsn(timeline_detail["last_record_lsn"]),
)
for sk_m in sk_metrics:
m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)]))
m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)]))
m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id))))
m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id))))
for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
# Invariant. May be < when transaction is in progress.
@@ -2274,3 +2274,23 @@ def test_s3_eviction(
and sk.log_contains("successfully restored evicted timeline")
for sk in env.safekeepers
)
# At least one safekeeper must report a non-zero value for every eviction
# counter: started/completed, for both the "evict" and "restore" kinds.
#
# The `or 0` fallback is kept from the original code (presumably
# get_metric_value can return None when the series is absent -- confirm
# against the HTTP client helper). It is parenthesized because `>` binds
# tighter than `and`/`or`: the unparenthesized form `value or 0 > 0 and ...`
# parses as `value or ((0 > 0) and ...)`, which only ever checked the first
# metric for truthiness and never evaluated the other three.
assert any(
    all(
        (sk.http_client().get_metric_value(metric, {"kind": kind}) or 0) > 0
        for kind in ("evict", "restore")
        for metric in (
            "safekeeper_eviction_events_started_total",
            "safekeeper_eviction_events_completed_total",
        )
    )
    for sk in env.safekeepers
)