safekeeper: eviction metrics (#8348)

## Problem Follow up to https://github.com/neondatabase/neon/pull/8335, to improve observability of how many evict/restores we are doing. ## Summary of changes - Add `safekeeper_eviction_events_started_total` and `safekeeper_eviction_events_completed_total`, with a "kind" label of evict or restore. This gives us rates, and also ability to calculate how many are in progress. - Generalize SafekeeperMetrics test type to use the same helpers as pageserver, and enable querying any metric. - Read the new metrics at the end of the eviction test.
2026-01-05 20:42:54 +00:00 · 2024-07-11 17:05:35 +01:00
parent d9a82468e2
commit 0159ae9536
6 changed files with 92 additions and 29 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5206,6 +5206,8 @@ dependencies = [
 "sha2",
 "signal-hook",
 "storage_broker",
+ "strum",
+ "strum_macros",
 "thiserror",
 "tokio",
 "tokio-io-timeout",
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -41,6 +41,8 @@ serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
 signal-hook.workspace = true
+strum.workspace = true
+strum_macros.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["fs"] }
 tokio-util = { workspace = true }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
    .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
 });

+// Metrics collected on operations on the storage repository.
+#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum EvictionEvent {
+    Evict,
+    Restore,
+}
+
+pub(crate) static EVICTION_EVENTS_STARTED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_eviction_events_started_total",
+        "Number of eviction state changes, incremented when they start",
+        &["kind"]
+    )
+    .expect("Failed to register metric")
+});
+
+pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_eviction_events_completed_total",
+        "Number of eviction state changes, incremented when they complete",
+        &["kind"]
+    )
+    .expect("Failed to register metric")
+});
+
 pub const LABEL_UNKNOWN: &str = "unknown";

 /// Labels for traffic metrics.
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn};
 use utils::crashsafe::durable_rename;

 use crate::{
+    metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
    timeline_manager::{Manager, StateSnapshot},
    wal_backup,
    wal_backup_partial::{self, PartialRemoteSegment},
@@ -66,6 +67,15 @@ impl Manager {

        info!("starting eviction, using {:?}", partial_backup_uploaded);

+        EVICTION_EVENTS_STARTED
+            .with_label_values(&[EvictionEvent::Evict.into()])
+            .inc();
+        let _guard = scopeguard::guard((), |_| {
+            EVICTION_EVENTS_COMPLETED
+                .with_label_values(&[EvictionEvent::Evict.into()])
+                .inc();
+        });
+
        if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
            warn!("failed to evict timeline: {:?}", e);
            return;
@@ -88,6 +98,15 @@ impl Manager {

        info!("starting uneviction, using {:?}", partial_backup_uploaded);

+        EVICTION_EVENTS_STARTED
+            .with_label_values(&[EvictionEvent::Restore.into()])
+            .inc();
+        let _guard = scopeguard::guard((), |_| {
+            EVICTION_EVENTS_COMPLETED
+                .with_label_values(&[EvictionEvent::Restore.into()])
+                .inc();
+        });
+
        if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
            warn!("failed to unevict timeline: {:?}", e);
            return;
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -1,6 +1,5 @@
 import json
-import re
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union

 import pytest
@@ -8,6 +7,7 @@ import requests

 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics


 # Walreceiver as returned by sk's timeline status endpoint.
@@ -31,15 +31,26 @@ class SafekeeperTimelineStatus:
    walreceivers: List[Walreceiver]


-@dataclass
-class SafekeeperMetrics:
+class SafekeeperMetrics(Metrics):
+    # Helpers to get metrics from tests without hardcoding the metric names there.
    # These are metrics from Prometheus which uses float64 internally.
    # As a consequence, values may differ from real original int64s.
-    flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
-    commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
+
+    def __init__(self, m: Metrics):
+        self.metrics = m.metrics
+
+    def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
+        return self.query_one(
+            "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
+        ).value
+
+    def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
+        return self.query_one(
+            "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
+        ).value


-class SafekeeperHttpClient(requests.Session):
+class SafekeeperHttpClient(requests.Session, MetricsGetter):
    HTTPError = requests.HTTPError

    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
@@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session):
        return res_json

    def get_metrics_str(self) -> str:
+        """You probably want to use get_metrics() instead."""
        request_result = self.get(f"http://localhost:{self.port}/metrics")
        request_result.raise_for_status()
        return request_result.text

    def get_metrics(self) -> SafekeeperMetrics:
-        all_metrics_text = self.get_metrics_str()
-
-        metrics = SafekeeperMetrics()
-        for match in re.finditer(
-            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
-                match.group(3)
-            )
-        for match in re.finditer(
-            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.commit_lsn_inexact[
-                (TenantId(match.group(1)), TimelineId(match.group(2)))
-            ] = int(match.group(3))
-        return metrics
+        res = self.get_metrics_str()
+        return SafekeeperMetrics(parse_metrics(res))
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
                last_record_lsn=Lsn(timeline_detail["last_record_lsn"]),
            )
            for sk_m in sk_metrics:
-                m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)]))
-                m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)]))
+                m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id))))
+                m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id))))

            for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
                # Invariant. May be < when transaction is in progress.
@@ -2274,3 +2274,23 @@ def test_s3_eviction(
        and sk.log_contains("successfully restored evicted timeline")
        for sk in env.safekeepers
    )
+
+    assert any(
+        sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_started_total", {"kind": "evict"}
+        )
+        or 0 > 0
+        and sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_completed_total", {"kind": "evict"}
+        )
+        or 0 > 0
+        and sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_started_total", {"kind": "restore"}
+        )
+        or 0 > 0
+        and sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_completed_total", {"kind": "restore"}
+        )
+        or 0 > 0
+        for sk in env.safekeepers
+    )