From 0159ae9536d6b9e0a9cb27b0ced3fd244faf63d0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 11 Jul 2024 17:05:35 +0100
Subject: [PATCH] safekeeper: eviction metrics (#8348)

## Problem

Follow up to https://github.com/neondatabase/neon/pull/8335, to improve
observability of how many evictions and restores we are doing.

## Summary of changes

- Add `safekeeper_eviction_events_started_total` and
`safekeeper_eviction_events_completed_total`, with a "kind" label of
`evict` or `restore`. This gives us rates, and also the ability to calculate
how many are in progress (started minus completed).
- Generalize SafekeeperMetrics test type to use the same helpers as
pageserver, and enable querying any metric.
- Read the new metrics at the end of the eviction test.
---
 Cargo.lock                               |  2 +
 safekeeper/Cargo.toml                    |  2 +
 safekeeper/src/metrics.rs                | 26 +++++++++++++
 safekeeper/src/timeline_eviction.rs      | 19 ++++++++++
 test_runner/fixtures/safekeeper/http.py  | 48 +++++++++++-------------
 test_runner/regress/test_wal_acceptor.py | 24 +++++++++++-
 6 files changed, 92 insertions(+), 29 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 4b1525edee..b31ac69e6c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5206,6 +5206,8 @@ dependencies = [
  "sha2",
  "signal-hook",
  "storage_broker",
+ "strum",
+ "strum_macros",
  "thiserror",
  "tokio",
  "tokio-io-timeout",
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index a650d5e207..9f32016fd9 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -41,6 +41,8 @@ serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
 signal-hook.workspace = true
+strum.workspace = true
+strum_macros.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["fs"] }
 tokio-util = { workspace = true }
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 539ecf826b..aa2bafbe92 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
     .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
 });
 
+/// Eviction state change; rendered in kebab-case as the "kind" label of the counters below.
+#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum EvictionEvent {
+    Evict,
+    Restore,
+}
+
+pub(crate) static EVICTION_EVENTS_STARTED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_eviction_events_started_total",
+        "Number of eviction state changes, incremented when they start",
+        &["kind"]
+    )
+    .expect("Failed to register metric")
+});
+
+pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_eviction_events_completed_total",
+        "Number of eviction state changes, incremented when they complete",
+        &["kind"]
+    )
+    .expect("Failed to register metric")
+});
+
 pub const LABEL_UNKNOWN: &str = "unknown";
 
 /// Labels for traffic metrics.
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index e4ab65290d..0b8d58ee8a 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn};
 use utils::crashsafe::durable_rename;
 
 use crate::{
+    metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
     timeline_manager::{Manager, StateSnapshot},
     wal_backup,
     wal_backup_partial::{self, PartialRemoteSegment},
@@ -66,6 +67,15 @@ impl Manager {
 
         info!("starting eviction, using {:?}", partial_backup_uploaded);
 
+        EVICTION_EVENTS_STARTED
+            .with_label_values(&[EvictionEvent::Evict.into()])
+            .inc();
+        let _guard = scopeguard::guard((), |_| {
+            EVICTION_EVENTS_COMPLETED
+                .with_label_values(&[EvictionEvent::Evict.into()])
+                .inc();
+        });
+
         if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
             warn!("failed to evict timeline: {:?}", e);
             return;
@@ -88,6 +98,15 @@ impl Manager {
 
         info!("starting uneviction, using {:?}", partial_backup_uploaded);
 
+        EVICTION_EVENTS_STARTED
+            .with_label_values(&[EvictionEvent::Restore.into()])
+            .inc();
+        let _guard = scopeguard::guard((), |_| {
+            EVICTION_EVENTS_COMPLETED
+                .with_label_values(&[EvictionEvent::Restore.into()])
+                .inc();
+        });
+
         if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
             warn!("failed to unevict timeline: {:?}", e);
             return;
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 11e6fef28f..a51b89744b 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -1,6 +1,5 @@
 import json
-import re
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import pytest
@@ -8,6 +7,7 @@ import requests
 
 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 
 
 # Walreceiver as returned by sk's timeline status endpoint.
@@ -31,15 +31,26 @@ class SafekeeperTimelineStatus:
     walreceivers: List[Walreceiver]
 
 
-@dataclass
-class SafekeeperMetrics:
+class SafekeeperMetrics(Metrics):
+    # Helpers to get metrics from tests without hardcoding the metric names there.
     # These are metrics from Prometheus which uses float64 internally.
     # As a consequence, values may differ from real original int64s.
-    flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
-    commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
+
+    def __init__(self, m: Metrics):
+        self.metrics = m.metrics
+
+    def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
+        return self.query_one(
+            "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
+        ).value
+
+    def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
+        return self.query_one(
+            "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
+        ).value
 
 
-class SafekeeperHttpClient(requests.Session):
+class SafekeeperHttpClient(requests.Session, MetricsGetter):
     HTTPError = requests.HTTPError
 
     def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
@@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session):
         return res_json
 
     def get_metrics_str(self) -> str:
+        """You probably want to use get_metrics() instead."""
         request_result = self.get(f"http://localhost:{self.port}/metrics")
         request_result.raise_for_status()
         return request_result.text
 
     def get_metrics(self) -> SafekeeperMetrics:
-        all_metrics_text = self.get_metrics_str()
-
-        metrics = SafekeeperMetrics()
-        for match in re.finditer(
-            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
-                match.group(3)
-            )
-        for match in re.finditer(
-            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.commit_lsn_inexact[
-                (TenantId(match.group(1)), TimelineId(match.group(2)))
-            ] = int(match.group(3))
-        return metrics
+        res = self.get_metrics_str()
+        return SafekeeperMetrics(parse_metrics(res))
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 7efd86e349..e0ad4fdd5c 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
                 last_record_lsn=Lsn(timeline_detail["last_record_lsn"]),
             )
             for sk_m in sk_metrics:
-                m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)]))
-                m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)]))
+                m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id))))
+                m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id))))
 
             for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
                 # Invariant. May be < when transaction is in progress.
@@ -2274,3 +2274,23 @@ def test_s3_eviction(
         and sk.log_contains("successfully restored evicted timeline")
         for sk in env.safekeepers
     )
+
+    # Eviction and restore must each have been started and completed on at least one
+    # safekeeper. The `or 0` fallback from the original check is kept, but it
+    # must stay parenthesized: a bare `value or 0 > 0` parses as `value or (0 > 0)`,
+    # which only tests the truthiness of the first metric.
+    def eviction_events(sk, metric: str, kind: str) -> float:
+        return sk.http_client().get_metric_value(metric, {"kind": kind}) or 0
+
+    assert any(
+        all(
+            eviction_events(sk, metric, kind) > 0
+            for metric, kind in [
+                ("safekeeper_eviction_events_started_total", "evict"),
+                ("safekeeper_eviction_events_completed_total", "evict"),
+                ("safekeeper_eviction_events_started_total", "restore"),
+                ("safekeeper_eviction_events_completed_total", "restore"),
+            ]
+        )
+        for sk in env.safekeepers
+    )