From 0159ae9536d6b9e0a9cb27b0ced3fd244faf63d0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Jul 2024 17:05:35 +0100 Subject: [PATCH] safekeeper: eviction metrics (#8348) ## Problem Follow up to https://github.com/neondatabase/neon/pull/8335, to improve observability of how many evict/restores we are doing. ## Summary of changes - Add `safekeeper_eviction_events_started_total` and `safekeeper_eviction_events_completed_total`, with a "kind" label of evict or restore. This gives us rates, and also ability to calculate how many are in progress. - Generalize SafekeeperMetrics test type to use the same helpers as pageserver, and enable querying any metric. - Read the new metrics at the end of the eviction test. --- Cargo.lock | 2 + safekeeper/Cargo.toml | 2 + safekeeper/src/metrics.rs | 26 +++++++++++++ safekeeper/src/timeline_eviction.rs | 19 ++++++++++ test_runner/fixtures/safekeeper/http.py | 48 +++++++++++------------- test_runner/regress/test_wal_acceptor.py | 24 +++++++++++- 6 files changed, 92 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b1525edee..b31ac69e6c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5206,6 +5206,8 @@ dependencies = [ "sha2", "signal-hook", "storage_broker", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-io-timeout", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index a650d5e207..9f32016fd9 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -41,6 +41,8 @@ serde.workspace = true serde_json.workspace = true serde_with.workspace = true signal-hook.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 539ecf826b..aa2bafbe92 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") }); +// Metrics collected on operations on the storage repository. +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum EvictionEvent { + Evict, + Restore, +} + +pub(crate) static EVICTION_EVENTS_STARTED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_started_total", + "Number of eviction state changes, incremented when they start", + &["kind"] + ) + .expect("Failed to register metric") +}); + +pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_completed_total", + "Number of eviction state changes, incremented when they complete", + &["kind"] + ) + .expect("Failed to register metric") +}); + pub const LABEL_UNKNOWN: &str = "unknown"; /// Labels for traffic metrics. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index e4ab65290d..0b8d58ee8a 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; use crate::{ + metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, timeline_manager::{Manager, StateSnapshot}, wal_backup, wal_backup_partial::{self, PartialRemoteSegment}, @@ -66,6 +67,15 @@ impl Manager { info!("starting eviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + }); + if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { warn!("failed to evict timeline: {:?}", e); return; @@ -88,6 +98,15 @@ impl Manager { info!("starting uneviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + }); + if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { warn!("failed to unevict timeline: {:?}", e); return; diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 11e6fef28f..a51b89744b 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -1,6 +1,5 @@ import json -import re -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import pytest @@ -8,6 +7,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics # Walreceiver as returned by sk's timeline status endpoint. @@ -31,15 +31,26 @@ class SafekeeperTimelineStatus: walreceivers: List[Walreceiver] -@dataclass -class SafekeeperMetrics: +class SafekeeperMetrics(Metrics): + # Helpers to get metrics from tests without hardcoding the metric names there. # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + def __init__(self, m: Metrics): + self.metrics = m.metrics + + def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value + + def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value -class SafekeeperHttpClient(requests.Session): +class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): @@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session): return res_json def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() return request_result.text def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) - ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7efd86e349..e0ad4fdd5c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) - m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) + m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) + m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. @@ -2274,3 +2274,23 @@ def test_s3_eviction( and sk.log_contains("successfully restored evicted timeline") for sk in env.safekeepers ) + + assert any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + )