safekeeper: eviction metrics (#8348)

## Problem

Follow up to https://github.com/neondatabase/neon/pull/8335, to improve
observability of how many evict/restores we are doing.

## Summary of changes

- Add `safekeeper_eviction_events_started_total` and
`safekeeper_eviction_events_completed_total`, with a "kind" label of
evict or restore. This gives us rates, and also ability to calculate how
many are in progress.
- Generalize SafekeeperMetrics test type to use the same helpers as
pageserver, and enable querying any metric.
- Read the new metrics at the end of the eviction test.
This commit is contained in:
John Spray
2024-07-11 17:05:35 +01:00
committed by GitHub
parent d9a82468e2
commit 0159ae9536
6 changed files with 92 additions and 29 deletions

2
Cargo.lock generated
View File

@@ -5206,6 +5206,8 @@ dependencies = [
"sha2",
"signal-hook",
"storage_broker",
"strum",
"strum_macros",
"thiserror",
"tokio",
"tokio-io-timeout",

View File

@@ -41,6 +41,8 @@ serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
signal-hook.workspace = true
strum.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["fs"] }
tokio-util = { workspace = true }

View File

@@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
.expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
});
// Metrics collected on operations on the storage repository.
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum EvictionEvent {
Evict,
Restore,
}
pub(crate) static EVICTION_EVENTS_STARTED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"safekeeper_eviction_events_started_total",
"Number of eviction state changes, incremented when they start",
&["kind"]
)
.expect("Failed to register metric")
});
pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"safekeeper_eviction_events_completed_total",
"Number of eviction state changes, incremented when they complete",
&["kind"]
)
.expect("Failed to register metric")
});
pub const LABEL_UNKNOWN: &str = "unknown";
/// Labels for traffic metrics.

View File

@@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn};
use utils::crashsafe::durable_rename;
use crate::{
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
timeline_manager::{Manager, StateSnapshot},
wal_backup,
wal_backup_partial::{self, PartialRemoteSegment},
@@ -66,6 +67,15 @@ impl Manager {
info!("starting eviction, using {:?}", partial_backup_uploaded);
EVICTION_EVENTS_STARTED
.with_label_values(&[EvictionEvent::Evict.into()])
.inc();
let _guard = scopeguard::guard((), |_| {
EVICTION_EVENTS_COMPLETED
.with_label_values(&[EvictionEvent::Evict.into()])
.inc();
});
if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
warn!("failed to evict timeline: {:?}", e);
return;
@@ -88,6 +98,15 @@ impl Manager {
info!("starting uneviction, using {:?}", partial_backup_uploaded);
EVICTION_EVENTS_STARTED
.with_label_values(&[EvictionEvent::Restore.into()])
.inc();
let _guard = scopeguard::guard((), |_| {
EVICTION_EVENTS_COMPLETED
.with_label_values(&[EvictionEvent::Restore.into()])
.inc();
});
if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
warn!("failed to unevict timeline: {:?}", e);
return;

View File

@@ -1,6 +1,5 @@
import json
import re
from dataclasses import dataclass, field
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
import pytest
@@ -8,6 +7,7 @@ import requests
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
# Walreceiver as returned by sk's timeline status endpoint.
@@ -31,15 +31,26 @@ class SafekeeperTimelineStatus:
walreceivers: List[Walreceiver]
@dataclass
class SafekeeperMetrics:
class SafekeeperMetrics(Metrics):
# Helpers to get metrics from tests without hardcoding the metric names there.
# These are metrics from Prometheus which uses float64 internally.
# As a consequence, values may differ from real original int64s.
flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
def __init__(self, m: Metrics):
self.metrics = m.metrics
def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
return self.query_one(
"safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
).value
def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
return self.query_one(
"safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
).value
class SafekeeperHttpClient(requests.Session):
class SafekeeperHttpClient(requests.Session, MetricsGetter):
HTTPError = requests.HTTPError
def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
@@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session):
return res_json
def get_metrics_str(self) -> str:
"""You probably want to use get_metrics() instead."""
request_result = self.get(f"http://localhost:{self.port}/metrics")
request_result.raise_for_status()
return request_result.text
def get_metrics(self) -> SafekeeperMetrics:
all_metrics_text = self.get_metrics_str()
metrics = SafekeeperMetrics()
for match in re.finditer(
r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
all_metrics_text,
re.MULTILINE,
):
metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
match.group(3)
)
for match in re.finditer(
r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
all_metrics_text,
re.MULTILINE,
):
metrics.commit_lsn_inexact[
(TenantId(match.group(1)), TimelineId(match.group(2)))
] = int(match.group(3))
return metrics
res = self.get_metrics_str()
return SafekeeperMetrics(parse_metrics(res))

View File

@@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
last_record_lsn=Lsn(timeline_detail["last_record_lsn"]),
)
for sk_m in sk_metrics:
m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)]))
m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)]))
m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id))))
m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id))))
for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
# Invariant. May be < when transaction is in progress.
@@ -2274,3 +2274,23 @@ def test_s3_eviction(
and sk.log_contains("successfully restored evicted timeline")
for sk in env.safekeepers
)
assert any(
sk.http_client().get_metric_value(
"safekeeper_eviction_events_started_total", {"kind": "evict"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "evict"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_started_total", {"kind": "restore"}
)
or 0 > 0
and sk.http_client().get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "restore"}
)
or 0 > 0
for sk in env.safekeepers
)