mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
safekeeper: eviction metrics (#8348)
## Problem Follow up to https://github.com/neondatabase/neon/pull/8335, to improve observability of how many evict/restores we are doing. ## Summary of changes - Add `safekeeper_eviction_events_started_total` and `safekeeper_eviction_events_completed_total`, with a "kind" label of evict or restore. This gives us rates, and also ability to calculate how many are in progress. - Generalize SafekeeperMetrics test type to use the same helpers as pageserver, and enable querying any metric. - Read the new metrics at the end of the eviction test.
This commit is contained in:
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -5206,6 +5206,8 @@ dependencies = [
|
||||
"sha2",
|
||||
"signal-hook",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
|
||||
@@ -41,6 +41,8 @@ serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["fs"] }
|
||||
tokio-util = { workspace = true }
|
||||
|
||||
@@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
|
||||
.expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum EvictionEvent {
|
||||
Evict,
|
||||
Restore,
|
||||
}
|
||||
|
||||
pub(crate) static EVICTION_EVENTS_STARTED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"safekeeper_eviction_events_started_total",
|
||||
"Number of eviction state changes, incremented when they start",
|
||||
&["kind"]
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"safekeeper_eviction_events_completed_total",
|
||||
"Number of eviction state changes, incremented when they complete",
|
||||
&["kind"]
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub const LABEL_UNKNOWN: &str = "unknown";
|
||||
|
||||
/// Labels for traffic metrics.
|
||||
|
||||
@@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn};
|
||||
use utils::crashsafe::durable_rename;
|
||||
|
||||
use crate::{
|
||||
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
|
||||
timeline_manager::{Manager, StateSnapshot},
|
||||
wal_backup,
|
||||
wal_backup_partial::{self, PartialRemoteSegment},
|
||||
@@ -66,6 +67,15 @@ impl Manager {
|
||||
|
||||
info!("starting eviction, using {:?}", partial_backup_uploaded);
|
||||
|
||||
EVICTION_EVENTS_STARTED
|
||||
.with_label_values(&[EvictionEvent::Evict.into()])
|
||||
.inc();
|
||||
let _guard = scopeguard::guard((), |_| {
|
||||
EVICTION_EVENTS_COMPLETED
|
||||
.with_label_values(&[EvictionEvent::Evict.into()])
|
||||
.inc();
|
||||
});
|
||||
|
||||
if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
|
||||
warn!("failed to evict timeline: {:?}", e);
|
||||
return;
|
||||
@@ -88,6 +98,15 @@ impl Manager {
|
||||
|
||||
info!("starting uneviction, using {:?}", partial_backup_uploaded);
|
||||
|
||||
EVICTION_EVENTS_STARTED
|
||||
.with_label_values(&[EvictionEvent::Restore.into()])
|
||||
.inc();
|
||||
let _guard = scopeguard::guard((), |_| {
|
||||
EVICTION_EVENTS_COMPLETED
|
||||
.with_label_values(&[EvictionEvent::Restore.into()])
|
||||
.inc();
|
||||
});
|
||||
|
||||
if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
|
||||
warn!("failed to unevict timeline: {:?}", e);
|
||||
return;
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import pytest
|
||||
@@ -8,6 +7,7 @@ import requests
|
||||
|
||||
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
|
||||
|
||||
# Walreceiver as returned by sk's timeline status endpoint.
|
||||
@@ -31,15 +31,26 @@ class SafekeeperTimelineStatus:
|
||||
walreceivers: List[Walreceiver]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperMetrics:
|
||||
class SafekeeperMetrics(Metrics):
|
||||
# Helpers to get metrics from tests without hardcoding the metric names there.
|
||||
# These are metrics from Prometheus which uses float64 internally.
|
||||
# As a consequence, values may differ from real original int64s.
|
||||
flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
|
||||
commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
|
||||
|
||||
def __init__(self, m: Metrics):
|
||||
self.metrics = m.metrics
|
||||
|
||||
def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
return self.query_one(
|
||||
"safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
|
||||
).value
|
||||
|
||||
def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
return self.query_one(
|
||||
"safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
|
||||
).value
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session):
|
||||
class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
HTTPError = requests.HTTPError
|
||||
|
||||
def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
|
||||
@@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def get_metrics_str(self) -> str:
|
||||
"""You probably want to use get_metrics() instead."""
|
||||
request_result = self.get(f"http://localhost:{self.port}/metrics")
|
||||
request_result.raise_for_status()
|
||||
return request_result.text
|
||||
|
||||
def get_metrics(self) -> SafekeeperMetrics:
|
||||
all_metrics_text = self.get_metrics_str()
|
||||
|
||||
metrics = SafekeeperMetrics()
|
||||
for match in re.finditer(
|
||||
r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
|
||||
all_metrics_text,
|
||||
re.MULTILINE,
|
||||
):
|
||||
metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
|
||||
match.group(3)
|
||||
)
|
||||
for match in re.finditer(
|
||||
r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
|
||||
all_metrics_text,
|
||||
re.MULTILINE,
|
||||
):
|
||||
metrics.commit_lsn_inexact[
|
||||
(TenantId(match.group(1)), TimelineId(match.group(2)))
|
||||
] = int(match.group(3))
|
||||
return metrics
|
||||
res = self.get_metrics_str()
|
||||
return SafekeeperMetrics(parse_metrics(res))
|
||||
|
||||
@@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
|
||||
last_record_lsn=Lsn(timeline_detail["last_record_lsn"]),
|
||||
)
|
||||
for sk_m in sk_metrics:
|
||||
m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)]))
|
||||
m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)]))
|
||||
m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id))))
|
||||
m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id))))
|
||||
|
||||
for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
|
||||
# Invariant. May be < when transaction is in progress.
|
||||
@@ -2274,3 +2274,23 @@ def test_s3_eviction(
|
||||
and sk.log_contains("successfully restored evicted timeline")
|
||||
for sk in env.safekeepers
|
||||
)
|
||||
|
||||
assert any(
|
||||
sk.http_client().get_metric_value(
|
||||
"safekeeper_eviction_events_started_total", {"kind": "evict"}
|
||||
)
|
||||
or 0 > 0
|
||||
and sk.http_client().get_metric_value(
|
||||
"safekeeper_eviction_events_completed_total", {"kind": "evict"}
|
||||
)
|
||||
or 0 > 0
|
||||
and sk.http_client().get_metric_value(
|
||||
"safekeeper_eviction_events_started_total", {"kind": "restore"}
|
||||
)
|
||||
or 0 > 0
|
||||
and sk.http_client().get_metric_value(
|
||||
"safekeeper_eviction_events_completed_total", {"kind": "restore"}
|
||||
)
|
||||
or 0 > 0
|
||||
for sk in env.safekeepers
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user