mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 06:52:55 +00:00
always remove RemoteTimelineClient's metrics when dropping it
This commit is contained in:
committed by
Christian Schwarz
parent
8fcba150db
commit
4132ae9dfe
@@ -201,7 +201,7 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_remote_upload_queue_unfinished_tasks",
|
||||
"Number of tasks in the upload queue that are not finished yet.",
|
||||
@@ -210,14 +210,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
Download,
|
||||
Delete,
|
||||
}
|
||||
impl RemoteOpKind {
|
||||
pub fn as_str(&self) -> &str {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Upload => "upload",
|
||||
Self::Download => "download",
|
||||
@@ -226,13 +226,13 @@ impl RemoteOpKind {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
pub enum RemoteOpFileKind {
|
||||
Layer,
|
||||
Index,
|
||||
}
|
||||
impl RemoteOpFileKind {
|
||||
pub fn as_str(&self) -> &str {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Layer => "layer",
|
||||
Self::Index => "index",
|
||||
@@ -491,10 +491,94 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
|
||||
use futures::Future;
|
||||
use pin_project_lite::pin_project;
|
||||
use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::Instant;
|
||||
|
||||
pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
unfinished_tasks: Mutex::new(HashMap::default()),
|
||||
}
|
||||
}
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
status: &'static str,
|
||||
) -> Histogram {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.remote_operation_time.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str(), status);
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
key.2,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
pub fn unfinished_tasks(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntGauge {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.unfinished_tasks.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClientMetrics {
|
||||
fn drop(&mut self) {
|
||||
let RemoteTimelineClientMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
remote_operation_time,
|
||||
unfinished_tasks,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
}
|
||||
for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper future that measures the time spent by a remote storage operation,
|
||||
/// and records the time and success/failure as a prometheus metric.
|
||||
pub trait MeasureRemoteOp: Sized {
|
||||
@@ -504,6 +588,7 @@ pub trait MeasureRemoteOp: Sized {
|
||||
timeline_id: TimelineId,
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
) -> MeasuredRemoteOp<Self> {
|
||||
let start = Instant::now();
|
||||
MeasuredRemoteOp {
|
||||
@@ -513,6 +598,7 @@ pub trait MeasureRemoteOp: Sized {
|
||||
file_kind,
|
||||
op,
|
||||
start,
|
||||
metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -529,6 +615,7 @@ pin_project! {
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
start: Instant,
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -541,15 +628,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
if let Poll::Ready(ref res) = poll_result {
|
||||
let duration = this.start.elapsed();
|
||||
let status = if res.is_ok() { &"success" } else { &"failure" };
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&this.tenant_id.to_string(),
|
||||
&this.timeline_id.to_string(),
|
||||
this.file_kind.as_str(),
|
||||
this.op.as_str(),
|
||||
status,
|
||||
])
|
||||
.unwrap()
|
||||
this.metrics
|
||||
.remote_operation_time(this.file_kind, this.op, status)
|
||||
.observe(duration.as_secs_f64());
|
||||
}
|
||||
poll_result
|
||||
|
||||
@@ -210,10 +210,9 @@ use utils::lsn::Lsn;
|
||||
|
||||
use self::index::IndexPart;
|
||||
|
||||
use crate::metrics::MeasureRemoteOp;
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
use crate::metrics::RemoteOpKind;
|
||||
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
|
||||
use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
|
||||
use crate::tenant::filename::LayerFileName;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
@@ -256,6 +255,8 @@ pub struct RemoteTimelineClient {
|
||||
|
||||
upload_queue: Mutex<UploadQueue>,
|
||||
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
|
||||
storage_impl: GenericRemoteStorage,
|
||||
}
|
||||
|
||||
@@ -501,6 +502,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -528,6 +530,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -847,6 +850,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -863,6 +867,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Upload,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -877,6 +882,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
*metric_file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -977,14 +983,8 @@ impl RemoteTimelineClient {
|
||||
return;
|
||||
}
|
||||
};
|
||||
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
file_kind.as_str(),
|
||||
op_kind.as_str(),
|
||||
])
|
||||
.unwrap()
|
||||
self.metrics
|
||||
.unfinished_tasks(&file_kind, &op_kind)
|
||||
.add(delta)
|
||||
}
|
||||
|
||||
@@ -1068,6 +1068,7 @@ pub fn create_remote_timeline_client(
|
||||
timeline_id,
|
||||
storage_impl: remote_storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1180,6 +1181,10 @@ mod tests {
|
||||
timeline_id: TIMELINE_ID,
|
||||
storage_impl,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
});
|
||||
|
||||
let remote_timeline_dir =
|
||||
|
||||
@@ -39,6 +39,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
|
||||
return metrics
|
||||
|
||||
|
||||
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_remote_upload_queue_unfinished_tasks",
|
||||
"pageserver_remote_operation_seconds_bucket",
|
||||
"pageserver_remote_operation_seconds_count",
|
||||
"pageserver_remote_operation_seconds_sum",
|
||||
)
|
||||
|
||||
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_current_logical_size",
|
||||
"pageserver_current_physical_size",
|
||||
@@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_wait_lsn_seconds_sum",
|
||||
"pageserver_created_persistent_files_total",
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
)
|
||||
|
||||
@@ -384,7 +384,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
metrics,
|
||||
re.MULTILINE,
|
||||
)
|
||||
assert matches
|
||||
if matches is None:
|
||||
return None
|
||||
return int(matches[1])
|
||||
|
||||
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
||||
@@ -436,8 +437,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
|
||||
assert not timeline_path.exists()
|
||||
|
||||
# timeline deletion should kill ongoing uploads
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") == 0
|
||||
# timeline deletion should kill ongoing uploads, so, the metric will be gone
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") is None
|
||||
|
||||
# timeline deletion should be unblocking checkpoint ops
|
||||
checkpoint_thread.join(2.0)
|
||||
|
||||
@@ -7,7 +7,11 @@ from typing import List
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics
|
||||
from fixtures.metrics import (
|
||||
PAGESERVER_PER_TENANT_METRICS,
|
||||
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
parse_metrics,
|
||||
)
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
@@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
|
||||
|
||||
def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize(
|
||||
"remote_storage_kind",
|
||||
# exercise both the code paths where remote_storage=None and remote_storage=Some(...)
|
||||
[RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
|
||||
)
|
||||
def test_pageserver_metrics_removed_after_detach(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_pageserver_metrics_removed_after_detach",
|
||||
)
|
||||
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
|
||||
|
||||
for tenant in [tenant_1, tenant_2]:
|
||||
pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
|
||||
assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS)
|
||||
expected = set(PAGESERVER_PER_TENANT_METRICS)
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP:
|
||||
# if there's no remote storage configured, we don't expose the remote timeline client metrics
|
||||
expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
|
||||
assert pre_detach_samples == expected
|
||||
|
||||
env.pageserver.http_client().tenant_detach(tenant)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user