always remove RemoteTimelineClient's metrics when dropping it

This commit is contained in:
Christian Schwarz
2022-12-13 12:23:47 -05:00
committed by Christian Schwarz
parent 8fcba150db
commit 4132ae9dfe
5 changed files with 144 additions and 30 deletions

View File

@@ -201,7 +201,7 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
// remote storage metrics
pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_remote_upload_queue_unfinished_tasks",
"Number of tasks in the upload queue that are not finished yet.",
@@ -210,14 +210,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
Download,
Delete,
}
impl RemoteOpKind {
pub fn as_str(&self) -> &str {
pub fn as_str(&self) -> &'static str {
match self {
Self::Upload => "upload",
Self::Download => "download",
@@ -226,13 +226,13 @@ impl RemoteOpKind {
}
}
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RemoteOpFileKind {
Layer,
Index,
}
impl RemoteOpFileKind {
pub fn as_str(&self) -> &str {
pub fn as_str(&self) -> &'static str {
match self {
Self::Layer => "layer",
Self::Index => "index",
@@ -491,10 +491,94 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
use futures::Future;
use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::Instant;
/// Per-timeline holder of remote-storage metric handles.
///
/// Caches the label-resolved metric children so the global metric vecs are
/// consulted at most once per key, and remembers which label sets were
/// created so they can be removed again when the client is dropped
/// (see the `Drop` impl, which drains these maps and calls
/// `remove_label_values` on the global vecs).
pub struct RemoteTimelineClientMetrics {
    // Stored as `String` because they are used directly as Prometheus
    // label values.
    tenant_id: String,
    timeline_id: String,
    // key: (file_kind, op_kind, status) -> cached histogram child.
    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    // key: (file_kind, op_kind) -> cached gauge child.
    unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
}
impl RemoteTimelineClientMetrics {
    /// Create an empty metrics holder for the given timeline.
    /// Metric children are created lazily, on first use of each label set.
    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
            remote_operation_time: Mutex::new(HashMap::default()),
            unfinished_tasks: Mutex::new(HashMap::default()),
        }
    }

    /// Return the operation-time histogram for `(file_kind, op_kind, status)`,
    /// creating and caching the labeled child on first use.
    pub fn remote_operation_time(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_OPERATION_TIME
                .get_metric_with_label_values(&[
                    // tenant_id/timeline_id are already Strings; borrowing
                    // with as_str() avoids a pointless clone per new key.
                    self.tenant_id.as_str(),
                    self.timeline_id.as_str(),
                    key.0,
                    key.1,
                    key.2,
                ])
                .unwrap()
        });
        metric.clone()
    }

    /// Return the unfinished-tasks gauge for `(file_kind, op_kind)`,
    /// creating and caching the labeled child on first use.
    pub fn unfinished_tasks(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntGauge {
        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.unfinished_tasks.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
                .get_metric_with_label_values(&[
                    // Same as above: no need for to_string() on a String.
                    self.tenant_id.as_str(),
                    self.timeline_id.as_str(),
                    key.0,
                    key.1,
                ])
                .unwrap()
        });
        metric.clone()
    }
}
impl Drop for RemoteTimelineClientMetrics {
    /// Deregister every metric child this holder created, so a dropped
    /// client (e.g. after timeline deletion/detach) leaves no stale label
    /// sets behind in the global metric vecs. Removal failures are ignored:
    /// the series may already be gone.
    fn drop(&mut self) {
        // get_mut(): we have exclusive access in drop, no locking needed.
        let timings = self.remote_operation_time.get_mut().unwrap();
        for ((file_kind, op_kind, status), _histogram) in timings.drain() {
            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
                self.tenant_id.as_str(),
                self.timeline_id.as_str(),
                file_kind,
                op_kind,
                status,
            ]);
        }
        let gauges = self.unfinished_tasks.get_mut().unwrap();
        for ((file_kind, op_kind), _gauge) in gauges.drain() {
            let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
                self.tenant_id.as_str(),
                self.timeline_id.as_str(),
                file_kind,
                op_kind,
            ]);
        }
    }
}
/// Wrapper future that measures the time spent by a remote storage operation,
/// and records the time and success/failure as a prometheus metric.
pub trait MeasureRemoteOp: Sized {
@@ -504,6 +588,7 @@ pub trait MeasureRemoteOp: Sized {
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
metrics: Arc<RemoteTimelineClientMetrics>,
) -> MeasuredRemoteOp<Self> {
let start = Instant::now();
MeasuredRemoteOp {
@@ -513,6 +598,7 @@ pub trait MeasureRemoteOp: Sized {
file_kind,
op,
start,
metrics,
}
}
}
@@ -529,6 +615,7 @@ pin_project! {
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
start: Instant,
metrics: Arc<RemoteTimelineClientMetrics>,
}
}
@@ -541,15 +628,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
if let Poll::Ready(ref res) = poll_result {
let duration = this.start.elapsed();
let status = if res.is_ok() { &"success" } else { &"failure" };
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[
&this.tenant_id.to_string(),
&this.timeline_id.to_string(),
this.file_kind.as_str(),
this.op.as_str(),
status,
])
.unwrap()
this.metrics
.remote_operation_time(this.file_kind, this.op, status)
.observe(duration.as_secs_f64());
}
poll_result

View File

@@ -210,10 +210,9 @@ use utils::lsn::Lsn;
use self::index::IndexPart;
use crate::metrics::MeasureRemoteOp;
use crate::metrics::RemoteOpFileKind;
use crate::metrics::RemoteOpKind;
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
use crate::tenant::filename::LayerFileName;
use crate::{
config::PageServerConf,
@@ -256,6 +255,8 @@ pub struct RemoteTimelineClient {
upload_queue: Mutex<UploadQueue>,
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
}
@@ -501,6 +502,7 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await
}
@@ -528,6 +530,7 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await?;
@@ -847,6 +850,7 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await
}
@@ -863,6 +867,7 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await
}
@@ -877,6 +882,7 @@ impl RemoteTimelineClient {
self.timeline_id,
*metric_file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
@@ -977,14 +983,8 @@ impl RemoteTimelineClient {
return;
}
};
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
file_kind.as_str(),
op_kind.as_str(),
])
.unwrap()
self.metrics
.unfinished_tasks(&file_kind, &op_kind)
.add(delta)
}
@@ -1068,6 +1068,7 @@ pub fn create_remote_timeline_client(
timeline_id,
storage_impl: remote_storage,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
})
}
@@ -1180,6 +1181,10 @@ mod tests {
timeline_id: TIMELINE_ID,
storage_impl,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&harness.tenant_id,
&TIMELINE_ID,
)),
});
let remote_timeline_dir =

View File

@@ -39,6 +39,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
return metrics
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_upload_queue_unfinished_tasks",
"pageserver_remote_operation_seconds_bucket",
"pageserver_remote_operation_seconds_count",
"pageserver_remote_operation_seconds_sum",
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_current_logical_size",
"pageserver_current_physical_size",
@@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_wait_lsn_seconds_sum",
"pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
)

View File

@@ -384,7 +384,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
metrics,
re.MULTILINE,
)
assert matches
if matches is None:
return None
return int(matches[1])
pg = env.postgres.create_start("main", tenant_id=tenant_id)
@@ -436,8 +437,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
assert not timeline_path.exists()
# timeline deletion should kill ongoing uploads
assert get_queued_count(file_kind="index", op_kind="upload") == 0
# timeline deletion should kill ongoing uploads, so, the metric will be gone
assert get_queued_count(file_kind="index", op_kind="upload") is None
# timeline deletion should be unblocking checkpoint ops
checkpoint_thread.join(2.0)

View File

@@ -7,7 +7,11 @@ from typing import List
import pytest
from fixtures.log_helper import log
from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics
from fixtures.metrics import (
PAGESERVER_PER_TENANT_METRICS,
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
parse_metrics,
)
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
@@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
)
def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"remote_storage_kind",
# exercise both the code paths where remote_storage=None and remote_storage=Some(...)
[RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
)
def test_pageserver_metrics_removed_after_detach(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_pageserver_metrics_removed_after_detach",
)
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
@@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
for tenant in [tenant_1, tenant_2]:
pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS)
expected = set(PAGESERVER_PER_TENANT_METRICS)
if remote_storage_kind == RemoteStorageKind.NOOP:
# if there's no remote storage configured, we don't expose the remote timeline client metrics
expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
assert pre_detach_samples == expected
env.pageserver.http_client().tenant_detach(tenant)