diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 454ff01f0e..2f1a98e4c5 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -201,7 +201,7 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
 
 // remote storage metrics
 
-pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
+static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_remote_upload_queue_unfinished_tasks",
         "Number of tasks in the upload queue that are not finished yet.",
@@ -210,14 +210,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|
     .expect("failed to define a metric")
 });
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
     Download,
     Delete,
 }
 impl RemoteOpKind {
-    pub fn as_str(&self) -> &str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             Self::Upload => "upload",
             Self::Download => "download",
@@ -226,13 +226,13 @@ impl RemoteOpKind {
     }
 }
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
 pub enum RemoteOpFileKind {
     Layer,
     Index,
 }
 impl RemoteOpFileKind {
-    pub fn as_str(&self) -> &str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             Self::Layer => "layer",
             Self::Index => "index",
@@ -491,10 +491,94 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
 
 use futures::Future;
 use pin_project_lite::pin_project;
+use std::collections::HashMap;
 use std::pin::Pin;
+use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::Instant;
 
+pub struct RemoteTimelineClientMetrics {
+    tenant_id: String,
+    timeline_id: String,
+    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
+    unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+}
+
+impl RemoteTimelineClientMetrics {
+    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        RemoteTimelineClientMetrics {
+            tenant_id: tenant_id.to_string(),
+            timeline_id: timeline_id.to_string(),
+            remote_operation_time: Mutex::new(HashMap::default()),
+            unfinished_tasks: Mutex::new(HashMap::default()),
+        }
+    }
+    pub fn remote_operation_time(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+        status: &'static str,
+    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.remote_operation_time.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str(), status);
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_OPERATION_TIME
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                    key.2,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+    pub fn unfinished_tasks(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntGauge {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.unfinished_tasks.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+}
+
+impl Drop for RemoteTimelineClientMetrics {
+    fn drop(&mut self) {
+        let RemoteTimelineClientMetrics {
+            tenant_id,
+            timeline_id,
+            remote_operation_time,
+            unfinished_tasks,
+        } = self;
+        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
+            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
+        }
+        for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
+            let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+    }
+}
+
 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
 pub trait MeasureRemoteOp: Sized {
@@ -504,6 +588,7 @@ pub trait MeasureRemoteOp: Sized {
         timeline_id: TimelineId,
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
+        metrics: Arc<RemoteTimelineClientMetrics>,
    ) -> MeasuredRemoteOp<Self> {
        let start = Instant::now();
        MeasuredRemoteOp {
@@ -513,6 +598,7 @@ pub trait MeasureRemoteOp: Sized {
            file_kind,
            op,
            start,
+            metrics,
        }
    }
 }
@@ -529,6 +615,7 @@ pin_project! {
        file_kind: RemoteOpFileKind,
        op: RemoteOpKind,
        start: Instant,
+        metrics: Arc<RemoteTimelineClientMetrics>,
    }
 }
 
@@ -541,15 +628,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
        if let Poll::Ready(ref res) = poll_result {
            let duration = this.start.elapsed();
            let status = if res.is_ok() { &"success" } else { &"failure" };
-            REMOTE_OPERATION_TIME
-                .get_metric_with_label_values(&[
-                    &this.tenant_id.to_string(),
-                    &this.timeline_id.to_string(),
-                    this.file_kind.as_str(),
-                    this.op.as_str(),
-                    status,
-                ])
-                .unwrap()
+            this.metrics
+                .remote_operation_time(this.file_kind, this.op, status)
                .observe(duration.as_secs_f64());
        }
        poll_result
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index 7cc0eac2bf..cebec4d615 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -210,10 +210,9 @@ use utils::lsn::Lsn;
 
 use self::index::IndexPart;
 
-use crate::metrics::MeasureRemoteOp;
 use crate::metrics::RemoteOpFileKind;
 use crate::metrics::RemoteOpKind;
-use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
+use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
 use crate::tenant::filename::LayerFileName;
 use crate::{
     config::PageServerConf,
@@ -256,6 +255,8 @@ pub struct RemoteTimelineClient {
 
     upload_queue: Mutex<UploadQueue>,
 
+    metrics: Arc<RemoteTimelineClientMetrics>,
+
     storage_impl: GenericRemoteStorage,
 }
 
@@ -501,6 +502,7 @@ impl RemoteTimelineClient {
             self.timeline_id,
             RemoteOpFileKind::Index,
             RemoteOpKind::Download,
+            Arc::clone(&self.metrics),
         )
         .await
     }
@@ -528,6 +530,7 @@ impl RemoteTimelineClient {
             self.timeline_id,
             RemoteOpFileKind::Layer,
             RemoteOpKind::Download,
+            Arc::clone(&self.metrics),
         )
         .await?;
 
@@ -847,6 +850,7 @@ impl RemoteTimelineClient {
                 self.timeline_id,
                 RemoteOpFileKind::Layer,
                 RemoteOpKind::Upload,
+                Arc::clone(&self.metrics),
             )
             .await
     }
@@ -863,6 +867,7 @@ impl RemoteTimelineClient {
                 self.timeline_id,
                 RemoteOpFileKind::Index,
                 RemoteOpKind::Upload,
+                Arc::clone(&self.metrics),
             )
             .await
     }
@@ -877,6 +882,7 @@ impl RemoteTimelineClient {
                 self.timeline_id,
                 *metric_file_kind,
                 RemoteOpKind::Delete,
+                Arc::clone(&self.metrics),
             )
             .await
     }
@@ -977,14 +983,8 @@ impl RemoteTimelineClient {
                 return;
             }
         };
-        REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
-            .get_metric_with_label_values(&[
-                &self.tenant_id.to_string(),
-                &self.timeline_id.to_string(),
-                file_kind.as_str(),
-                op_kind.as_str(),
-            ])
-            .unwrap()
+        self.metrics
+            .unfinished_tasks(&file_kind, &op_kind)
             .add(delta)
     }
 
@@ -1068,6 +1068,7 @@ pub fn create_remote_timeline_client(
         timeline_id,
         storage_impl: remote_storage,
         upload_queue: Mutex::new(UploadQueue::Uninitialized),
+        metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
     })
 }
 
@@ -1180,6 +1181,10 @@ mod tests {
             timeline_id: TIMELINE_ID,
             storage_impl,
             upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                &harness.tenant_id,
+                &TIMELINE_ID,
+            )),
         });
 
         let remote_timeline_dir =
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 86ab4425ed..17b2b71df2 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -39,6 +39,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
     return metrics
 
 
+PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
+    "pageserver_remote_upload_queue_unfinished_tasks",
+    "pageserver_remote_operation_seconds_bucket",
+    "pageserver_remote_operation_seconds_count",
+    "pageserver_remote_operation_seconds_sum",
+)
+
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_current_logical_size",
     "pageserver_current_physical_size",
@@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_wait_lsn_seconds_sum",
     "pageserver_created_persistent_files_total",
     "pageserver_written_persistent_bytes_total",
+    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
 )
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 7152bc8b6a..d8f8298fa6 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -384,7 +384,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
             metrics,
             re.MULTILINE,
         )
-        assert matches
+        if matches is None:
+            return None
         return int(matches[1])
 
     pg = env.postgres.create_start("main", tenant_id=tenant_id)
@@ -436,8 +437,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
 
     assert not timeline_path.exists()
 
-    # timeline deletion should kill ongoing uploads
-    assert get_queued_count(file_kind="index", op_kind="upload") == 0
+    # timeline deletion should kill ongoing uploads, so the metric will be gone
+    assert get_queued_count(file_kind="index", op_kind="upload") is None
 
     # timeline deletion should be unblocking checkpoint ops
     checkpoint_thread.join(2.0)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 0b20afefc3..9477ae3c25 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -7,7 +7,11 @@ from typing import List
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics
+from fixtures.metrics import (
+    PAGESERVER_PER_TENANT_METRICS,
+    PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
+    parse_metrics,
+)
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
@@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
     )
 
 
-def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "remote_storage_kind",
+    # exercise both the code paths where remote_storage=None and remote_storage=Some(...)
+ [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3], +) +def test_pageserver_metrics_removed_after_detach( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_pageserver_metrics_removed_after_detach", + ) + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde for tenant in [tenant_1, tenant_2]: pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) - assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS) + expected = set(PAGESERVER_PER_TENANT_METRICS) + if remote_storage_kind == RemoteStorageKind.NOOP: + # if there's no remote storage configured, we don't expose the remote timeline client metrics + expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS) + assert pre_detach_samples == expected env.pageserver.http_client().tenant_detach(tenant)
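
The core pattern this patch introduces is a per-timeline metrics object that caches the label-resolved handles it hands out and, on drop, removes exactly those label sets from the process-wide metric families, so detaching a tenant no longer leaves stale per-timeline series in the exporter output. Below is a minimal, self-contained sketch of that pattern, assuming the prometheus and once_cell crates; the metric name, labels, and the PerTimelineMetrics type are illustrative placeholders, not the pageserver's actual identifiers.

use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec};
use std::collections::HashMap;
use std::sync::Mutex;

// Process-wide metric family, labeled per tenant/timeline (placeholder name).
static EXAMPLE_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "example_per_timeline_gauge",
        "Illustrative per-timeline gauge.",
        &["tenant_id", "timeline_id", "op_kind"]
    )
    .expect("failed to define a metric")
});

/// Per-timeline wrapper: caches the label-resolved handles it hands out and
/// removes exactly those label sets from the family when dropped.
pub struct PerTimelineMetrics {
    tenant_id: String,
    timeline_id: String,
    gauges: Mutex<HashMap<&'static str, IntGauge>>,
}

impl PerTimelineMetrics {
    pub fn new(tenant_id: &str, timeline_id: &str) -> Self {
        Self {
            tenant_id: tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
            gauges: Mutex::new(HashMap::new()),
        }
    }

    /// Resolve (and cache) the handle for one op_kind label value.
    pub fn gauge(&self, op_kind: &'static str) -> IntGauge {
        let mut guard = self.gauges.lock().unwrap();
        guard
            .entry(op_kind)
            .or_insert_with(|| {
                EXAMPLE_GAUGE
                    .get_metric_with_label_values(&[&self.tenant_id, &self.timeline_id, op_kind])
                    .unwrap()
            })
            .clone()
    }
}

impl Drop for PerTimelineMetrics {
    fn drop(&mut self) {
        // Remove only the label sets this instance created, leaving other
        // tenants' and timelines' series untouched.
        let PerTimelineMetrics { tenant_id, timeline_id, gauges } = self;
        for (op_kind, _) in gauges.get_mut().unwrap().drain() {
            let _ = EXAMPLE_GAUGE.remove_label_values(&[tenant_id, timeline_id, op_kind]);
        }
    }
}

An owner such as RemoteTimelineClient would hold this wrapper in an Arc and clone handles from it on the hot path; when the last Arc is dropped at detach time, the Drop impl runs and the per-timeline series disappear from the /metrics output, which is what the updated tests assert.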