From bf3ac2be2d5cdff317ddd105dd68d13523382b19 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Dec 2022 08:53:54 -0500 Subject: [PATCH] add remote_physical_size metric We do the accounting exclusively after updating remote IndexPart successfully. This is cleaner & more robust than doing it upon completion of individual layer file uploads / deletions since we can uset .set() insteaf of add()/sub(). NB: Originally, this work was intended to be part of #3013 but it turns out that it's completely orthogonal. So, spin it out into this PR for easier review. Since this change is additive, it won't break anything. --- pageserver/src/metrics.rs | 30 +++++++++++++++++++++++++ pageserver/src/storage_sync2.rs | 32 +++++++++++++++++++++++++-- pageserver/src/storage_sync2/index.rs | 2 +- test_runner/fixtures/metrics.py | 1 + 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2f1a98e4c5..308f9cd4eb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -96,6 +96,16 @@ static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_remote_physical_size", + "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.", + // Corollary: If any files are missing from the index part, they won't be included here. + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_current_logical_size", @@ -500,6 +510,7 @@ use std::time::Instant; pub struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, + remote_physical_size_gauge: Mutex>, remote_operation_time: Mutex>, unfinished_tasks: Mutex>, } @@ -511,8 +522,22 @@ impl RemoteTimelineClientMetrics { timeline_id: timeline_id.to_string(), remote_operation_time: Mutex::new(HashMap::default()), unfinished_tasks: Mutex::new(HashMap::default()), + remote_physical_size_gauge: Mutex::new(None), } } + pub fn remote_physical_size_gauge(&self) -> UIntGauge { + let mut guard = self.remote_physical_size_gauge.lock().unwrap(); + guard + .get_or_insert_with(|| { + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + ]) + .unwrap() + }) + .clone() + } pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, @@ -562,6 +587,7 @@ impl Drop for RemoteTimelineClientMetrics { let RemoteTimelineClientMetrics { tenant_id, timeline_id, + remote_physical_size_gauge, remote_operation_time, unfinished_tasks, } = self; @@ -576,6 +602,10 @@ impl Drop for RemoteTimelineClientMetrics { b, ]); } + { + let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above + let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + } } } diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index cebec4d615..89bbc34227 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -460,6 +460,7 @@ impl RemoteTimelineClient { pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); Ok(()) } @@ -471,6 +472,7 @@ impl RemoteTimelineClient { ) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_empty_remote(local_metadata)?; + self.update_remote_physical_size_gauge(None); Ok(()) } @@ -482,6 +484,20 @@ impl RemoteTimelineClient { } } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { + let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { + current_remote_index_part + .layer_metadata + .iter() + // If we don't have the file size for the layer, don't account for it in the metric. + .map(|(_, ilmd)| ilmd.file_size.unwrap_or(0)) + .sum() + } else { + 0 + }; + self.metrics.remote_physical_size_gauge().set(size); + } + // // Download operations. // @@ -543,6 +559,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { upgraded.merge(&new_metadata); + // If we don't do an index file upload inbetween here and restart, + // the value will go back down after pageserver restart, since we will + // have lost this data point. + // But, we upload index part fairly frequently, and restart pageserver rarely. + // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. + self.metrics + .remote_physical_size_gauge() + .add(downloaded_size); } else { // The file should exist, since we just downloaded it. warn!( @@ -855,7 +879,7 @@ impl RemoteTimelineClient { .await } UploadOp::UploadMetadata(ref index_part, _lsn) => { - upload::upload_index_part( + let res = upload::upload_index_part( self.conf, &self.storage_impl, self.tenant_id, @@ -869,7 +893,11 @@ impl RemoteTimelineClient { RemoteOpKind::Upload, Arc::clone(&self.metrics), ) - .await + .await; + if res.is_ok() { + self.update_remote_physical_size_gauge(Some(index_part)); + } + res } UploadOp::Delete(metric_file_kind, ref layer_file_name) => { let path = &self diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index 82487339ee..ed4ed10189 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -232,7 +232,7 @@ impl IndexPart { /// Serialized form of [`LayerFileMetadata`]. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { - file_size: Option, + pub(super) file_size: Option, } impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 17b2b71df2..5fe6c43528 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -44,6 +44,7 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_operation_seconds_bucket", "pageserver_remote_operation_seconds_count", "pageserver_remote_operation_seconds_sum", + "pageserver_remote_physical_size", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (