From 13feda0669d65cbac4b2103952caba1a9db1342e Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 26 Nov 2024 12:46:58 +0100 Subject: [PATCH] track how much time the flush loop is stalled waiting for uploads (#9885) ## Problem We don't know how much time PS is losing during ingest when waiting for remote storage uploads in the flush frozen layer loop. Also we don't know how many remote storage requests get an permit without waiting (not throttled by remote_storage concurrency_limit). ## Summary of changes - Add a metric that accumulates the time waited per shard/PS - in [remote storage semaphore wait seconds](https://neonprod.grafana.net/d/febd9732-9bcf-4992-a821-49b1f6b02724/remote-storage?orgId=1&var-datasource=HUNg6jvVk&var-instance=pageserver-26.us-east-2.aws.neon.build&var-instance=pageserver-27.us-east-2.aws.neon.build&var-instance=pageserver-28.us-east-2.aws.neon.build&var-instance=pageserver-29.us-east-2.aws.neon.build&var-instance=pageserver-30.us-east-2.aws.neon.build&var-instance=pageserver-31.us-east-2.aws.neon.build&var-instance=pageserver-36.us-east-2.aws.neon.build&var-instance=pageserver-37.us-east-2.aws.neon.build&var-instance=pageserver-38.us-east-2.aws.neon.build&var-instance=pageserver-39.us-east-2.aws.neon.build&var-instance=pageserver-40.us-east-2.aws.neon.build&var-instance=pageserver-41.us-east-2.aws.neon.build&var-request_type=put_object&from=1731961336340&to=1731964762933&viewPanel=3) add a first bucket with 100 microseconds to count requests that do not need to wait on semaphore Update: created a new version that uses a Gauge (one increasing value per PS/shard) instead of histogram as suggested by review --- libs/remote_storage/src/metrics.rs | 4 +++- pageserver/src/metrics.rs | 25 ++++++++++++++++++++++++- pageserver/src/tenant/timeline.rs | 5 ++++- test_runner/fixtures/metrics.py | 1 + 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index f1aa4c433b..48c121fbc8 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -176,7 +176,9 @@ pub(crate) struct BucketMetrics { impl Default for BucketMetrics { fn default() -> Self { - let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]; + // first bucket 100 microseconds to count requests that do not need to wait at all + // and get a permit immediately + let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]; let req_seconds = register_histogram_vec!( "remote_storage_s3_request_seconds", diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3cdc2a761e..5ce3ae6cf7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -457,6 +457,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_flush_wait_upload_seconds", + "Time spent waiting for preceding uploads during layer flush", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2336,6 +2345,7 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, + pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2379,6 +2389,9 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2516,6 +2529,7 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, + flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2563,6 +2577,14 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } + pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { + self.flush_wait_upload_time_gauge.add(duration); + crate::metrics::FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) + .unwrap() + .add(duration); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2579,6 +2601,7 @@ impl TimelineMetrics { let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f6a06e73a7..c1ff0f426d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3830,7 +3830,8 @@ impl Timeline { }; // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote. + // This makes us refuse ingest until the new layers have been persisted to the remote + let start = Instant::now(); self.remote_client .wait_completion() .await @@ -3843,6 +3844,8 @@ impl Timeline { FlushLayerError::Other(anyhow!(e).into()) } })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 330f007a77..3f90c233a6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -168,6 +168,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"),