track how much time the flush loop is stalled waiting for uploads (#9885)

## Problem We don't know how much time PS is losing during ingest when waiting for remote storage uploads in the flush frozen layer loop. Also, we don't know how many remote storage requests get a permit without waiting (not throttled by remote_storage concurrency_limit). ## Summary of changes - Add a metric that accumulates the time waited per shard/PS - In [remote storage semaphore wait seconds](https://neonprod.grafana.net/d/febd9732-9bcf-4992-a821-49b1f6b02724/remote-storage?orgId=1&var-datasource=HUNg6jvVk&var-instance=pageserver-26.us-east-2.aws.neon.build&var-instance=pageserver-27.us-east-2.aws.neon.build&var-instance=pageserver-28.us-east-2.aws.neon.build&var-instance=pageserver-29.us-east-2.aws.neon.build&var-instance=pageserver-30.us-east-2.aws.neon.build&var-instance=pageserver-31.us-east-2.aws.neon.build&var-instance=pageserver-36.us-east-2.aws.neon.build&var-instance=pageserver-37.us-east-2.aws.neon.build&var-instance=pageserver-38.us-east-2.aws.neon.build&var-instance=pageserver-39.us-east-2.aws.neon.build&var-instance=pageserver-40.us-east-2.aws.neon.build&var-instance=pageserver-41.us-east-2.aws.neon.build&var-request_type=put_object&from=1731961336340&to=1731964762933&viewPanel=3), add a first bucket of 100 microseconds to count requests that do not need to wait on the semaphore Update: created a new version that uses a Gauge (one increasing value per PS/shard) instead of a histogram, as suggested in review
2026-01-03 19:42:55 +00:00 · 2024-11-26 12:46:58 +01:00
parent 96a1b71c84
commit 13feda0669
4 changed files with 32 additions and 3 deletions
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -176,7 +176,9 @@ pub(crate) struct BucketMetrics {

 impl Default for BucketMetrics {
    fn default() -> Self {
-        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
+        // first bucket 100 microseconds to count requests that do not need to wait at all
+        // and get a permit immediately
+        let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];

        let req_seconds = register_histogram_vec!(
            "remote_storage_s3_request_seconds",
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3,7 +3,7 @@ use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
-    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
+    Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
@@ -457,6 +457,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| { // registered lazily, on first access
+    register_gauge_vec!(
+        "pageserver_flush_wait_upload_seconds", // metric name; callers add elapsed f64 seconds
+        "Time spent waiting for preceding uploads during layer flush", // help text
+        &["tenant_id", "shard_id", "timeline_id"] // one series per (tenant, shard, timeline)
+    )
+    .expect("failed to define a metric") // a registration error panics here
+});
+
 static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_last_record_lsn",
@@ -2336,6 +2345,7 @@ pub(crate) struct TimelineMetrics {
    shard_id: String,
    timeline_id: String,
    pub flush_time_histo: StorageTimeMetrics,
+    pub flush_wait_upload_time_gauge: Gauge,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
    pub logical_size_histo: StorageTimeMetrics,
@@ -2379,6 +2389,9 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
+        let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
        let compact_time_histo = StorageTimeMetrics::new(
            StorageTimeOperation::Compact,
            &tenant_id,
@@ -2516,6 +2529,7 @@ impl TimelineMetrics {
            shard_id,
            timeline_id,
            flush_time_histo,
+            flush_wait_upload_time_gauge,
            compact_time_histo,
            create_images_time_histo,
            logical_size_histo,
@@ -2563,6 +2577,14 @@ impl TimelineMetrics {
        self.resident_physical_size_gauge.get()
    }

+    /// Adds `duration` (seconds) to the accumulated flush-wait-for-upload time of this timeline.
+    ///
+    /// `flush_wait_upload_time_gauge` was obtained from `FLUSH_WAIT_UPLOAD_TIME` with this
+    /// timeline's labels, so adding through a second labelled lookup would count the wait twice.
+    pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
+        self.flush_wait_upload_time_gauge.add(duration);
+    }
+
    pub(crate) fn shutdown(&self) {
        let was_shutdown = self
            .shutdown
@@ -2579,6 +2601,7 @@ impl TimelineMetrics {
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3830,7 +3830,8 @@ impl Timeline {
        };

        // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
-        // This makes us refuse ingest until the new layers have been persisted to the remote.
+        // This makes us refuse ingest until the new layers have been persisted to the remote
+        let start = Instant::now();
        self.remote_client
            .wait_completion()
            .await
@@ -3843,6 +3844,8 @@ impl Timeline {
                    FlushLayerError::Other(anyhow!(e).into())
                }
            })?;
+        let duration = start.elapsed().as_secs_f64();
+        self.metrics.flush_wait_upload_time_gauge_add(duration);

        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
        // a compaction can delete the file and then it won't be available for uploads any more.
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -168,6 +168,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    "pageserver_evictions_with_low_residence_duration_total",
    "pageserver_aux_file_estimated_size",
    "pageserver_valid_lsn_lease_count",
+    "pageserver_flush_wait_upload_seconds",
    counter("pageserver_tenant_throttling_count_accounted_start"),
    counter("pageserver_tenant_throttling_count_accounted_finish"),
    counter("pageserver_tenant_throttling_wait_usecs_sum"),