From 8a53d576e685b700c498d50588b4c0224711bed2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 13 Mar 2024 17:10:20 +0200 Subject: [PATCH] fix(metrics): time individual layer flush operations (#7109) Currently, the flushing operation could flush multiple frozen layers to the disk and store the aggregate time in the histogram. The result is a bimodal distribution with short and over 1000-second flushes. Change it so that we record how long one layer flush takes. --- pageserver/src/tenant/timeline.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a733a3b1a7..f10df19b7b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2967,7 +2967,6 @@ impl Timeline { } trace!("waking up"); - let timer = self.metrics.flush_time_histo.start_timer(); let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { if self.cancel.is_cancelled() { @@ -2978,6 +2977,8 @@ impl Timeline { return; } + let timer = self.metrics.flush_time_histo.start_timer(); + let layer_to_flush = { let guard = self.layers.read().await; guard.layer_map().frozen_layers.front().cloned() @@ -2999,13 +3000,12 @@ impl Timeline { break err; } } + timer.stop_and_record(); }; // Notify any listeners that we're done let _ = self .layer_flush_done_tx .send_replace((flush_counter, result)); - - timer.stop_and_record(); } } @@ -3073,6 +3073,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result<(), FlushLayerError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the