From 69338e53e3628cd0133b27d1c079f9deeaaea725 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 13 Mar 2024 18:49:17 +0100 Subject: [PATCH] throttling: fixup interactions with Timeline::get_vectored (#7089) ## Problem Before this PR, `Timeline::get_vectored` would be throttled twice if the sequential option was enabled or if validation was enabled. Also, `pageserver_get_vectored_seconds` included the time spent in the throttle, which turns out to be undesirable for what we use that metric for. ## Summary of changes Double-throttle: * Add `Timeline::get0` method which is unthrottled. * Use that method from within the `Timeline::get_vectored` code path. Metric: * return throttled time from `throttle()` method * deduct the value from the observed time * globally rate-limited logging of duration subtraction errors, like in all other places that do the throttled-time deduction from observations --- pageserver/src/bin/pageserver.rs | 2 ++ pageserver/src/metrics.rs | 2 +- pageserver/src/tenant/throttle.rs | 7 ++-- pageserver/src/tenant/timeline.rs | 59 +++++++++++++++++++++++++------ 4 files changed, 57 insertions(+), 13 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2f172bd384..59750897ff 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,3 +1,5 @@ +#![recursion_limit = "300"] + //! Main entry point for the Page Server executable. use std::env::{var, VarError}; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 814b3e1f96..03537ddb05 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -167,7 +167,7 @@ impl GetVectoredLatency { pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored", + "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 280773e9c3..f3f3d5e3ae 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -130,10 +130,10 @@ where self.inner.load().config.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) { + pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { let inner = self.inner.load_full(); // clones the `Inner` Arc if !inner.task_kinds.contains(ctx.task_kind()) { - return; + return None; }; let start = std::time::Instant::now(); let mut did_throttle = false; @@ -170,6 +170,9 @@ where }); } } + Some(wait_time) + } else { + None } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f10df19b7b..d507a19de9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -634,6 +634,8 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// + /// This method enforces [`Self::timeline_get_throttle`] internally. + /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what /// data exists with what keys, in separate metadata entries. If a @@ -644,18 +646,27 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. + #[inline(always)] pub(crate) async fn get( &self, key: Key, lsn: Lsn, ctx: &RequestContext, + ) -> Result { + self.timeline_get_throttle.throttle(ctx, 1).await; + self.get_impl(key, lsn, ctx).await + } + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } - self.timeline_get_throttle.throttle(ctx, 1).await; - // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. @@ -752,10 +763,6 @@ impl Timeline { return Err(GetVectoredError::Oversized(key_count)); } - self.timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - for range in &keyspace.ranges { let mut key = range.start; while key != range.end { @@ -772,11 +779,18 @@ impl Timeline { self.conf.get_vectored_impl ); - let _timer = crate::metrics::GET_VECTORED_LATENCY + let start = crate::metrics::GET_VECTORED_LATENCY .for_task_kind(ctx.task_kind()) - .map(|t| t.start_timer()); + .map(|metric| (metric, Instant::now())); - match self.conf.get_vectored_impl { + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + .throttle(ctx, key_count as usize) + .await; + + let res = match self.conf.get_vectored_impl { GetVectoredImpl::Sequential => { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } @@ -790,9 +804,33 @@ impl Timeline { vectored_res } + }; + + if let Some((metric, start)) = start { + let elapsed = start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + + if let Some(ex_throttled) = ex_throttled { + metric.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } } + + res } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn get_vectored_sequential_impl( &self, keyspace: KeySpace, @@ -803,7 +841,7 @@ impl Timeline { for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get(key, lsn, ctx).await; + let block = self.get_impl(key, lsn, ctx).await; use PageReconstructError::*; match block { @@ -853,6 +891,7 @@ impl Timeline { Ok(results) } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn validate_get_vectored_impl( &self, vectored_res: &Result>, GetVectoredError>,