From bf187aa13f8527ea81de370d14984a6cc5889ecd Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Mar 2024 17:30:13 +0200 Subject: [PATCH] fix(layer): metric miscalculations (#7137) Split off from #7030: - each early exit is counted as canceled init, even though it most likely was just `LayerInner::keep_resident` doing the no-download repair check - `downloaded_after` could had been accounted for multiple times, and also when repairing to match on-disk state Cc: #5331 --- pageserver/src/tenant/storage_layer/layer.rs | 63 ++++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 959065bc4c..0200ff8cf4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -710,10 +710,6 @@ impl LayerInner { // disable any scheduled but not yet running eviction deletions for this let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete drop(self.status.send(Status::Downloaded)); @@ -722,7 +718,9 @@ impl LayerInner { .upgrade() .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let can_ever_evict = timeline.remote_client.as_ref().is_some(); @@ -731,9 +729,17 @@ impl LayerInner { let needs_download = self .needs_download() .await - .map_err(DownloadError::PreStatFailed)?; + .map_err(DownloadError::PreStatFailed); - let permit = if let Some(reason) = needs_download { + let needs_download = match needs_download { + Ok(reason) => reason, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + let (permit, downloaded) = if let Some(reason) = needs_download { if let NeedsDownload::NotFile(ft) = reason { return Err(DownloadError::NotFile(ft)); } @@ -744,36 +750,59 @@ impl LayerInner { self.wanted_evicted.store(false, Ordering::Release); if !can_ever_evict { + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::NoRemoteStorage); } if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; + let res = self.check_expected_download(ctx); + if let Err(e) = res { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } } if !allow_download { // this does look weird, but for LayerInner the "downloading" means also changing // internal once related state ... + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::DownloadRequired); } tracing::info!(%reason, "downloading on-demand"); - self.spawn_download_and_wait(timeline, permit).await? + let permit = self.spawn_download_and_wait(timeline, permit).await; + + let permit = match permit { + Ok(permit) => permit, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + (permit, true) } else { // the file is present locally, probably by a previous but cancelled call to // get_or_maybe_download. alternatively we might be running without remote storage. LAYER_IMPL_METRICS.inc_init_needed_no_download(); - permit + (permit, false) }; - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + scopeguard::ScopeGuard::into_inner(init_cancelled); + + if downloaded { + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } } let res = Arc::new(DownloadedLayer { @@ -795,8 +824,6 @@ impl LayerInner { ); } - scopeguard::ScopeGuard::into_inner(init_cancelled); - Ok((ResidentOrWantedEvicted::Resident(res), permit)) } .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))