diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a4b102c314..9a8ddc1a6b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -259,8 +259,9 @@ impl Layer { layer .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self)) + .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) .await + .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } /// Download the layer if evicted. @@ -654,7 +655,6 @@ impl LayerInner { } /// Cancellation safe. - #[tracing::instrument(skip_all, fields(layer=%self))] async fn get_or_maybe_download( self: &Arc, allow_download: bool, @@ -663,95 +663,101 @@ impl LayerInner { let mut init_permit = None; loop { - let download = move |permit| async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let download = move |permit| { + async move { + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); + // no need to make the evict_and_wait wait for the actual download to complete + drop(self.status.send(Status::Downloaded)); - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // FIXME: grab a gate - let can_ever_evict = timeline.remote_client.as_ref().is_some(); + let can_ever_evict = timeline.remote_client.as_ref().is_some(); - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed)?; - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); + let permit = if let Some(reason) = needs_download { + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. + self.wanted_evicted.store(false, Ordering::Release); + + if !can_ever_evict { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... + return Err(DownloadError::DownloadRequired); + } + + tracing::info!(%reason, "downloading on-demand"); + + self.spawn_download_and_wait(timeline, permit).await? + } else { + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. + LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + permit + }; + + let since_last_eviction = + self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!( + waiters, + "completing the on-demand download for other tasks" + ); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + scopeguard::ScopeGuard::into_inner(init_cancelled); - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + Ok((ResidentOrWantedEvicted::Resident(res), permit)) } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); - } - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) }; if let Some(init_permit) = init_permit.take() { diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 53eeb8bbe9..4da0ba7b20 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*layer loading failed:.*", + ".*get_value_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="layer loading failed:") as err: + with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"