pageserver: Reduce tracing overhead in timeline::get (#6115)

## Problem

Compaction process (specifically the image layer reconstructions part)
is lagging behind wal ingest (at speed ~10-15MB/s) for medium-sized
tenants (30-50GB). CPU profile shows that significant amount of time
(see flamegraph) is being spent in `tracing::span::Span::new`.

mainline (commit: 0ba4cae491):

![reconstruct-mainline-0ba4cae491c2](https://github.com/neondatabase/neon/assets/289788/ebfd262e-5c97-4858-80c7-664a1dbcc59d)

## Summary of changes

By lowering the tracing level in get_value_reconstruct_data and
get_or_maybe_download from info to debug, we can reduce the overhead of
span creation in prod environments. On my system, this sped up the image
reconstruction process by 60% (from 14500 to 23160 page reconstruction
per sec)

pr:

![reconstruct-opt-2](https://github.com/neondatabase/neon/assets/289788/563a159b-8f2f-4300-b0a1-6cd66e7df769)


`create_image_layers()` (it's 1 CPU bound here) mainline vs pr:

![image](https://github.com/neondatabase/neon/assets/289788/a981e3cb-6df9-4882-8a94-95e99c35aa83)
This commit is contained in:
John Khvatov
2023-12-18 16:33:23 +03:00
committed by GitHub
parent 17bde7eda5
commit 33cb9a68f7
2 changed files with 85 additions and 79 deletions

View File

@@ -259,8 +259,9 @@ impl Layer {
layer
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
.await
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
/// Download the layer if evicted.
@@ -654,7 +655,6 @@ impl LayerInner {
}
/// Cancellation safe.
#[tracing::instrument(skip_all, fields(layer=%self))]
async fn get_or_maybe_download(
self: &Arc<Self>,
allow_download: bool,
@@ -663,95 +663,101 @@ impl LayerInner {
let mut init_permit = None;
loop {
let download = move |permit| async move {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
let download = move |permit| {
async move {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
// count cancellations, which currently remain largely unexpected
let init_cancelled =
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
// count cancellations, which currently remain largely unexpected
let init_cancelled =
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
let timeline = self
.timeline
.upgrade()
.ok_or_else(|| DownloadError::TimelineShutdown)?;
let timeline = self
.timeline
.upgrade()
.ok_or_else(|| DownloadError::TimelineShutdown)?;
// FIXME: grab a gate
// FIXME: grab a gate
let can_ever_evict = timeline.remote_client.as_ref().is_some();
let can_ever_evict = timeline.remote_client.as_ref().is_some();
// check if we really need to be downloaded; could have been already downloaded by a
// cancelled previous attempt.
let needs_download = self
.needs_download()
.await
.map_err(DownloadError::PreStatFailed)?;
// check if we really need to be downloaded; could have been already downloaded by a
// cancelled previous attempt.
let needs_download = self
.needs_download()
.await
.map_err(DownloadError::PreStatFailed)?;
let permit = if let Some(reason) = needs_download {
if let NeedsDownload::NotFile(ft) = reason {
return Err(DownloadError::NotFile(ft));
let permit = if let Some(reason) = needs_download {
if let NeedsDownload::NotFile(ft) = reason {
return Err(DownloadError::NotFile(ft));
}
// only reset this after we've decided we really need to download. otherwise it'd
// be impossible to mark cancelled downloads for eviction, like one could imagine
// we would like to do for prefetching which was not needed.
self.wanted_evicted.store(false, Ordering::Release);
if !can_ever_evict {
return Err(DownloadError::NoRemoteStorage);
}
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
if !allow_download {
// this does look weird, but for LayerInner the "downloading" means also changing
// internal once related state ...
return Err(DownloadError::DownloadRequired);
}
tracing::info!(%reason, "downloading on-demand");
self.spawn_download_and_wait(timeline, permit).await?
} else {
// the file is present locally, probably by a previous but cancelled call to
// get_or_maybe_download. alternatively we might be running without remote storage.
LAYER_IMPL_METRICS.inc_init_needed_no_download();
permit
};
let since_last_eviction =
self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
if let Some(since_last_eviction) = since_last_eviction {
// FIXME: this will not always be recorded correctly until #6028 (the no
// download needed branch above)
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
}
// only reset this after we've decided we really need to download. otherwise it'd
// be impossible to mark cancelled downloads for eviction, like one could imagine
// we would like to do for prefetching which was not needed.
self.wanted_evicted.store(false, Ordering::Release);
let res = Arc::new(DownloadedLayer {
owner: Arc::downgrade(self),
kind: tokio::sync::OnceCell::default(),
version: next_version,
});
if !can_ever_evict {
return Err(DownloadError::NoRemoteStorage);
self.access_stats.record_residence_event(
LayerResidenceStatus::Resident,
LayerResidenceEventReason::ResidenceChange,
);
let waiters = self.inner.initializer_count();
if waiters > 0 {
tracing::info!(
waiters,
"completing the on-demand download for other tasks"
);
}
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
scopeguard::ScopeGuard::into_inner(init_cancelled);
if !allow_download {
// this does look weird, but for LayerInner the "downloading" means also changing
// internal once related state ...
return Err(DownloadError::DownloadRequired);
}
tracing::info!(%reason, "downloading on-demand");
self.spawn_download_and_wait(timeline, permit).await?
} else {
// the file is present locally, probably by a previous but cancelled call to
// get_or_maybe_download. alternatively we might be running without remote storage.
LAYER_IMPL_METRICS.inc_init_needed_no_download();
permit
};
let since_last_eviction =
self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
if let Some(since_last_eviction) = since_last_eviction {
// FIXME: this will not always be recorded correctly until #6028 (the no
// download needed branch above)
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
Ok((ResidentOrWantedEvicted::Resident(res), permit))
}
let res = Arc::new(DownloadedLayer {
owner: Arc::downgrade(self),
kind: tokio::sync::OnceCell::default(),
version: next_version,
});
self.access_stats.record_residence_event(
LayerResidenceStatus::Resident,
LayerResidenceEventReason::ResidenceChange,
);
let waiters = self.inner.initializer_count();
if waiters > 0 {
tracing::info!(waiters, "completing the on-demand download for other tasks");
}
scopeguard::ScopeGuard::into_inner(init_cancelled);
Ok((ResidentOrWantedEvicted::Resident(res), permit))
.instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
};
if let Some(init_permit) = init_permit.take() {

View File

@@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
env.pageserver.allowed_errors.extend(
[
".*layer loading failed:.*",
".*get_value_reconstruct_data for layer .*",
".*could not find data for key.*",
".*is not active. Current state: Broken.*",
".*will not become active. Current state: Broken.*",
@@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
# (We don't check layer file contents on startup, when loading the timeline)
#
# This will change when we implement checksums for layers
with pytest.raises(Exception, match="layer loading failed:") as err:
with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err:
pg2.start()
log.info(
f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"