From 3ec52088dd53a762f86c2469063e2c0724802513 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 9 May 2023 18:16:22 +0200 Subject: [PATCH] eviction_task: tracing::instrument the imitate-access calls (#4180) Currently, if we unexpectedly download from the eviction task, the log lines look like what we have in https://github.com/neondatabase/neon/issues/4154 ``` 2023-05-04T14:42:57.586772Z WARN eviction_task{tenant_id=$TENANT timeline_id=$TIMELINE}:eviction_iteration{policy_kind="LayerAccessThreshold"}: unexpectedly on-demand downloading remote layer remote $TIMELINE/000000067F000032AC0000400C00FFFFFFFF-000000067F000032AC000040140000000008__0000000001696070-0000000003DC76E9 for task kind Eviction ``` We know these are caused by the imitate accesses. But we don't know which one (my bet is on update_gc_info). I didn't want to pollute the other tasks' logs with the additional spans, so, using `.instrument()` when we call non-eviction-task code. refs https://github.com/neondatabase/neon/issues/4154 --- .../src/tenant/timeline/eviction_task.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 58321762ea..523a5f8fa7 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -22,7 +22,7 @@ use std::{ use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, instrument, warn}; +use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -276,6 +276,7 @@ impl Timeline { ControlFlow::Continue(()) } + #[instrument(skip_all)] async fn imitate_layer_accesses( &self, p: &EvictionPolicyLayerAccessThreshold, @@ -324,6 +325,7 @@ impl Timeline { } /// Recompute the values which would cause on-demand downloads during restart. 
+ #[instrument(skip_all)] async fn imitate_timeline_cached_layer_accesses( &self, cancel: &CancellationToken, @@ -332,7 +334,10 @@ impl Timeline { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size - let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await; + let size = self + .calculate_logical_size(lsn, cancel.clone(), ctx) + .instrument(info_span!("calculate_logical_size")) + .await; match &size { Ok(_size) => { @@ -347,7 +352,11 @@ impl Timeline { } // imitiate repartiting on first compactation - if let Err(e) = self.collect_keyspace(lsn, ctx).await { + if let Err(e) = self + .collect_keyspace(lsn, ctx) + .instrument(info_span!("collect_keyspace")) + .await + { // if this failed, we probably failed logical size because these use the same keys if size.is_err() { // ignore, see above comment @@ -360,6 +369,7 @@ impl Timeline { } // Imitate the synthetic size calculation done by the consumption_metrics module. + #[instrument(skip_all)] async fn imitate_synthetic_size_calculation_worker( &self, tenant: &Arc, @@ -398,7 +408,8 @@ impl Timeline { let mut throwaway_cache = HashMap::new(); let gather = - crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx); + crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx) + .instrument(info_span!("gather_inputs")); tokio::select! { _ = cancel.cancelled() => {}