From 608afc3055fe1acb32caba2f7b94e01db30f12f2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 7 May 2025 17:21:17 +0800
Subject: [PATCH] fix(scrubber): log download error (#11833)

## Problem

We use `head_object` to determine whether an object exists. However, an
error from `head_object` is not always caused by a missing object.

## Summary of changes

Log the error so that we get a better idea of what is going on with the
scrubber errors in prod.
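For context, the `checks.rs` change boils down to the sketch below.
`remote_client`, `path`, and `layer` stand in for the scrubber's local
variables, and `tracing::warn!` is for illustration only; the real code
formats a `msg` string and routes it through its own warning/error
accounting:

```rust
// Bind the head_object error instead of discarding it: a HEAD failure is
// not proof that the object is missing, so the report should say what
// actually went wrong.
let response = remote_client
    .head_object(&path, &CancellationToken::new())
    .await;

if let Err(e) = response {
    // `e` may be a genuine "not found", but it could also be a timeout or
    // an auth failure; logging it tells us which case we are hitting.
    tracing::warn!("layer {layer} not present in remote storage: {e}");
}
```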
---------

Signed-off-by: Alex Chi Z
---
 storage_scrubber/src/checks.rs                   |  5 +++--
 storage_scrubber/src/pageserver_physical_gc.rs   | 11 +++++++----
 storage_scrubber/src/scan_pageserver_metadata.rs |  5 ++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index f0ba632fd4..b151b612bf 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -165,16 +165,17 @@ pub(crate) async fn branch_cleanup_and_check_errors(
             .head_object(&path, &CancellationToken::new())
             .await;
 
-        if response.is_err() {
+        if let Err(e) = response {
             // Object is not present.
             let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
 
             let msg = format!(
-                "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
+                "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}",
                 layer,
                 metadata.generation.get_suffix(),
                 metadata.shard,
                 is_l0,
+                e,
             );
 
             if is_l0 || ignore_error {
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index f14341c7bc..e1a4095a3c 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -137,11 +137,10 @@ struct TenantRefAccumulator {
 impl TenantRefAccumulator {
     fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) {
         let this_shard_idx = ttid.tenant_shard_id.to_index();
-        (*self
-            .shards_seen
+        self.shards_seen
             .entry(ttid.tenant_shard_id.tenant_id)
-            .or_default())
-            .insert(this_shard_idx);
+            .or_default()
+            .insert(this_shard_idx);
 
         let mut ancestor_refs = Vec::new();
         for (layer_name, layer_metadata) in &index_part.layer_metadata {
@@ -767,10 +766,13 @@ pub async fn pageserver_physical_gc(
                 stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?,
             );
             Ok(try_stream! {
+                let mut cnt = 0;
                 while let Some(ttid_res) = timelines.next().await {
                     let ttid = ttid_res?;
+                    cnt += 1;
                     yield (ttid, tenant_manifest_arc.clone());
                 }
+                tracing::info!(%tenant_shard_id, "Found {} timelines", cnt);
             })
         }
     });
@@ -790,6 +792,7 @@ pub async fn pageserver_physical_gc(
             &accumulator,
             tenant_manifest_arc,
         )
+        .instrument(info_span!("gc_timeline", %ttid))
     });
     let timelines = timelines.try_buffered(CONCURRENCY);
     let mut timelines = std::pin::pin!(timelines);
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index ba75f25984..77c7987aa7 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -153,7 +153,10 @@ pub async fn scan_pageserver_metadata(
     const CONCURRENCY: usize = 32;
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
+    let timelines = tenants.map_ok(|t| {
+        tracing::info!("Found tenant: {}", t);
+        stream_tenant_timelines(&remote_client, &target, t)
+    });
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();