From 608afc3055fe1acb32caba2f7b94e01db30f12f2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 7 May 2025 17:21:17 +0800
Subject: [PATCH] fix(scrubber): log download error (#11833)

## Problem

We use `head_object` to determine whether an object exists. However, an
error from `head_object` is not always caused by a missing object.

## Summary of changes

Log the error so that we get a better idea of what is going on with the
scrubber errors in prod.
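For context, the `checks.rs` change boils down to the sketch below.
`remote_client`, `path`, and `layer` stand in for the scrubber's local
variables, and `tracing::warn!` is for illustration only; the real code
formats a `msg` string and routes it through its own warning/error
accounting:

```rust
// Bind the head_object error instead of discarding it: a HEAD failure is
// not proof that the object is missing, so the report should say what
// actually went wrong.
let response = remote_client
    .head_object(&path, &CancellationToken::new())
    .await;

if let Err(e) = response {
    // `e` may be a genuine "not found", but it could also be a timeout or
    // an auth failure; logging it tells us which case we are hitting.
    tracing::warn!("layer {layer} not present in remote storage: {e}");
}
```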
---------

Signed-off-by: Alex Chi Z
---
 storage_scrubber/src/checks.rs                   |  5 +++--
 storage_scrubber/src/pageserver_physical_gc.rs   | 11 +++++++----
 storage_scrubber/src/scan_pageserver_metadata.rs |  5 ++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index f0ba632fd4..b151b612bf 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -165,16 +165,17 @@ pub(crate) async fn branch_cleanup_and_check_errors(
             .head_object(&path, &CancellationToken::new())
             .await;
 
-        if response.is_err() {
+        if let Err(e) = response {
             // Object is not present.
             let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
 
             let msg = format!(
-                "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
+                "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}",
                 layer,
                 metadata.generation.get_suffix(),
                 metadata.shard,
                 is_l0,
+                e,
             );
 
             if is_l0 || ignore_error {
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index f14341c7bc..e1a4095a3c 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -137,11 +137,10 @@ struct TenantRefAccumulator {
 impl TenantRefAccumulator {
     fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) {
         let this_shard_idx = ttid.tenant_shard_id.to_index();
-        (*self
-            .shards_seen
+        self.shards_seen
             .entry(ttid.tenant_shard_id.tenant_id)
-            .or_default())
-            .insert(this_shard_idx);
+            .or_default()
+            .insert(this_shard_idx);
 
         let mut ancestor_refs = Vec::new();
         for (layer_name, layer_metadata) in &index_part.layer_metadata {
@@ -767,10 +766,13 @@ pub async fn pageserver_physical_gc(
                 stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?,
             );
             Ok(try_stream! {
+                let mut cnt = 0;
                 while let Some(ttid_res) = timelines.next().await {
                     let ttid = ttid_res?;
+                    cnt += 1;
                     yield (ttid, tenant_manifest_arc.clone());
                 }
+                tracing::info!(%tenant_shard_id, "Found {} timelines", cnt);
             })
         }
     });
@@ -790,6 +792,7 @@ pub async fn pageserver_physical_gc(
             &accumulator,
             tenant_manifest_arc,
         )
+        .instrument(info_span!("gc_timeline", %ttid))
     });
     let timelines = timelines.try_buffered(CONCURRENCY);
     let mut timelines = std::pin::pin!(timelines);
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index ba75f25984..77c7987aa7 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -153,7 +153,10 @@ pub async fn scan_pageserver_metadata(
     const CONCURRENCY: usize = 32;
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
+    let timelines = tenants.map_ok(|t| {
+        tracing::info!("Found tenant: {}", t);
+        stream_tenant_timelines(&remote_client, &target, t)
+    });
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();