From 2726b1934ebd9d12d976ce9e9a41783d9ab238a8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 20 Mar 2024 18:07:45 +0000 Subject: [PATCH] pageserver: extra debug for test_secondary_downloads failures (#7183) - Enable debug logs for this test - Add some debug logging detail in downloader.rs - Add an info-level message in scheduler.rs that makes it obvious if a command is waiting for an existing task rather than spawning a new one. --- pageserver/src/tenant/secondary/downloader.rs | 13 ++++++++++++- pageserver/src/tenant/secondary/scheduler.rs | 1 + test_runner/regress/test_pageserver_secondary.py | 4 ++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index a595096133..82af7ed83b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -534,7 +534,11 @@ impl<'a> TenantDownloader<'a> { .await .maybe_fatal_err(&context_msg)?; - tracing::debug!("Wrote local heatmap to {}", heatmap_path); + tracing::debug!( + "Wrote local heatmap to {}, with {} timelines", + heatmap_path, + heatmap.timelines.len() + ); // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general // principle that deletions should be done before writes wherever possible, and so that we can use this @@ -547,6 +551,10 @@ impl<'a> TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { if self.secondary_state.cancel.is_cancelled() { + tracing::debug!( + "Cancelled before downloading timeline {}", + timeline.timeline_id + ); return Ok(()); } @@ -764,10 +772,13 @@ impl<'a> TenantDownloader<'a> { } }; + tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { + tracing::debug!("Cancelled -- dropping out of layer loop"); return Ok(()); } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 58bdb54161..3bd7be782e 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -300,6 +300,7 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + tracing::info!("Command already running, waiting for it"); barrier } else { let running = self.spawn_now(job); diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index e664547b69..2e57136607 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -432,6 +432,10 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): - Eviction of layers on the attached location results in deletion on the secondary location as well. """ + + # For debug of https://github.com/neondatabase/neon/issues/6966 + neon_env_builder.rust_log_override = "DEBUG" + neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3,