pageserver: even more debug for test_secondary_downloads (#7295)

The latest failures of test_secondary_downloads are spooky: layers are
missing on disk according to the test, but present according to the
pageserver logs:
- Make the pageserver assert that layers are really present on disk and
log the full path (debug mode only)
- Make the test dump a full listing on failure of the assert that failed
the last two times

Related: #6966
This commit is contained in:
John Spray
2024-04-03 11:23:44 +01:00
committed by GitHub
parent d8da51e78a
commit bc05d7eb9c
2 changed files with 42 additions and 3 deletions

View File

@@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> {
// Existing on-disk layers: just update their access time.
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
tracing::debug!("Layer {} is already on disk", layer.name);
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id)
.join(layer.name.file_name());
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
"Layer {} present at {}, size {}",
layer.name,
local_path,
meta.len(),
);
}
Err(e) => {
tracing::warn!(
"Layer {} not found at {} ({})",
layer.name,
local_path,
e
);
debug_assert!(false);
}
}
}
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{

View File

@@ -498,9 +498,19 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
ps_secondary.http_client().tenant_secondary_download(tenant_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
try:
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
except:
# Do a full listing of the secondary location on errors, to help debug of
# https://github.com/neondatabase/neon/issues/6966
timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id)
for path, _dirs, files in os.walk(timeline_path):
for f in files:
log.info(f"Secondary file: {os.path.join(path, f)}")
raise
# FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while
# walreceiver is still doing something.