diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs
index 1747995d2d..ac5fb0bc8c 100644
--- a/pageserver/src/storage_sync.rs
+++ b/pageserver/src/storage_sync.rs
@@ -1557,6 +1557,7 @@ fn schedule_first_sync_tasks(
     local_timeline_init_statuses
 }
 
+/// bool in return value stands for awaits_download
 fn compare_local_and_remote_timeline(
     new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>,
     sync_id: ZTenantTimelineId,
@@ -1566,14 +1567,6 @@ fn compare_local_and_remote_timeline(
 ) -> (LocalTimelineInitStatus, bool) {
     let remote_files = remote_entry.stored_files();
 
-    // TODO probably here we need more sophisticated logic,
-    // if more data is available remotely can we just download what's there?
-    // without trying to upload something. It may be tricky, needs further investigation.
-    // For now looks strange that we can request upload
-    // and download for the same timeline simultaneously.
-    // (upload needs to be only for previously unsynced files, not whole timeline dir).
-    // If one of the tasks fails they will be reordered in the queue which can lead
-    // to timeline being stuck in evicted state
     let number_of_layers_to_download = remote_files.difference(&local_files).count();
     let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 {
         new_sync_tasks.push_back((
diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs
index 1700d6c9c9..a91eaaa7ca 100644
--- a/pageserver/src/storage_sync/download.rs
+++ b/pageserver/src/storage_sync/download.rs
@@ -3,6 +3,7 @@
 use std::{
     collections::{HashMap, HashSet},
     fmt::Debug,
+    mem,
     path::Path,
 };
 
@@ -32,10 +33,29 @@ pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
 // Poisoned variant is returned.
 // When data is received succesfully without errors Present variant is used.
 pub enum TenantIndexParts {
-    Poisoned(ZTenantTimelineId),
+    Poisoned {
+        present: HashMap<ZTimelineId, IndexPart>,
+        missing: HashSet<ZTimelineId>,
+    },
     Present(HashMap<ZTimelineId, IndexPart>),
 }
 
+impl TenantIndexParts {
+    fn add_poisoned(&mut self, timeline_id: ZTimelineId) {
+        match self {
+            TenantIndexParts::Poisoned { missing, .. } => {
+                missing.insert(timeline_id);
+            }
+            TenantIndexParts::Present(present) => {
+                *self = TenantIndexParts::Poisoned {
+                    present: mem::take(present),
+                    missing: HashSet::from([timeline_id]),
+                }
+            }
+        }
+    }
+}
+
 impl Default for TenantIndexParts {
     fn default() -> Self {
         TenantIndexParts::Present(HashMap::default())
@@ -63,8 +83,8 @@ where
         Ok(index_part) => {
             debug!("Successfully fetched index part for {id}");
             match index_parts.entry(id.tenant_id).or_default() {
-                TenantIndexParts::Poisoned(id) => {
-                    warn!("disgarding index part for poisoned tenant, poisoned by id: {id}")
+                TenantIndexParts::Poisoned { present, .. } => {
+                    present.insert(id.timeline_id, index_part);
                 }
                 TenantIndexParts::Present(parts) => {
                     parts.insert(id.timeline_id, index_part);
@@ -77,8 +97,8 @@ where
             // thats ok because it means that we didnt upload something we have locally for example
         }
         e => {
-            *index_parts.entry(id.tenant_id).or_default() =
-                TenantIndexParts::Poisoned(id);
+            let tenant_parts = index_parts.entry(id.tenant_id).or_default();
+            tenant_parts.add_poisoned(id.timeline_id);
             error!(
                 "Failed to fetch index part for {id}: {e} poisoning tenant index parts"
             );
@@ -144,8 +164,8 @@ where
         .remove(&tenant_id)
         .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))?
     {
-        TenantIndexParts::Poisoned(id) => {
-            anyhow::bail!("failed to download index parts for all timeline: {id}")
+        TenantIndexParts::Poisoned { missing, .. } => {
+            anyhow::bail!("Failed to download index parts for all timelines. Missing {missing:?}")
        }
         TenantIndexParts::Present(parts) => Ok(parts),
     }
diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs
index 4182d58ce6..134ae893bc 100644
--- a/pageserver/src/storage_sync/index.rs
+++ b/pageserver/src/storage_sync/index.rs
@@ -97,8 +97,8 @@ impl RemoteIndex {
 
         for (tenant_id, index_parts) in index_parts {
             match index_parts {
-                // TODO: should we schedule a retry so it can be recovered? otherwise there is no way to revive it other restarting whole pageserver
-                TenantIndexParts::Poisoned(id) => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline {id}"),
+                // TODO: should we schedule a retry so it can be recovered? otherwise we can revive it only through detach/attach or pageserver restart
+                TenantIndexParts::Poisoned { missing, ..} => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline(s): {missing:?}"),
                 TenantIndexParts::Present(timelines) => {
                     for (timeline_id, index_part) in timelines {
                         let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
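
For reviewers, here is a minimal standalone sketch of the state transition the new add_poisoned helper in download.rs implements. It is not pageserver code: TimelineId and IndexPart below are plain u64/String stand-ins for ZTimelineId/IndexPart, introduced only so the example compiles on its own. The point it illustrates is that the first failed index_part download flips a tenant from Present to Poisoned while keeping everything already fetched, and later failures only extend the missing set.

use std::collections::{HashMap, HashSet};
use std::mem;

// Stand-ins for the pageserver's ZTimelineId / IndexPart (assumption, for illustration only).
type TimelineId = u64;
type IndexPart = String;

#[derive(Debug)]
enum TenantIndexParts {
    // At least one timeline's index part failed to download for this tenant.
    Poisoned {
        present: HashMap<TimelineId, IndexPart>,
        missing: HashSet<TimelineId>,
    },
    // Every index part fetched so far succeeded.
    Present(HashMap<TimelineId, IndexPart>),
}

impl TenantIndexParts {
    fn add_poisoned(&mut self, timeline_id: TimelineId) {
        match self {
            // Already poisoned: just record one more missing timeline.
            TenantIndexParts::Poisoned { missing, .. } => {
                missing.insert(timeline_id);
            }
            // First failure: keep everything downloaded so far in `present`.
            TenantIndexParts::Present(present) => {
                *self = TenantIndexParts::Poisoned {
                    present: mem::take(present),
                    missing: HashSet::from([timeline_id]),
                }
            }
        }
    }
}

fn main() {
    let mut parts = TenantIndexParts::Present(HashMap::from([(1, "index_part_1".to_owned())]));
    parts.add_poisoned(2); // Present -> Poisoned, timeline 1 stays in `present`
    parts.add_poisoned(3); // already Poisoned, only `missing` grows
    println!("{parts:?}");
}

Running this prints the Poisoned variant with timeline 1 still under present and timelines 2 and 3 under missing, which is the same information the updated bail!/warn! messages now surface instead of a single poisoning timeline id.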