diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index cb3ca9c8b9..a98a32de35 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,6 +31,7 @@ use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
+use std::collections::BTreeMap;
 use std::fmt;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
@@ -95,14 +96,12 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::walredo;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
-use std::collections::BTreeSet;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
 use std::fs::File;
-use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
@@ -1765,6 +1764,9 @@ impl Tenant {
             .values()
             .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
 
+        // Before activation, populate each Timeline's GcInfo with information about its children
+        self.initialize_gc_info(&timelines_accessor);
+
         // Spawn gc and compaction loops. The loops will shut themselves
         // down when they notice that the tenant is inactive.
         tasks::start_background_loops(self, background_jobs_can_start);
@@ -2798,6 +2800,55 @@ impl Tenant {
         .await
     }
 
+    /// Populate all Timelines' `GcInfo` with information about their children. We do not set the
+    /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`].
+    ///
+    /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
+    fn initialize_gc_info(
+        &self,
+        timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
+    ) {
+        // This function must be called before activation: after activation timeline create/delete operations
+        // might happen, and this function is not safe to run concurrently with those.
+        assert!(!self.is_active());
+
+        // Scan all timelines. For each timeline, remember the timeline ID and
+        // the branch point where it was created.
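+        // Children are grouped under their ancestor's TimelineId so that each parent's
+        // complete list of branchpoints can be attached in a single pass below.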
+        let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> = BTreeMap::new();
+        timelines.iter().for_each(|(timeline_id, timeline_entry)| {
+            if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
+                let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
+                ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id));
+            }
+        });
+
+        // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
+        let horizon = self.get_gc_horizon();
+
+        // Populate each timeline's GcInfo with information about its child branches
+        for timeline in timelines.values() {
+            let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
+                .remove(&timeline.timeline_id)
+                .unwrap_or_default();
+
+            branchpoints.sort_by_key(|b| b.0);
+
+            let mut target = timeline.gc_info.write().unwrap();
+
+            target.retain_lsns = branchpoints;
+
+            let space_cutoff = timeline
+                .get_last_record_lsn()
+                .checked_sub(horizon)
+                .unwrap_or(Lsn(0));
+
+            target.cutoffs = GcCutoffs {
+                space: space_cutoff,
+                time: Lsn::INVALID,
+            };
+        }
+    }
+
     async fn refresh_gc_info_internal(
         &self,
         target_timeline_id: Option<TimelineId>,
@@ -2820,6 +2871,11 @@ impl Tenant {
             .cloned()
             .collect::<Vec<_>>();
 
+        if target_timeline_id.is_some() && timelines.is_empty() {
+            // We were asked to act on a particular timeline, but it wasn't found
+            return Err(GcError::TimelineNotFound);
+        }
+
         let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
             HashMap::with_capacity(timelines.len());
 
@@ -2842,68 +2898,63 @@ impl Tenant {
         // because that will stall branch creation.
         let gc_cs = self.gc_cs.lock().await;
 
-        // Scan all timelines. For each timeline, remember the timeline ID and
-        // the branch point where it was created.
-        let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
-            let timelines = self.timelines.lock().unwrap();
-            let mut all_branchpoints = BTreeSet::new();
-            let timelines = {
-                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
-                    if timelines.get(target_timeline_id).is_none() {
-                        return Err(GcError::TimelineNotFound);
+        // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
+        // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
+        // and fail out if it's inaccurate.
+        // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
+        {
+            let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
+                BTreeMap::new();
+            timelines.iter().for_each(|timeline| {
+                if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
+                    let ancestor_children =
+                        all_branchpoints.entry(*ancestor_timeline_id).or_default();
+                    ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
+                }
+            });
+
+            for timeline in &timelines {
+                let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
+                    .remove(&timeline.timeline_id)
+                    .unwrap_or_default();
+
+                branchpoints.sort_by_key(|b| b.0);
+
+                let target = timeline.gc_info.read().unwrap();
+
+                // We require that retain_lsns contains everything in `branchpoints`, but not that
+                // they are exactly equal: timeline deletions can race with us, so retain_lsns
+                // may contain some extra stuff. It is safe to have extra timelines in there, because it
+                // just means that we retain slightly more data than we otherwise might.
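+                // (i.e. we only check that `branchpoints` is a subset of `retain_lsns`,
+                // not that the two are equal.)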
+                let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
+                for b in &branchpoints {
+                    if !have_branchpoints.contains(b) {
+                        tracing::error!(
+                            "Bug: `retain_lsns` is set incorrectly. Expected to be {:?}, but found {:?}",
+                            branchpoints,
+                            target.retain_lsns
+                        );
+                        debug_assert!(false);
+                        // Do not GC based on bad information!
+                        // (abuse an existing GcError type rather than adding a new one, since this is a
+                        // "should never happen" check that will be removed soon).
+                        return Err(GcError::Remote(anyhow::anyhow!(
+                            "retain_lsns failed validation!"
+                        )));
                     }
-                };
-
-                timelines
-                    .iter()
-                    .map(|(_timeline_id, timeline_entry)| {
-                        if let Some(ancestor_timeline_id) =
-                            &timeline_entry.get_ancestor_timeline_id()
-                        {
-                            // If target_timeline is specified, we only need to know branchpoints of its children
-                            if let Some(timeline_id) = target_timeline_id {
-                                if ancestor_timeline_id == &timeline_id {
-                                    all_branchpoints.insert((
-                                        *ancestor_timeline_id,
-                                        timeline_entry.get_ancestor_lsn(),
-                                    ));
-                                }
-                            }
-                            // Collect branchpoints for all timelines
-                            else {
-                                all_branchpoints.insert((
-                                    *ancestor_timeline_id,
-                                    timeline_entry.get_ancestor_lsn(),
-                                ));
-                            }
-                        }
-
-                        timeline_entry.clone()
-                    })
-                    .collect::<Vec<_>>()
-            };
-            (all_branchpoints, timelines)
-        };
+                }
+            }
+        }
 
         // Ok, we now know all the branch points.
         // Update the GC information for each timeline.
         let mut gc_timelines = Vec::with_capacity(timelines.len());
         for timeline in timelines {
-            // If target_timeline is specified, ignore all other timelines
+            // We filtered the timeline list above
             if let Some(target_timeline_id) = target_timeline_id {
-                if timeline.timeline_id != target_timeline_id {
-                    continue;
-                }
+                assert_eq!(target_timeline_id, timeline.timeline_id);
             }
 
-            let branchpoints: Vec<Lsn> = all_branchpoints
-                .range((
-                    Included((timeline.timeline_id, Lsn(0))),
-                    Included((timeline.timeline_id, Lsn(u64::MAX))),
-                ))
-                .map(|&x| x.1)
-                .collect();
-
             {
                 let mut target = timeline.gc_info.write().unwrap();
@@ -2941,20 +2992,12 @@ impl Tenant {
                         .0,
                 );
 
-                match gc_cutoffs.remove(&timeline.timeline_id) {
-                    Some(cutoffs) => {
-                        target.retain_lsns = branchpoints;
-                        target.cutoffs = cutoffs;
-                    }
-                    None => {
-                        // reasons for this being unavailable:
-                        // - this timeline was created while we were finding cutoffs
-                        // - lsn for timestamp search fails for this timeline repeatedly
-                        //
-                        // in both cases, refreshing the branchpoints is correct.
-                        target.retain_lsns = branchpoints;
-                    }
-                };
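+                // (retain_lsns is not assigned here any more: it is populated at activation by
+                // initialize_gc_info and kept up to date by Timeline::new and Timeline::drop.)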
+                // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
+                // - this timeline was created while we were finding cutoffs
+                // - lsn for timestamp search fails for this timeline repeatedly
+                if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
+                    target.cutoffs = cutoffs.clone();
+                }
             }
 
             gc_timelines.push(timeline);
@@ -4343,7 +4386,7 @@ mod tests {
         {
             let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
             assert_eq!(branchpoints.len(), 1);
-            assert_eq!(branchpoints[0], Lsn(0x40));
+            assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID));
         }
 
         // You can read the key from the child branch even though the parent is
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index e4728ca8a8..41d558d3f6 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -264,10 +264,10 @@ pub(super) async fn gather_inputs(
     let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
         .retain_lsns
         .iter()
-        .filter(|&&lsn| lsn > ancestor_lsn)
+        .filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
         .copied()
         // this assumes there are no other retain_lsns than the branchpoints
-        .map(|lsn| (lsn, LsnKind::BranchPoint))
+        .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
        .collect::<Vec<_>>();
 
     lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 82e8ff02ca..178b707aa7 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -460,7 +460,7 @@ pub(crate) struct GcInfo {
     /// Currently, this includes all points where child branches have
     /// been forked off from. In the future, could also include
     /// explicit user-defined snapshot points.
-    pub(crate) retain_lsns: Vec<Lsn>,
+    pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,
 
     /// The cutoff coordinates, which are combined by selecting the minimum.
     pub(crate) cutoffs: GcCutoffs,
@@ -476,12 +476,21 @@ impl GcInfo {
     pub(crate) fn min_cutoff(&self) -> Lsn {
         self.cutoffs.select_min()
     }
+
+    pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
+        self.retain_lsns.push((child_lsn, child_id));
+        self.retain_lsns.sort_by_key(|i| i.0);
+    }
+
+    pub(super) fn remove_child(&mut self, child_id: TimelineId) {
+        self.retain_lsns.retain(|i| i.1 != child_id);
+    }
 }
 
 /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
 /// is a single number (the oldest LSN which we must retain), but it internally distinguishes
 /// between time-based and space-based retention for observability and consumption metrics purposes.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) struct GcCutoffs {
     /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
     /// history we must keep to retain a specified number of bytes of WAL.
@@ -2307,6 +2316,11 @@ impl Timeline {
             )
         };
 
+        if let Some(ancestor) = &ancestor {
+            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
+            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn());
+        }
+
         Arc::new_cyclic(|myself| {
             let metrics = TimelineMetrics::new(
                 &tenant_shard_id,
@@ -4753,6 +4767,18 @@ impl Timeline {
     }
 }
 
+impl Drop for Timeline {
+    fn drop(&mut self) {
+        if let Some(ancestor) = &self.ancestor_timeline {
+            // This lock should never be poisoned, but in case it is, we use `if let Ok` rather
+            // than an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
+            if let Ok(mut gc_info) = ancestor.gc_info.write() {
+                gc_info.remove_child(self.timeline_id)
+            }
+        }
+    }
+}
+
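+// NOTE: `Timeline::drop` runs when the last `Arc<Timeline>` reference is released, which may be
+// somewhat later than the timeline's removal from the tenant's map. Until then the parent merely
+// retains slightly more data than strictly necessary, which is safe for the same reason as the
+// superset rule for `retain_lsns` in `refresh_gc_info_internal`.
+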
 /// Top-level failure to compact.
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CompactionError {
@@ -5070,7 +5096,11 @@ impl Timeline {
         let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
         let time_cutoff = gc_info.cutoffs.time;
-        let retain_lsns = gc_info.retain_lsns.clone();
+        let retain_lsns = gc_info
+            .retain_lsns
+            .iter()
+            .map(|(lsn, _child_id)| *lsn)
+            .collect();
 
         // Gets the maximum LSN that holds the valid lease.
         //
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 02124ad852..ab6a5f20ba 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces(
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
     tenant: &Tenant,
-    timeline_id: TimelineId,
+    timeline: &Timeline,
     _: &DeletionGuard, // using it as a witness
 ) -> anyhow::Result<()> {
     // Remove the timeline from the map.
     let mut timelines = tenant.timelines.lock().unwrap();
     let children_exist = timelines
         .iter()
-        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
     // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
     // We already deleted the layer files, so it's probably best to panic.
     // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
@@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant(
     }
 
     timelines
-        .remove(&timeline_id)
+        .remove(&timeline.timeline_id)
         .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
 
     drop(timelines);
@@ -414,7 +414,7 @@ impl DeleteTimelineFlow {
 
         pausable_failpoint!("in_progress_delete");
 
-        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
+        remove_timeline_from_tenant(tenant, timeline, &guard).await?;
 
         *guard = Self::Finished;
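
A minimal, self-contained sketch (not part of the patch) of the `retain_lsns` bookkeeping
contract introduced above. `TimelineId`, `Lsn`, and `GcInfo` below are simplified stand-ins
for the real pageserver types, with `TimelineId` modeled as a plain u64; the sketch only
illustrates the invariant that `insert_child` keeps the list sorted by LSN while
`remove_child` unpins a deleted child.

    // Simplified stand-ins for illustration; the real types live in the pageserver crate.
    type TimelineId = u64;

    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
    struct Lsn(u64);

    #[derive(Default)]
    struct GcInfo {
        /// (branchpoint LSN, child timeline id), kept sorted by LSN as in the patch.
        retain_lsns: Vec<(Lsn, TimelineId)>,
    }

    impl GcInfo {
        /// Called when a child timeline is created (cf. `Timeline::new` above).
        fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
            self.retain_lsns.push((child_lsn, child_id));
            self.retain_lsns.sort_by_key(|i| i.0);
        }

        /// Called when a child timeline goes away (cf. `Timeline::drop` above).
        fn remove_child(&mut self, child_id: TimelineId) {
            self.retain_lsns.retain(|i| i.1 != child_id);
        }
    }

    fn main() {
        let mut parent = GcInfo::default();
        parent.insert_child(2, Lsn(0x40));
        parent.insert_child(3, Lsn(0x20)); // out-of-order insert still yields a sorted list
        assert_eq!(parent.retain_lsns, vec![(Lsn(0x20), 3), (Lsn(0x40), 2)]);

        parent.remove_child(3); // deleted child no longer pins the parent's GC cutoff
        assert_eq!(parent.retain_lsns, vec![(Lsn(0x40), 2)]);
    }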