From 2ef8e57f865773437a1350964f7d2e83bbab6ad5 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 24 Jul 2024 11:33:44 +0100
Subject: [PATCH] pageserver: maintain gc_info incrementally (#8427)

## Problem

Previously, Timeline::gc_info was only updated in a batch operation at the start of GC. That meant that timelines generally did not have accurate information about who their children were before the first GC, or between GC cycles.

Knowledge of child branches is important for calculating layer visibility in #8398.

## Summary of changes

- Split out part of refresh_gc_info into initialize_gc_info, which is now called early in startup
- Include TimelineId in retain_lsns so that we can later add/remove the LSNs for particular children
- When timelines are added/removed, update their parent's retain_lsns
---
 pageserver/src/tenant.rs                 | 183 ++++++++++++++---------
 pageserver/src/tenant/size.rs            |   4 +-
 pageserver/src/tenant/timeline.rs        |  36 ++++-
 pageserver/src/tenant/timeline/delete.rs |   8 +-
 4 files changed, 152 insertions(+), 79 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index cb3ca9c8b9..a98a32de35 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,6 +31,7 @@ use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
+use std::collections::BTreeMap;
 use std::fmt;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
@@ -95,14 +96,12 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::walredo;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
-use std::collections::BTreeSet;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
 use std::fs::File;
-use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
@@ -1765,6 +1764,9 @@ impl Tenant {
             .values()
             .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
 
+        // Before activation, populate each Timeline's GcInfo with information about its children
+        self.initialize_gc_info(&timelines_accessor);
+
         // Spawn gc and compaction loops. The loops will shut themselves
         // down when they notice that the tenant is inactive.
         tasks::start_background_loops(self, background_jobs_can_start);
@@ -2798,6 +2800,55 @@ impl Tenant {
             .await
     }
 
+    /// Populate all Timelines' `GcInfo` with information about their children. We do not set the
+    /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`]
+    ///
+    /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
+    fn initialize_gc_info(
+        &self,
+        timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
+    ) {
+        // This function must be called before activation: after activation timeline create/delete operations
+        // might happen, and this function is not safe to run concurrently with those.
+        assert!(!self.is_active());
+
+        // Scan all timelines. For each timeline, remember the timeline ID and
+        // the branch point where it was created.
+        let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> = BTreeMap::new();
+        timelines.iter().for_each(|(timeline_id, timeline_entry)| {
+            if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
+                let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
+                ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id));
+            }
+        });
+
+        // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
+        let horizon = self.get_gc_horizon();
+
+        // Populate each timeline's GcInfo with information about its child branches
+        for timeline in timelines.values() {
+            let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
+                .remove(&timeline.timeline_id)
+                .unwrap_or_default();
+
+            branchpoints.sort_by_key(|b| b.0);
+
+            let mut target = timeline.gc_info.write().unwrap();
+
+            target.retain_lsns = branchpoints;
+
+            let space_cutoff = timeline
+                .get_last_record_lsn()
+                .checked_sub(horizon)
+                .unwrap_or(Lsn(0));
+
+            target.cutoffs = GcCutoffs {
+                space: space_cutoff,
+                time: Lsn::INVALID,
+            };
+        }
+    }
+
     async fn refresh_gc_info_internal(
         &self,
         target_timeline_id: Option<TimelineId>,
@@ -2820,6 +2871,11 @@ impl Tenant {
             .cloned()
             .collect::<Vec<_>>();
 
+        if target_timeline_id.is_some() && timelines.is_empty() {
+            // We were to act on a particular timeline and it wasn't found
+            return Err(GcError::TimelineNotFound);
+        }
+
         let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
             HashMap::with_capacity(timelines.len());
 
@@ -2842,68 +2898,63 @@ impl Tenant {
         // because that will stall branch creation.
         let gc_cs = self.gc_cs.lock().await;
 
-        // Scan all timelines. For each timeline, remember the timeline ID and
-        // the branch point where it was created.
-        let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
-            let timelines = self.timelines.lock().unwrap();
-            let mut all_branchpoints = BTreeSet::new();
-            let timelines = {
-                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
-                    if timelines.get(target_timeline_id).is_none() {
-                        return Err(GcError::TimelineNotFound);
+        // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
+        // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
+        // and fail out if it's inaccurate.
+        // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
+        {
+            let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
+                BTreeMap::new();
+            timelines.iter().for_each(|timeline| {
+                if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
+                    let ancestor_children =
+                        all_branchpoints.entry(*ancestor_timeline_id).or_default();
+                    ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
+                }
+            });
+
+            for timeline in &timelines {
+                let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
+                    .remove(&timeline.timeline_id)
+                    .unwrap_or_default();
+
+                branchpoints.sort_by_key(|b| b.0);
+
+                let target = timeline.gc_info.read().unwrap();
+
+                // We require that retain_lsns contains everything in `branchpoints`, but not that
+                // they are exactly equal: timeline deletions can race with us, so retain_lsns
+                // may contain some extra stuff. It is safe to have extra timelines in there, because it
+                // just means that we retain slightly more data than we otherwise might.
+                let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
+                for b in &branchpoints {
+                    if !have_branchpoints.contains(b) {
+                        tracing::error!(
+                            "Bug: `retain_lsns` is set incorrectly. Expected {:?}, but found {:?}",
+                            branchpoints,
+                            target.retain_lsns
+                        );
+                        debug_assert!(false);
+                        // Do not GC based on bad information!
+                        // (ab-use an existing GcError type rather than adding a new one, since this is a
+                        // "should never happen" check that will be removed soon).
+                        return Err(GcError::Remote(anyhow::anyhow!(
+                            "retain_lsns failed validation!"
+                        )));
+                    }
-            };
-
-            timelines
-                .iter()
-                .map(|(_timeline_id, timeline_entry)| {
-                    if let Some(ancestor_timeline_id) =
-                        &timeline_entry.get_ancestor_timeline_id()
-                    {
-                        // If target_timeline is specified, we only need to know branchpoints of its children
-                        if let Some(timeline_id) = target_timeline_id {
-                            if ancestor_timeline_id == &timeline_id {
-                                all_branchpoints.insert((
-                                    *ancestor_timeline_id,
-                                    timeline_entry.get_ancestor_lsn(),
-                                ));
-                            }
-                        }
-                        // Collect branchpoints for all timelines
-                        else {
-                            all_branchpoints.insert((
-                                *ancestor_timeline_id,
-                                timeline_entry.get_ancestor_lsn(),
-                            ));
-                        }
-                    }
-
-                    timeline_entry.clone()
-                })
-                .collect::<Vec<_>>()
-        };
-        (all_branchpoints, timelines)
-    };
+                }
+            }
+        }
 
         // Ok, we now know all the branch points.
         // Update the GC information for each timeline.
         let mut gc_timelines = Vec::with_capacity(timelines.len());
         for timeline in timelines {
-            // If target_timeline is specified, ignore all other timelines
+            // We filtered the timeline list above
             if let Some(target_timeline_id) = target_timeline_id {
-                if timeline.timeline_id != target_timeline_id {
-                    continue;
-                }
+                assert_eq!(target_timeline_id, timeline.timeline_id);
             }
 
-            let branchpoints: Vec<Lsn> = all_branchpoints
-                .range((
-                    Included((timeline.timeline_id, Lsn(0))),
-                    Included((timeline.timeline_id, Lsn(u64::MAX))),
-                ))
-                .map(|&x| x.1)
-                .collect();
-
             {
                 let mut target = timeline.gc_info.write().unwrap();
@@ -2941,20 +2992,12 @@ impl Tenant {
                         .0,
                 );
 
-                match gc_cutoffs.remove(&timeline.timeline_id) {
-                    Some(cutoffs) => {
-                        target.retain_lsns = branchpoints;
-                        target.cutoffs = cutoffs;
-                    }
-                    None => {
-                        // reasons for this being unavailable:
-                        // - this timeline was created while we were finding cutoffs
-                        // - lsn for timestamp search fails for this timeline repeatedly
-                        //
-                        // in both cases, refreshing the branchpoints is correct.
-                        target.retain_lsns = branchpoints;
-                    }
-                };
+                // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
+                // - this timeline was created while we were finding cutoffs
+                // - lsn for timestamp search fails for this timeline repeatedly
+                if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
+                    target.cutoffs = cutoffs.clone();
+                }
             }
 
             gc_timelines.push(timeline);
@@ -4343,7 +4386,7 @@ mod tests {
         {
             let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
            assert_eq!(branchpoints.len(), 1);
-            assert_eq!(branchpoints[0], Lsn(0x40));
+            assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID));
         }
 
         // You can read the key from the child branch even though the parent is
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index e4728ca8a8..41d558d3f6 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -264,10 +264,10 @@ pub(super) async fn gather_inputs(
     let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
         .retain_lsns
         .iter()
-        .filter(|&&lsn| lsn > ancestor_lsn)
+        .filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
         .copied()
         // this assumes there are no other retain_lsns than the branchpoints
-        .map(|lsn| (lsn, LsnKind::BranchPoint))
+        .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
         .collect::<Vec<_>>();
 
     lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 82e8ff02ca..178b707aa7 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -460,7 +460,7 @@ pub(crate) struct GcInfo {
     /// Currently, this includes all points where child branches have
     /// been forked off from. In the future, could also include
     /// explicit user-defined snapshot points.
-    pub(crate) retain_lsns: Vec<Lsn>,
+    pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,
 
     /// The cutoff coordinates, which are combined by selecting the minimum.
     pub(crate) cutoffs: GcCutoffs,
@@ -476,12 +476,21 @@ impl GcInfo {
     pub(crate) fn min_cutoff(&self) -> Lsn {
         self.cutoffs.select_min()
     }
+
+    pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
+        self.retain_lsns.push((child_lsn, child_id));
+        self.retain_lsns.sort_by_key(|i| i.0);
+    }
+
+    pub(super) fn remove_child(&mut self, child_id: TimelineId) {
+        self.retain_lsns.retain(|i| i.1 != child_id);
+    }
 }
 
 /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
 /// is a single number (the oldest LSN which we must retain), but it internally distinguishes
 /// between time-based and space-based retention for observability and consumption metrics purposes.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) struct GcCutoffs {
     /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
     /// history we must keep to retain a specified number of bytes of WAL.
@@ -2307,6 +2316,11 @@ impl Timeline {
             )
         };
 
+        if let Some(ancestor) = &ancestor {
+            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
+            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn());
+        }
+
         Arc::new_cyclic(|myself| {
             let metrics = TimelineMetrics::new(
                 &tenant_shard_id,
@@ -4753,6 +4767,18 @@ impl Timeline {
     }
 }
 
+impl Drop for Timeline {
+    fn drop(&mut self) {
+        if let Some(ancestor) = &self.ancestor_timeline {
+            // This lock should never be poisoned, but in case it is we do a .map() instead of
+            // an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
+            if let Ok(mut gc_info) = ancestor.gc_info.write() {
+                gc_info.remove_child(self.timeline_id)
+            }
+        }
+    }
+}
+
 /// Top-level failure to compact.
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CompactionError {
@@ -5070,7 +5096,11 @@ impl Timeline {
         let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
         let time_cutoff = gc_info.cutoffs.time;
-        let retain_lsns = gc_info.retain_lsns.clone();
+        let retain_lsns = gc_info
+            .retain_lsns
+            .iter()
+            .map(|(lsn, _child_id)| *lsn)
+            .collect();
 
         // Gets the maximum LSN that holds the valid lease.
         //
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 02124ad852..ab6a5f20ba 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces(
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
     tenant: &Tenant,
-    timeline_id: TimelineId,
+    timeline: &Timeline,
     _: &DeletionGuard, // using it as a witness
 ) -> anyhow::Result<()> {
     // Remove the timeline from the map.
     let mut timelines = tenant.timelines.lock().unwrap();
     let children_exist = timelines
         .iter()
-        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
     // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
     // We already deleted the layer files, so it's probably best to panic.
     // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
@@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant(
     }
 
     timelines
-        .remove(&timeline_id)
+        .remove(&timeline.timeline_id)
         .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
 
     drop(timelines);
@@ -414,7 +414,7 @@ impl DeleteTimelineFlow {
 
         pausable_failpoint!("in_progress_delete");
 
-        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
+        remove_timeline_from_tenant(tenant, timeline, &guard).await?;
 
         *guard = Self::Finished;
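
To make the incremental bookkeeping concrete, here is a minimal, self-contained sketch of the `retain_lsns` maintenance this patch introduces: `insert_child` when a branch is created and `remove_child` when a timeline is dropped. The `Lsn`/`TimelineId` aliases and the `GcInfo` struct below are simplified stand-ins for illustration, not the pageserver's real types.

```rust
// Simplified stand-ins for the pageserver's Lsn and TimelineId types.
type Lsn = u64;
type TimelineId = u32;

/// Sketch of the per-timeline GC bookkeeping: each entry is the branch
/// point of one child timeline, kept sorted by LSN.
#[derive(Default, Debug)]
struct GcInfo {
    retain_lsns: Vec<(Lsn, TimelineId)>,
}

impl GcInfo {
    /// Called when a child timeline is created (Timeline::new in the patch).
    fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
        self.retain_lsns.push((child_lsn, child_id));
        self.retain_lsns.sort_by_key(|i| i.0);
    }

    /// Called when a child timeline is dropped (Timeline::drop in the patch).
    fn remove_child(&mut self, child_id: TimelineId) {
        self.retain_lsns.retain(|i| i.1 != child_id);
    }
}

fn main() {
    let mut parent = GcInfo::default();

    // Two children branch off the parent at different LSNs; the list stays
    // sorted by branch-point LSN regardless of insertion order.
    parent.insert_child(2, 0x80);
    parent.insert_child(1, 0x40);
    assert_eq!(parent.retain_lsns, vec![(0x40, 1), (0x80, 2)]);

    // Deleting a child removes its branch point, so GC may advance past it.
    parent.remove_child(1);
    assert_eq!(parent.retain_lsns, vec![(0x80, 2)]);

    println!("retain_lsns after delete: {:?}", parent.retain_lsns);
}
```

Keeping the vector sorted by branch-point LSN means GC code that scans `retain_lsns` in order continues to work unchanged, while tagging each entry with the child's `TimelineId` is what allows deletions to be applied incrementally without rescanning every timeline.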