diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d3c8c423e4..d8019b08e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,8 +51,8 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, - #[strum(serialize = "update gc info")] - UpdateGcInfo, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, #[strum(serialize = "create tenant")] CreateTenant, @@ -1989,7 +1989,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, - pub update_gc_info_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2050,8 +2050,8 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let update_gc_info_histo = StorageTimeMetrics::new( - StorageTimeOperation::UpdateGcInfo, + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, &tenant_id, &shard_id, &timeline_id, @@ -2098,7 +2098,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, - update_gc_info_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 05ceff2b59..a6cd1471ff 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -62,6 +62,7 @@ use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; +use self::timeline::GcInfo; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; use crate::config::PageServerConf; @@ -86,7 +87,6 @@ use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -2886,9 +2886,12 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + + *timeline.gc_info.write().unwrap() = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; gc_timelines.push(timeline); } @@ -2977,7 +2980,7 @@ impl Tenant { // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + let cutoff = gc_info.min_cutoff(); if start_lsn < cutoff { return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" @@ -4513,18 +4516,20 @@ mod tests { } async fn bulk_insert_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, lsn: Lsn, repeat: usize, key_count: usize, ) -> anyhow::Result<()> { let compact = true; - bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await + bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } async fn bulk_insert_maybe_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, @@ -4537,6 +4542,8 @@ mod tests { // Enforce that key range is monotonously increasing let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; @@ -4558,24 +4565,19 @@ mod tests { blknum += 1; } - let cutoff = timeline.get_last_record_lsn(); - - timeline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - ctx, - ) - .await?; timeline.freeze_and_flush().await?; if compact { - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) - .await?; + // this requires timeline to be &Arc + timeline.compact(&cancel, EnumSet::empty(), ctx).await?; } - timeline.gc().await?; + + // this doesn't really need to use the timeline_id target, but it is closer to what it + // originally was. + let res = tenant + .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) + .await?; + + assert_eq!(res.layers_removed, 0, "this never removes anything"); } Ok(()) @@ -4594,7 +4596,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; Ok(()) } @@ -4625,7 +4627,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -5079,6 +5081,7 @@ mod tests { .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5138,18 +5141,10 @@ mod tests { } // Perform a cycle of flush, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline.gc().await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; } Ok(()) @@ -5170,6 +5165,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. let mut updated = [Lsn(0); NUM_KEYS]; @@ -5233,21 +5230,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5452,7 +5439,7 @@ mod tests { let lsn = Lsn(0x10); let compact = false; - bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?; + bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index f521dfa55d..974c1091fd 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -192,7 +192,9 @@ pub(super) async fn gather_inputs( // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // horizon_cutoff. - let mut next_gc_cutoff = gc_info.pitr_cutoff; + let pitr_cutoff = gc_info.cutoffs.pitr; + let horizon_cutoff = gc_info.cutoffs.horizon; + let mut next_gc_cutoff = pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { @@ -297,8 +299,8 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, + horizon_cutoff, + pitr_cutoff, next_gc_cutoff, retention_param_cutoff, }); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 22bfa53445..7aeb3a6a59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -325,7 +325,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: std::sync::RwLock, + pub(crate) gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -409,33 +409,59 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// /// Information about how much history needs to be retained, needed by /// Garbage Collection. -/// -pub struct GcInfo { +#[derive(Default)] +pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub retain_lsns: Vec, + pub(crate) retain_lsns: Vec, - /// In addition to 'retain_lsns', keep everything newer than this - /// point. + /// The cutoff coordinates, which are combined by selecting the minimum. + pub(crate) cutoffs: GcCutoffs, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. +#[derive(Debug)] +pub(crate) struct GcCutoffs { + /// Keep everything newer than this point. /// /// This is calculated by subtracting 'gc_horizon' setting from /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - pub horizon_cutoff: Lsn, + pub(crate) horizon: Lsn, /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this /// point. /// /// This is calculated by finding a number such that a record is needed for PITR /// if only if its LSN is larger than 'pitr_cutoff'. - pub pitr_cutoff: Lsn, + pub(crate) pitr: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + horizon: Lsn::INVALID, + pitr: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.horizon, self.pitr) + } } /// An error happened in a get() operation. @@ -1155,7 +1181,7 @@ impl Timeline { " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later + // Sequential get runs after vectored get, so it is possible for the later // to time out while waiting for its ancestor's Lsn to become ready and for the // former to succeed (it essentially has a doubled wait time). }, @@ -2097,11 +2123,7 @@ impl Timeline { write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -4383,7 +4405,7 @@ impl Timeline { Ok(()) } - /// Update information about which layer files need to be retained on + /// Find the Lsns above which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete /// page versions more aggressively. @@ -4391,17 +4413,6 @@ impl Timeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart @@ -4409,26 +4420,17 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - /// - /// NOTE: This function holds a short-lived lock to protect the 'gc_info' - /// field, so that the three values passed as argument are stored - /// atomically. But the caller is responsible for ensuring that no new - /// branches are created that would need to be included in 'retain_lsns', - /// for example. The caller should hold `Tenant::gc_cs` lock to ensure - /// that. - /// #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(super) async fn update_gc_info( + pub(super) async fn find_gc_cutoffs( &self, - retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let _timer = self .metrics - .update_gc_info_histo + .find_gc_cutoffs_histo .start_timer() .record_on_drop(); @@ -4481,14 +4483,10 @@ impl Timeline { self.get_last_record_lsn() }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(GcCutoffs { + horizon: cutoff_horizon, + pitr: pitr_cutoff, + }) } /// Garbage collect layer files on a timeline that are no longer needed. @@ -4517,8 +4515,8 @@ impl Timeline { let (horizon_cutoff, pitr_cutoff, retain_lsns) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; + let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); (horizon_cutoff, pitr_cutoff, retain_lsns) };