diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 1bb80d232c..e4054d9583 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -472,7 +472,6 @@ impl LayeredRepository { } } - // cutoff is if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { let branchpoints: Vec = all_branchpoints .range(( @@ -1453,12 +1452,14 @@ impl LayeredTimeline { let mut layers_to_remove: Vec> = Vec::new(); - // Scan all layer files in the directory. For each file, if a newer file - // exists, we can remove the old one. + // Scan all on-disk layers in the timeline. + // + // Garbage collect the layer if all conditions are satisfied: + // 1. it is older than cutoff LSN; + // 2. it doesn't need to be retained for 'retain_lsns'; + // 3. newer on-disk layer exists (only for non-dropped segments); + // 4. this layer doesn't serve as a tombstone for some older layer; // - // Determine for each file if it needs to be retained - // FIXME: also scan open in-memory layers. Normally we cannot remove the - // latest layer of any seg, but if it was dropped it's possible let mut layers = self.layers.lock().unwrap(); 'outer: for l in layers.iter_historic_layers() { let seg = l.get_seg_tag(); @@ -1469,7 +1470,7 @@ impl LayeredTimeline { result.ondisk_nonrelfiles_total += 1; } - // Is it newer than cutoff point? + // 1. Is it newer than cutoff point? if l.get_end_lsn() > cutoff { info!( "keeping {} {}-{} because it's newer than cutoff {}", @@ -1486,7 +1487,7 @@ impl LayeredTimeline { continue 'outer; } - // 2. Is it needed by any child branch? + // 2. Is it needed by a child branch? for retain_lsn in &retain_lsns { // start_lsn is inclusive and end_lsn is exclusive if l.get_start_lsn() <= *retain_lsn && *retain_lsn < l.get_end_lsn() { @@ -1506,9 +1507,15 @@ impl LayeredTimeline { } } - // Unless the relation was dropped, is there a later image file for this relation? + // 3. 
Is there a later on-disk layer for this relation? if !l.is_dropped() && !layers.newer_image_layer_exists(l.get_seg_tag(), l.get_end_lsn()) { + info!( + "keeping {} {}-{} because it is the latest layer", + seg, + l.get_start_lsn(), + l.get_end_lsn() + ); if seg.rel.is_relation() { result.ondisk_relfiles_not_updated += 1; } else { @@ -1517,6 +1524,77 @@ continue 'outer; } + // 4. Does this layer serve as a tombstone for some older layer? + if l.is_dropped() { + let prior_lsn = l.get_start_lsn().checked_sub(1u64).unwrap(); + + // Check if this layer serves as a tombstone for this timeline + // We have to do this separately from timeline check below, + // because LayerMap of this timeline is already locked. + let mut is_tombstone = layers.layer_exists_at_lsn(l.get_seg_tag(), prior_lsn); + if is_tombstone { + info!( + "earlier layer exists at {} in {}", + prior_lsn, self.timelineid + ); + } + // Now check ancestor timelines, if any + else if let Some(ancestor) = &self.ancestor_timeline { + let prior_lsn = ancestor.get_last_record_lsn(); + if seg.rel.is_blocky() { + info!( + "check blocky relish size {} at {} in {} for layer {}-{}", + seg, + prior_lsn, + ancestor.timelineid, + l.get_start_lsn(), + l.get_end_lsn() + ); + match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() { + Some(size) => { + let last_live_seg = SegmentTag::from_blknum(seg.rel, size - 1); + info!( + "blocky rel size is {} last_live_seg.segno {} seg.segno {}", + size, last_live_seg.segno, seg.segno + ); + if last_live_seg.segno >= seg.segno { + is_tombstone = true; + } + } + _ => { + info!("blocky rel doesn't exist"); + } + } + } else { + info!( + "check non-blocky relish existence {} at {} in {} for layer {}-{}", + seg, + prior_lsn, + ancestor.timelineid, + l.get_start_lsn(), + l.get_end_lsn() + ); + is_tombstone = ancestor.get_rel_exists(seg.rel, prior_lsn).unwrap_or(false); + } + } + + if is_tombstone { + info!( + "keeping {} {}-{} because this layer serves as a 
tombstone for older layer", + seg, + l.get_start_lsn(), + l.get_end_lsn() + ); + + if seg.rel.is_relation() { + result.ondisk_relfiles_needed_as_tombstone += 1; + } else { + result.ondisk_nonrelfiles_needed_as_tombstone += 1; + } + continue 'outer; + } + } + // We didn't find any reason to keep this file, so remove it. info!( "garbage collecting {} {}-{} {}", diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index bd38e9f57a..fc3014ec3a 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -174,6 +174,20 @@ impl LayerMap { } } + /// Is there any layer for a given segment that is alive at the lsn? + /// + /// This is a public wrapper for SegEntry function, + /// used for garbage collection, to determine if some alive layer + /// exists at the lsn. If so, we shouldn't delete a newer dropped layer + /// to avoid incorrectly making it visible. + pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> bool { + if let Some(segentry) = self.segs.get(&seg) { + segentry.exists_at_lsn(lsn).unwrap_or(false) + } else { + false + } + } + /// Return the oldest in-memory layer, along with its generation number. 
pub fn peek_oldest_open(&self) -> Option<(Arc, u64)> { self.open_layers diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a8ea76dda3..39562e7c33 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -654,12 +654,14 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"), RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"), RowDescriptor::int8_col(b"layer_relfiles_not_updated"), + RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"), RowDescriptor::int8_col(b"layer_relfiles_removed"), RowDescriptor::int8_col(b"layer_relfiles_dropped"), RowDescriptor::int8_col(b"layer_nonrelfiles_total"), RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"), RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"), RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"), + RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"), RowDescriptor::int8_col(b"layer_nonrelfiles_removed"), RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"), RowDescriptor::int8_col(b"elapsed"), @@ -679,6 +681,12 @@ impl postgres_backend::Handler for PageServerHandler { .as_bytes(), ), Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()), + Some( + result + .ondisk_relfiles_needed_as_tombstone + .to_string() + .as_bytes(), + ), Some(result.ondisk_relfiles_removed.to_string().as_bytes()), Some(result.ondisk_relfiles_dropped.to_string().as_bytes()), Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()), @@ -695,6 +703,12 @@ impl postgres_backend::Handler for PageServerHandler { .as_bytes(), ), Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()), + Some( + result + .ondisk_nonrelfiles_needed_as_tombstone + .to_string() + .as_bytes(), + ), Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()), Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()), 
Some(result.elapsed.as_millis().to_string().as_bytes()), diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index fac9fc3786..1d3c4b4ed8 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -48,6 +48,7 @@ pub struct GcResult { pub ondisk_relfiles_needed_by_cutoff: u64, pub ondisk_relfiles_needed_by_branches: u64, pub ondisk_relfiles_not_updated: u64, + pub ondisk_relfiles_needed_as_tombstone: u64, pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped @@ -55,6 +56,7 @@ pub struct GcResult { pub ondisk_nonrelfiles_needed_by_cutoff: u64, pub ondisk_nonrelfiles_needed_by_branches: u64, pub ondisk_nonrelfiles_not_updated: u64, + pub ondisk_nonrelfiles_needed_as_tombstone: u64, pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped @@ -67,6 +69,7 @@ impl AddAssign for GcResult { self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff; self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches; self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated; + self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone; self.ondisk_relfiles_removed += other.ondisk_relfiles_removed; self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped; @@ -74,6 +77,7 @@ impl AddAssign for GcResult { self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff; self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches; self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated; + self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone; self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed; self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped; diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py index 957194d7d5..690d999b6f 100644 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ b/test_runner/batch_others/test_snapfiles_gc.py @@ -6,8 +6,8 @@ pytest_plugins = ("fixtures.zenith_fixtures") def print_gc_result(row): print("GC duration {elapsed} ms".format_map(row)); - print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row)) - print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: 
{layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row)) + print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row)) + print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row)) # @@ -113,12 +113,19 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): row = pscur.fetchone() print_gc_result(row); + # We still cannot remove the latest layers + # because they serve as tombstones for earlier layers. + assert row['layer_relfiles_dropped'] == 0 # Each relation fork is counted separately, hence 3. - assert row['layer_relfiles_dropped'] == 3 + assert row['layer_relfiles_needed_as_tombstone'] == 3 # The catalog updates also create new layer files of the catalogs, which # are counted as 'removed' assert row['layer_relfiles_removed'] > 0 + # TODO Change the test to check actual GC of dropped layers. + # Each relation fork is counted separately, hence 3. + #assert row['layer_relfiles_dropped'] == 3 + # TODO: perhaps we should count catalog and user relations separately, # to make this kind of testing more robust