From a4fc6da57be2f7e4afeb01b1f05f2a6955cc155d Mon Sep 17 00:00:00 2001
From: anastasia
Date: Fri, 10 Sep 2021 17:04:16 +0300
Subject: [PATCH] Fix gc_internal to handle dropped layers.

Some dropped layers serve as tombstones for earlier layers and thus
cannot be garbage collected. Add new fields to GcResult for layers that
are preserved as tombstones.
---
 pageserver/src/layered_repository.rs          | 96 +++++++++++++++++--
 .../src/layered_repository/layer_map.rs       | 14 +++
 pageserver/src/page_service.rs                | 14 +++
 pageserver/src/repository.rs                  |  4 +
 test_runner/batch_others/test_snapfiles_gc.py | 13 ++-
 5 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs
index 1bb80d232c..e4054d9583 100644
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -472,7 +472,6 @@ impl LayeredRepository {
             }
         }
 
-        // cutoff is
         if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
             let branchpoints: Vec<Lsn> = all_branchpoints
                 .range((
@@ -1453,12 +1452,14 @@ impl LayeredTimeline {
 
         let mut layers_to_remove: Vec<Arc<dyn Layer>> = Vec::new();
 
-        // Scan all layer files in the directory. For each file, if a newer file
-        // exists, we can remove the old one.
+        // Scan all on-disk layers in the timeline.
+        //
+        // Garbage collect the layer if all conditions are satisfied:
+        // 1. it is older than the cutoff LSN;
+        // 2. it doesn't need to be retained for 'retain_lsns';
+        // 3. a newer on-disk layer exists (only for non-dropped segments);
+        // 4. it doesn't serve as a tombstone for some older layer.
         //
-        // Determine for each file if it needs to be retained
-        // FIXME: also scan open in-memory layers. Normally we cannot remove the
-        // latest layer of any seg, but if it was dropped it's possible
         let mut layers = self.layers.lock().unwrap();
         'outer: for l in layers.iter_historic_layers() {
             let seg = l.get_seg_tag();
@@ -1469,7 +1470,7 @@
                 result.ondisk_nonrelfiles_total += 1;
             }
 
-            // Is it newer than cutoff point?
+            // 1. Is it newer than the cutoff point?
             if l.get_end_lsn() > cutoff {
                 info!(
                     "keeping {} {}-{} because it's newer than cutoff {}",
@@ -1486,7 +1487,7 @@
                 continue 'outer;
             }
 
-            // 2. Is it needed by any child branch?
+            // 2. Is it needed by a child branch?
             for retain_lsn in &retain_lsns {
                 // start_lsn is inclusive and end_lsn is exclusive
                 if l.get_start_lsn() <= *retain_lsn && *retain_lsn < l.get_end_lsn() {
@@ -1506,9 +1507,15 @@
                 }
             }
 
-            // Unless the relation was dropped, is there a later image file for this relation?
+            // 3. Is there a later on-disk layer for this relation?
             if !l.is_dropped()
                 && !layers.newer_image_layer_exists(l.get_seg_tag(), l.get_end_lsn())
             {
+                info!(
+                    "keeping {} {}-{} because it is the latest layer",
+                    seg,
+                    l.get_start_lsn(),
+                    l.get_end_lsn()
+                );
                 if seg.rel.is_relation() {
                     result.ondisk_relfiles_not_updated += 1;
                 } else {
@@ -1517,6 +1524,77 @@
                     result.ondisk_nonrelfiles_not_updated += 1;
                 }
                 continue 'outer;
             }
 
+            // 4. Does this layer serve as a tombstone for some older layer?
+            if l.is_dropped() {
+                let prior_lsn = l.get_start_lsn().checked_sub(1u64).unwrap();
+
+                // Check whether this layer serves as a tombstone on this timeline.
+                // We have to do this separately from the ancestor timeline check below,
+                // because the LayerMap of this timeline is already locked.
+                let mut is_tombstone = layers.layer_exists_at_lsn(l.get_seg_tag(), prior_lsn);
+                if is_tombstone {
+                    info!(
+                        "earlier layer exists at {} in {}",
+                        prior_lsn, self.timelineid
+                    );
+                }
+                // Now check ancestor timelines, if any
+                else if let Some(ancestor) = &self.ancestor_timeline {
+                    let prior_lsn = ancestor.get_last_record_lsn();
+                    if seg.rel.is_blocky() {
+                        info!(
+                            "check blocky relish size {} at {} in {} for layer {}-{}",
+                            seg,
+                            prior_lsn,
+                            ancestor.timelineid,
+                            l.get_start_lsn(),
+                            l.get_end_lsn()
+                        );
+                        match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() {
+                            Some(size) => {
+                                let last_live_seg = SegmentTag::from_blknum(seg.rel, size - 1);
+                                info!(
+                                    "blocky rel size is {} last_live_seg.segno {} seg.segno {}",
+                                    size, last_live_seg.segno, seg.segno
+                                );
+                                if last_live_seg.segno >= seg.segno {
+                                    is_tombstone = true;
+                                }
+                            }
+                            _ => {
+                                info!("blocky rel doesn't exist");
+                            }
+                        }
+                    } else {
+                        info!(
+                            "check non-blocky relish existence {} at {} in {} for layer {}-{}",
+                            seg,
+                            prior_lsn,
+                            ancestor.timelineid,
+                            l.get_start_lsn(),
+                            l.get_end_lsn()
+                        );
+                        is_tombstone = ancestor.get_rel_exists(seg.rel, prior_lsn).unwrap_or(false);
+                    }
+                }
+
+                if is_tombstone {
+                    info!(
+                        "keeping {} {}-{} because this layer serves as a tombstone for an older layer",
+                        seg,
+                        l.get_start_lsn(),
+                        l.get_end_lsn()
+                    );
+
+                    if seg.rel.is_relation() {
+                        result.ondisk_relfiles_needed_as_tombstone += 1;
+                    } else {
+                        result.ondisk_nonrelfiles_needed_as_tombstone += 1;
+                    }
+                    continue 'outer;
+                }
+            }
+
             // We didn't find any reason to keep this file, so remove it.
             info!(
                 "garbage collecting {} {}-{} {}",
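Read outside the diff, the GC decision above reduces to four checks. The following is a minimal sketch of that logic; `LayerInfo`, `can_gc`, and the two closure parameters are hypothetical stand-ins for illustration, not the pageserver's real `Layer`/`LayerMap` API:

```rust
/// Hypothetical stand-in for a historic layer; not the pageserver's `Layer` trait.
struct LayerInfo {
    start_lsn: u64, // inclusive
    end_lsn: u64,   // exclusive
    dropped: bool,  // true if this layer records the segment's drop
}

/// Returns true if the layer may be garbage collected, mirroring conditions 1-4
/// above. `newer_layer_exists` and `live_layer_at` stand in for the LayerMap
/// and ancestor-timeline lookups in the patch.
fn can_gc(
    l: &LayerInfo,
    cutoff: u64,
    retain_lsns: &[u64],
    newer_layer_exists: impl Fn(u64) -> bool,
    live_layer_at: impl Fn(u64) -> bool,
) -> bool {
    // 1. Keep anything newer than the cutoff LSN.
    if l.end_lsn > cutoff {
        return false;
    }
    // 2. Keep layers needed by a child branch (start inclusive, end exclusive).
    if retain_lsns.iter().any(|&lsn| l.start_lsn <= lsn && lsn < l.end_lsn) {
        return false;
    }
    // 3. For live segments, only remove a layer once a newer one exists on disk.
    if !l.dropped && !newer_layer_exists(l.end_lsn) {
        return false;
    }
    // 4. Keep a dropped layer while anything is still live just before it:
    //    removing the tombstone would make the older layer visible again.
    if l.dropped && live_layer_at(l.start_lsn - 1) {
        return false;
    }
    true
}

fn main() {
    let tombstone = LayerInfo { start_lsn: 200, end_lsn: 300, dropped: true };
    // An earlier layer is still live: the tombstone must be kept.
    assert!(!can_gc(&tombstone, 1000, &[], |_| false, |_| true));
    // No earlier layer remains: now the tombstone itself can go.
    assert!(can_gc(&tombstone, 1000, &[], |_| false, |_| false));
}
```

Condition 4 is the one this patch introduces: it is the only case where a dropped layer must survive GC even though it is older than the cutoff.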
diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs
index bd38e9f57a..fc3014ec3a 100644
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -174,6 +174,20 @@ impl LayerMap {
         }
     }
 
+    /// Is there any layer for the given segment that is alive at the LSN?
+    ///
+    /// This is a public wrapper for the SegEntry function,
+    /// used by garbage collection to determine whether a live layer
+    /// exists at the LSN. If so, we shouldn't delete a newer dropped layer,
+    /// to avoid incorrectly making the older one visible again.
+    pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> bool {
+        if let Some(segentry) = self.segs.get(&seg) {
+            segentry.exists_at_lsn(lsn).unwrap_or(false)
+        } else {
+            false
+        }
+    }
+
     /// Return the oldest in-memory layer, along with its generation number.
     pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
         self.open_layers
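To make the wrapper's purpose concrete, here is a sketch of such an existence check over a per-segment history. `SegHistory` and its `BTreeMap` layout are assumptions made for illustration; the real `SegEntry` tracks open and historic layers separately:

```rust
use std::collections::BTreeMap;

/// Hypothetical per-segment history, a stand-in for the real SegEntry:
/// layers keyed by start LSN, each storing (end_lsn, dropped).
struct SegHistory {
    layers: BTreeMap<u64, (u64, bool)>,
}

impl SegHistory {
    /// Is the segment alive at `lsn`? Its state is determined by the most
    /// recent layer starting at or before `lsn`; a drop record ends its life.
    fn exists_at_lsn(&self, lsn: u64) -> bool {
        self.layers
            .range(..=lsn)
            .next_back()
            .map(|(_, &(_end, dropped))| !dropped)
            .unwrap_or(false)
    }
}

fn main() {
    let mut layers = BTreeMap::new();
    layers.insert(100, (200, false)); // live layer covering [100, 200)
    layers.insert(200, (201, true));  // drop record: a tombstone at 200
    let seg = SegHistory { layers };

    // This is exactly the question GC asks: is anything alive just before
    // the tombstone's start LSN? If so, the tombstone must be preserved.
    assert!(seg.exists_at_lsn(199));
    assert!(!seg.exists_at_lsn(200));
}
```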
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index a8ea76dda3..39562e7c33 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -654,12 +654,14 @@ impl postgres_backend::Handler for PageServerHandler {
                     RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
                     RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
                     RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
+                    RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
                     RowDescriptor::int8_col(b"layer_relfiles_removed"),
                     RowDescriptor::int8_col(b"layer_relfiles_dropped"),
                     RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
                     RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
                     RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
                     RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
+                    RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
                     RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
                     RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
                     RowDescriptor::int8_col(b"elapsed"),
@@ -679,6 +681,12 @@ impl postgres_backend::Handler for PageServerHandler {
                             .as_bytes(),
                     ),
                     Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
+                    Some(
+                        result
+                            .ondisk_relfiles_needed_as_tombstone
+                            .to_string()
+                            .as_bytes(),
+                    ),
                     Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
                     Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
                     Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
@@ -695,6 +703,12 @@ impl postgres_backend::Handler for PageServerHandler {
                             .as_bytes(),
                     ),
                     Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
+                    Some(
+                        result
+                            .ondisk_nonrelfiles_needed_as_tombstone
+                            .to_string()
+                            .as_bytes(),
+                    ),
                     Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
                     Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
                     Some(result.elapsed.as_millis().to_string().as_bytes()),
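Note that the two hunks above are coupled purely by position: the `RowDescriptor` list defines the column order, and the `Some(...)` cells are matched to it by index, so the new tombstone column must be inserted at the same offset in both places. A sketch of a table-driven alternative that keeps each name next to its value follows; `GcCounters` and `gc_result_columns` are hypothetical and not part of the `postgres_backend` API:

```rust
/// Hypothetical reduced counter set; the real GcResult has many more fields.
struct GcCounters {
    relfiles_not_updated: u64,
    relfiles_needed_as_tombstone: u64,
    relfiles_removed: u64,
}

/// Pair each column name with its value in one place, so the descriptor
/// row and the data row cannot drift apart when a field is added.
fn gc_result_columns(r: &GcCounters) -> Vec<(&'static str, u64)> {
    vec![
        ("layer_relfiles_not_updated", r.relfiles_not_updated),
        ("layer_relfiles_needed_as_tombstone", r.relfiles_needed_as_tombstone),
        ("layer_relfiles_removed", r.relfiles_removed),
    ]
}

fn main() {
    let r = GcCounters {
        relfiles_not_updated: 1,
        relfiles_needed_as_tombstone: 3,
        relfiles_removed: 7,
    };
    for (name, value) in gc_result_columns(&r) {
        println!("{name} = {value}");
    }
}
```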
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index fac9fc3786..1d3c4b4ed8 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -48,6 +48,7 @@ pub struct GcResult {
     pub ondisk_relfiles_needed_by_cutoff: u64,
     pub ondisk_relfiles_needed_by_branches: u64,
     pub ondisk_relfiles_not_updated: u64,
+    pub ondisk_relfiles_needed_as_tombstone: u64,
     pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
     pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped
 
@@ -55,6 +56,7 @@ pub struct GcResult {
     pub ondisk_nonrelfiles_needed_by_cutoff: u64,
     pub ondisk_nonrelfiles_needed_by_branches: u64,
     pub ondisk_nonrelfiles_not_updated: u64,
+    pub ondisk_nonrelfiles_needed_as_tombstone: u64,
     pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
     pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped
 
@@ -67,6 +69,7 @@ impl AddAssign for GcResult {
         self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
         self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
         self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
+        self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
         self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
         self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
 
@@ -74,6 +77,7 @@ impl AddAssign for GcResult {
         self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
         self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
         self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
+        self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
         self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
         self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
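The `AddAssign` impl is what lets repository-level GC fold per-timeline results into a single total with `+=`, which is why every new field must also be added there. A minimal sketch of the pattern on a reduced, hypothetical two-field struct:

```rust
use std::ops::AddAssign;

/// Hypothetical two-field reduction of GcResult.
#[derive(Default)]
struct MiniGcResult {
    ondisk_relfiles_removed: u64,
    ondisk_relfiles_needed_as_tombstone: u64,
}

impl AddAssign for MiniGcResult {
    fn add_assign(&mut self, other: Self) {
        self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
        self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
    }
}

fn main() {
    // Accumulate per-timeline results into one repository-wide total.
    let mut total = MiniGcResult::default();
    let per_timeline = vec![
        MiniGcResult { ondisk_relfiles_removed: 2, ondisk_relfiles_needed_as_tombstone: 1 },
        MiniGcResult { ondisk_relfiles_removed: 5, ondisk_relfiles_needed_as_tombstone: 3 },
    ];
    for r in per_timeline {
        total += r;
    }
    assert_eq!(total.ondisk_relfiles_removed, 7);
    assert_eq!(total.ondisk_relfiles_needed_as_tombstone, 4);
}
```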
diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py
index 957194d7d5..690d999b6f 100644
--- a/test_runner/batch_others/test_snapfiles_gc.py
+++ b/test_runner/batch_others/test_snapfiles_gc.py
@@ -6,8 +6,8 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 
 def print_gc_result(row):
     print("GC duration {elapsed} ms".format_map(row));
-    print("  REL    total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
-    print("  NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
+    print("  REL    total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
+    print("  NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
 
 #
@@ -113,12 +113,19 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
             row = pscur.fetchone()
             print_gc_result(row);
 
+            # We still cannot remove the latest layers
+            # because they serve as tombstones for earlier layers.
+            assert row['layer_relfiles_dropped'] == 0
             # Each relation fork is counted separately, hence 3.
-            assert row['layer_relfiles_dropped'] == 3
+            assert row['layer_relfiles_needed_as_tombstone'] == 3
 
             # The catalog updates also create new layer files of the catalogs, which
             # are counted as 'removed'
             assert row['layer_relfiles_removed'] > 0
 
+            # TODO: Change the test to check actual GC of dropped layers.
+            # Each relation fork is counted separately, hence 3.
+            #assert row['layer_relfiles_dropped'] == 3
+
             # TODO: perhaps we should count catalog and user relations separately,
             # to make this kind of testing more robust