diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs
index 64024a2d8d..20e6df9c7b 100644
--- a/pageserver/src/keyspace.rs
+++ b/pageserver/src/keyspace.rs
@@ -5,7 +5,7 @@ use std::ops::Range;
 ///
 /// Represents a set of Keys, in a compact form.
 ///
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct KeySpace {
     /// Contiguous ranges of keys that belong to the key space. In key order,
     /// and with no overlap.
@@ -61,6 +61,18 @@ impl KeySpace {
         KeyPartitioning { parts }
     }
+
+    ///
+    /// Check if the key space overlaps the given key range.
+    ///
+    pub fn overlaps(&self, range: &Range<Key>) -> bool {
+        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
+            Ok(0) => false,
+            Err(0) => false,
+            Ok(index) => self.ranges[index - 1].end > range.start,
+            Err(index) => self.ranges[index - 1].end > range.start,
+        }
+    }
 }
 
 ///
@@ -129,3 +141,226 @@ impl KeySpaceAccum {
         }
     }
 }
+
+///
+/// A helper object for collecting a set of keys and key ranges into a KeySpace
+/// object. Key ranges may be inserted in any order and can overlap.
+///
+#[derive(Clone, Debug, Default)]
+pub struct KeySpaceRandomAccum {
+    ranges: Vec<Range<Key>>,
+}
+
+impl KeySpaceRandomAccum {
+    pub fn new() -> Self {
+        Self { ranges: Vec::new() }
+    }
+
+    pub fn add_key(&mut self, key: Key) {
+        self.add_range(singleton_range(key))
+    }
+
+    pub fn add_range(&mut self, range: Range<Key>) {
+        self.ranges.push(range);
+    }
+
+    pub fn to_keyspace(mut self) -> KeySpace {
+        let mut ranges = Vec::new();
+        if !self.ranges.is_empty() {
+            self.ranges.sort_by_key(|r| r.start);
+            let mut start = self.ranges.first().unwrap().start;
+            let mut end = self.ranges.first().unwrap().end;
+            for r in self.ranges {
+                assert!(r.start >= start);
+                if r.start > end {
+                    ranges.push(start..end);
+                    start = r.start;
+                    end = r.end;
+                } else if r.end > end {
+                    end = r.end;
+                }
+            }
+            ranges.push(start..end);
+        }
+        KeySpace { ranges }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fmt::Write;
+
+    // Helper function to create a key range.
+    //
+    // Make the tests below less verbose.
+    fn kr(irange: Range<i128>) -> Range<Key> {
+        Key::from_i128(irange.start)..Key::from_i128(irange.end)
+    }
+
+    #[allow(dead_code)]
+    fn dump_keyspace(ks: &KeySpace) {
+        for r in ks.ranges.iter() {
+            println!("  {}..{}", r.start.to_i128(), r.end.to_i128());
+        }
+    }
+
+    fn assert_ks_eq(actual: &KeySpace, expected: Vec<Range<Key>>) {
+        if actual.ranges != expected {
+            let mut msg = String::new();
+
+            writeln!(msg, "expected:").unwrap();
+            for r in &expected {
+                writeln!(msg, "  {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
+            }
+            writeln!(msg, "got:").unwrap();
+            for r in &actual.ranges {
+                writeln!(msg, "  {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
+            }
+            panic!("{}", msg);
+        }
+    }
+
+    #[test]
+    fn keyspace_add_range() {
+        // two separate ranges
+        //
+        // #####
+        //          #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(20..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);
+
+        // two separate ranges, added in reverse order
+        //
+        //          #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(20..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);
+
+        // add range that is adjacent to the end of an existing range
+        //
+        // #####
+        //      #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(10..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that is adjacent to the start of an existing range
+        //
+        //      #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that overlaps with the end of an existing range
+        //
+        // #####
+        //    #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(5..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that overlaps with the start of an existing range
+        //
+        //    #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(5..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that is fully covered by an existing range
+        //
+        // #########
+        //   #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..30));
+        ks.add_range(kr(10..20));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that extends an existing range from both ends
+        //
+        //   #####
+        // #########
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..20));
+        ks.add_range(kr(0..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add a range that overlaps with two existing ranges, joining them
+        //
+        // #####   #####
+        //    #######
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(20..30));
+        ks.add_range(kr(5..25));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+    }
+
+    #[test]
+    fn keyspace_overlaps() {
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..20));
+        ks.add_range(kr(30..40));
+        let ks = ks.to_keyspace();
+
+        //      #####      #####
+        // xxx
+        assert!(!ks.overlaps(&kr(0..5)));
+
+        //      #####      #####
+        //   xx
+        assert!(!ks.overlaps(&kr(5..9)));
+
+        //      #####      #####
+        //   xxx
+        assert!(!ks.overlaps(&kr(5..10)));
+
+        //      #####      #####
+        //   xxxx
+        assert!(ks.overlaps(&kr(5..11)));
+
+        //      #####      #####
+        //      xxx
+        assert!(ks.overlaps(&kr(10..15)));
+
+        //      #####      #####
+        //        xxx
+        assert!(ks.overlaps(&kr(15..20)));
+
+        //      #####      #####
+        //        xxxxx
+        assert!(ks.overlaps(&kr(15..25)));
+
+        //      #####      #####
+        //            xxx
+        assert!(!ks.overlaps(&kr(22..28)));
+
+        //      #####      #####
+        //             xxx
+        assert!(!ks.overlaps(&kr(25..30)));
+
+        //      #####      #####
+        //                   x
+        assert!(ks.overlaps(&kr(35..35)));
+
+        //      #####      #####
+        //                     xxx
+        assert!(!ks.overlaps(&kr(40..45)));
+
+        //      #####      #####
+        //                        xxx
+        assert!(!ks.overlaps(&kr(45..50)));
+
+        //      #####      #####
+        // xxxxxxxxxxxxxxx
+        assert!(ks.overlaps(&kr(0..30)));
+    }
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c47f4444f5..3c951c1188 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,7 @@ use tracing::*;
 use utils::id::TenantTimelineId;
 
 use std::cmp::{max, min, Ordering};
-use std::collections::BinaryHeap;
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap};
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
@@ -48,7 +47,7 @@ use crate::tenant::{
 };
 
 use crate::config::PageServerConf;
-use crate::keyspace::{KeyPartitioning, KeySpace};
+use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
 use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -123,6 +122,17 @@ pub struct Timeline {
 
     pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,
 
+    /// Set of key ranges that should be covered by image layers to allow GC to
+    /// remove old layers, together with the GC cutoff LSN at which the set was
+    /// computed. It is used by the compaction task when it checks whether a new
+    /// image layer should be created. A newly created image layer doesn't help
+    /// to remove a delta layer until the image layer itself falls off the PITR
+    /// horizon, so on the next GC cycle gc_timeline may still want the same
+    /// image layer to be created. To avoid creating redundant image layers, we
+    /// check whether an image layer already exists beyond the PITR horizon;
+    /// that is why the GC cutoff LSN is remembered here as well.
+    wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
+
     last_freeze_at: AtomicLsn,
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock<Instant>,
@@ -1354,6 +1364,7 @@ impl Timeline {
             tenant_id,
             pg_version,
             layers: RwLock::new(LayerMap::default()),
+            wanted_image_layers: Mutex::new(None),
 
             walredo_mgr,
             walreceiver,
@@ -2904,6 +2915,30 @@ impl Timeline {
         let layers = self.layers.read().unwrap();
 
         let mut max_deltas = 0;
+        {
+            let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
+            if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
+                let img_range =
+                    partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
+                if wanted.overlaps(&img_range) {
+                    //
+                    // gc_timeline only pays attention to image layers that are older than the GC
+                    // cutoff, but create_image_layers creates image layers at last-record-lsn.
+                    // So it's possible that gc_timeline wants a new image layer to be created for
+                    // a key range, but the range is already covered by image layers at more recent
+                    // LSNs. Before we create a new image layer, check if the range is already
+                    // covered at more recent LSNs.
+                    if !layers
+                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
+                    {
+                        debug!(
+                            "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={}",
+                            img_range.start, img_range.end, cutoff_lsn, lsn
+                        );
+                        return Ok(true);
+                    }
+                }
+            }
+        }
 
         for part_range in &partition.ranges {
             let image_coverage = layers.image_coverage(part_range, lsn)?;
@@ -3023,6 +3058,12 @@ impl Timeline {
                     image_layers.push(image_layer);
                 }
             }
+        // All layers that the GC wanted us to create have now been created.
+        //
+        // It's possible that another GC cycle happened while we were compacting, and added
+        // something new to wanted_image_layers, and we now clear that before processing it.
+        // That's OK, because the next GC iteration will put it back in.
+        *self.wanted_image_layers.lock().unwrap() = None;
 
         // Sync the new layer to disk before adding it to the layer map, to make sure
        // we don't garbage collect something based on the new layer, before it has
@@ -3720,6 +3761,7 @@ impl Timeline {
         }
 
         let mut layers_to_remove = Vec::new();
+        let mut wanted_image_layers = KeySpaceRandomAccum::default();
 
         // Scan all layers in the timeline (remote or on-disk).
         //
@@ -3803,6 +3845,15 @@ impl Timeline {
                        "keeping {} because it is the latest layer",
                        l.filename().file_name()
                    );
+                    // Collect delta key ranges that need image layers to allow garbage
+                    // collecting the layers. It is not obvious whether we need to propagate
+                    // information only about delta layers: image layers can form "stairs" that
+                    // prevent an old image layer from being deleted, but image layers are in any
+                    // case less sparse than delta layers. We also need some protection against
+                    // replacing recent image layers with new ones after each GC iteration.
+                    if l.is_incremental() && !LayerMap::is_l0(&*l) {
+                        wanted_image_layers.add_range(l.get_key_range());
+                    }
                    result.layers_not_updated += 1;
                    continue 'outer;
                }
@@ -3815,6 +3866,10 @@ impl Timeline {
                );
                layers_to_remove.push(Arc::clone(&l));
            }
+        self.wanted_image_layers
+            .lock()
+            .unwrap()
+            .replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
 
         let mut updates = layers.batch_update();
         if !layers_to_remove.is_empty() {
diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py
new file mode 100644
index 0000000000..f93b560d8e
--- /dev/null
+++ b/test_runner/performance/test_gc_feedback.py
@@ -0,0 +1,76 @@
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+@pytest.mark.timeout(10000)
+def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    """
+    Test that GC is able to collect all old layers even if they form "stairs"
+    and there are fewer than three delta layers since the last image layer.
+
+    Information about which image layers are needed to collect old layers is
+    propagated by GC to the compaction task, which takes it into account when
+    deciding which new image layers need to be created.
+ """ + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_id, _ = env.neon_cli.create_tenant( + conf={ + # disable default GC and compaction + "gc_period": "1000 m", + "compaction_period": "0 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + # set PITR interval to be small, so we can do GC + "pitr_interval": "10 s", + # "compaction_threshold": "3", + # "image_creation_threshold": "2", + } + ) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + n_steps = 10 + n_update_iters = 100 + step_size = 10000 + with endpoint.cursor() as cur: + cur.execute("SET statement_timeout='1000s'") + cur.execute( + "CREATE TABLE t(step bigint, count bigint default 0, payload text default repeat(' ', 100)) with (fillfactor=50)" + ) + cur.execute("CREATE INDEX ON t(step)") + # In each step, we insert 'step_size' new rows, and update the newly inserted rows + # 'n_update_iters' times. This creates a lot of churn and generates lots of WAL at the end of the table, + # without modifying the earlier parts of the table. + for step in range(n_steps): + cur.execute(f"INSERT INTO t (step) SELECT {step} FROM generate_series(1, {step_size})") + for i in range(n_update_iters): + cur.execute(f"UPDATE t set count=count+1 where step = {step}") + cur.execute("vacuum t") + + # cur.execute("select pg_table_size('t')") + # logical_size = cur.fetchone()[0] + logical_size = client.timeline_detail(tenant_id, timeline_id)["current_logical_size"] + log.info(f"Logical storage size {logical_size}") + + client.timeline_checkpoint(tenant_id, timeline_id) + + # Do compaction and GC + client.timeline_gc(tenant_id, timeline_id, 0) + client.timeline_compact(tenant_id, timeline_id) + # One more iteration to check that no excessive image layers are generated + client.timeline_gc(tenant_id, timeline_id, 0) + client.timeline_compact(tenant_id, timeline_id) + + physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] + log.info(f"Physical storage size {physical_size}") + + MB = 1024 * 1024 + zenbenchmark.record("logical_size", logical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("physical_size", physical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record( + "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER + )