Merge remote-tracking branch 'origin/main' into problame/infallible-timeline-activate/2-pushup-tenant-and-timeline-activation

This commit is contained in:
Christian Schwarz
2023-05-24 11:31:00 +02:00
21 changed files with 1315 additions and 161 deletions

View File

@@ -797,7 +797,8 @@ impl PageServerConf {
)?);
}
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
t_conf.max_lsn_wal_lag =
Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
}
if let Some(trace_read_requests) = item.get("trace_read_requests") {
t_conf.trace_read_requests =

View File

@@ -5,7 +5,7 @@ use std::ops::Range;
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct KeySpace {
/// Contiguous ranges of keys that belong to the key space. In key order,
/// and with no overlap.
@@ -61,6 +61,18 @@ impl KeySpace {
KeyPartitioning { parts }
}
///
/// Check if key space contains overlapping range
///
pub fn overlaps(&self, range: &Range<Key>) -> bool {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => false,
Err(0) => false,
Ok(index) => self.ranges[index - 1].end > range.start,
Err(index) => self.ranges[index - 1].end > range.start,
}
}
}
///
@@ -129,3 +141,226 @@ impl KeySpaceAccum {
}
}
}
///
/// A helper object, to collect a set of keys and key ranges into a KeySpace
/// object. Key ranges may be inserted in any order and can overlap.
///
#[derive(Clone, Debug, Default)]
pub struct KeySpaceRandomAccum {
ranges: Vec<Range<Key>>,
}
impl KeySpaceRandomAccum {
pub fn new() -> Self {
Self { ranges: Vec::new() }
}
pub fn add_key(&mut self, key: Key) {
self.add_range(singleton_range(key))
}
pub fn add_range(&mut self, range: Range<Key>) {
self.ranges.push(range);
}
pub fn to_keyspace(mut self) -> KeySpace {
let mut ranges = Vec::new();
if !self.ranges.is_empty() {
self.ranges.sort_by_key(|r| r.start);
let mut start = self.ranges.first().unwrap().start;
let mut end = self.ranges.first().unwrap().end;
for r in self.ranges {
assert!(r.start >= start);
if r.start > end {
ranges.push(start..end);
start = r.start;
end = r.end;
} else if r.end > end {
end = r.end;
}
}
ranges.push(start..end);
}
KeySpace { ranges }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fmt::Write;
// Helper function to create a key range.
//
// Make the tests below less verbose.
fn kr(irange: Range<i128>) -> Range<Key> {
Key::from_i128(irange.start)..Key::from_i128(irange.end)
}
#[allow(dead_code)]
fn dump_keyspace(ks: &KeySpace) {
for r in ks.ranges.iter() {
println!(" {}..{}", r.start.to_i128(), r.end.to_i128());
}
}
fn assert_ks_eq(actual: &KeySpace, expected: Vec<Range<Key>>) {
if actual.ranges != expected {
let mut msg = String::new();
writeln!(msg, "expected:").unwrap();
for r in &expected {
writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
}
writeln!(msg, "got:").unwrap();
for r in &actual.ranges {
writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
}
panic!("{}", msg);
}
}
#[test]
fn keyspace_add_range() {
// two separate ranges
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..10));
ks.add_range(kr(20..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);
// two separate ranges, added in reverse order
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(20..30));
ks.add_range(kr(0..10));
// add range that is adjacent to the end of an existing range
//
// #####
// #####
ks.add_range(kr(0..10));
ks.add_range(kr(10..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
// add range that is adjacent to the start of an existing range
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(10..30));
ks.add_range(kr(0..10));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
// add range that overlaps with the end of an existing range
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..10));
ks.add_range(kr(5..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
// add range that overlaps with the start of an existing range
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(5..30));
ks.add_range(kr(0..10));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
// add range that is fully covered by an existing range
//
// #########
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..30));
ks.add_range(kr(10..20));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
// add range that extends an existing range from both ends
//
// #####
// #########
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(10..20));
ks.add_range(kr(0..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
// add a range that overlaps with two existing ranges, joining them
//
// ##### #####
// #######
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..10));
ks.add_range(kr(20..30));
ks.add_range(kr(5..25));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
}
#[test]
fn keyspace_overlaps() {
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(10..20));
ks.add_range(kr(30..40));
let ks = ks.to_keyspace();
// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(0..5)));
// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(5..9)));
// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(5..10)));
// ##### #####
// xxxx
assert!(ks.overlaps(&kr(5..11)));
// ##### #####
// xxxx
assert!(ks.overlaps(&kr(10..15)));
// ##### #####
// xxxx
assert!(ks.overlaps(&kr(15..20)));
// ##### #####
// xxxx
assert!(ks.overlaps(&kr(15..25)));
// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(22..28)));
// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(25..30)));
// ##### #####
// xxxx
assert!(ks.overlaps(&kr(35..35)));
// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(40..45)));
// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(45..50)));
// ##### #####
// xxxxxxxxxxx
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
}
}

View File

@@ -22,8 +22,7 @@ use tracing::*;
use utils::id::TenantTimelineId;
use std::cmp::{max, min, Ordering};
use std::collections::BinaryHeap;
use std::collections::HashMap;
use std::collections::{BinaryHeap, HashMap};
use std::fs;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
@@ -48,7 +47,7 @@ use crate::tenant::{
};
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -123,6 +122,17 @@ pub struct Timeline {
pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,
/// Set of key ranges which should be covered by image layers to
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
/// It is used by compaction task when it checks if new image layer should be created.
/// Newly created image layer doesn't help to remove the delta layer, until the
/// newly created image layer falls off the PITR horizon. So on next GC cycle,
/// gc_timeline may still want the new image layer to be created. To avoid redundant
/// image layers creation we should check if image layer exists but beyond PITR horizon.
/// This is why we need remember GC cutoff LSN.
///
wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
last_freeze_ts: RwLock<Instant>,
@@ -1354,6 +1364,7 @@ impl Timeline {
tenant_id,
pg_version,
layers: RwLock::new(LayerMap::default()),
wanted_image_layers: Mutex::new(None),
walredo_mgr,
walreceiver,
@@ -2904,6 +2915,30 @@ impl Timeline {
let layers = self.layers.read().unwrap();
let mut max_deltas = 0;
{
let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
let img_range =
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
if wanted.overlaps(&img_range) {
//
// gc_timeline only pays attention to image layers that are older than the GC cutoff,
// but create_image_layers creates image layers at last-record-lsn.
// So it's possible that gc_timeline wants a new image layer to be created for a key range,
// but the range is already covered by image layers at more recent LSNs. Before we
// create a new image layer, check if the range is already covered at more recent LSNs.
if !layers
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
{
debug!(
"Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
img_range.start, img_range.end, cutoff_lsn, lsn
);
return Ok(true);
}
}
}
}
for part_range in &partition.ranges {
let image_coverage = layers.image_coverage(part_range, lsn)?;
@@ -3023,6 +3058,12 @@ impl Timeline {
image_layers.push(image_layer);
}
}
// All layers that the GC wanted us to create have now been created.
//
// It's possible that another GC cycle happened while we were compacting, and added
// something new to wanted_image_layers, and we now clear that before processing it.
// That's OK, because the next GC iteration will put it back in.
*self.wanted_image_layers.lock().unwrap() = None;
// Sync the new layer to disk before adding it to the layer map, to make sure
// we don't garbage collect something based on the new layer, before it has
@@ -3720,6 +3761,7 @@ impl Timeline {
}
let mut layers_to_remove = Vec::new();
let mut wanted_image_layers = KeySpaceRandomAccum::default();
// Scan all layers in the timeline (remote or on-disk).
//
@@ -3803,6 +3845,15 @@ impl Timeline {
"keeping {} because it is the latest layer",
l.filename().file_name()
);
// Collect delta key ranges that need image layers to allow garbage
// collecting the layers.
// It is not so obvious whether we need to propagate information only about
// delta layers. Image layers can form "stairs" preventing old image from been deleted.
// But image layers are in any case less sparse than delta layers. Also we need some
// protection from replacing recent image layers with new one after each GC iteration.
if l.is_incremental() && !LayerMap::is_l0(&*l) {
wanted_image_layers.add_range(l.get_key_range());
}
result.layers_not_updated += 1;
continue 'outer;
}
@@ -3815,6 +3866,10 @@ impl Timeline {
);
layers_to_remove.push(Arc::clone(&l));
}
self.wanted_image_layers
.lock()
.unwrap()
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
let mut updates = layers.batch_update();
if !layers_to_remove.is_empty() {