mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-03 11:32:56 +00:00
Refactor keyspace code
Have separate classes for the KeySpace, a partitioning of the KeySpace (KeyPartitioning), and a builder object used to construct the KeySpace. Previously, KeyPartitioning did all those things, and it was a bit confusing.
This commit is contained in:
@@ -1,30 +1,101 @@
|
||||
use crate::repository::{key_range_size, singleton_range, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::repository::{key_range_size, singleton_range, Key};
|
||||
|
||||
use postgres_ffi::pg_constants;
|
||||
|
||||
// Target file size, when creating image and delta layers
pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB
|
||||
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
///
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct KeyPartitioning {
|
||||
accum: Option<Range<Key>>,
|
||||
|
||||
pub struct KeySpace {
|
||||
// Contiguous ranges of keys that belong to the key space. In key order, and
|
||||
// with no overlap.
|
||||
ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
impl KeySpace {
|
||||
///
|
||||
/// Partition a key space into roughly chunks of roughly 'target_size' bytes in
|
||||
/// each patition.
|
||||
///
|
||||
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
|
||||
// Assume that each value is 8k in size.
|
||||
let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize;
|
||||
|
||||
let mut partitions = Vec::new();
|
||||
let mut current_part = Vec::new();
|
||||
let mut current_part_size: usize = 0;
|
||||
for range in &self.ranges {
|
||||
// If appending the next contiguous range in the keyspace to the current
|
||||
// partition would cause it to be too large, start a new partition.
|
||||
let this_size = key_range_size(range) as usize;
|
||||
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
|
||||
partitions.push(current_part);
|
||||
current_part = Vec::new();
|
||||
current_part_size = 0;
|
||||
}
|
||||
|
||||
// If the next range is larger than 'target_size', split it into
|
||||
// 'target_size' chunks.
|
||||
let mut remain_size = this_size;
|
||||
let mut start = range.start;
|
||||
while remain_size > target_nblocks {
|
||||
let next = start.add(target_nblocks as u32);
|
||||
partitions.push(vec![start..next]);
|
||||
start = next;
|
||||
remain_size -= target_nblocks
|
||||
}
|
||||
current_part.push(start..range.end);
|
||||
current_part_size += remain_size;
|
||||
}
|
||||
|
||||
// add last partition that wasn't full yet.
|
||||
if !current_part.is_empty() {
|
||||
partitions.push(current_part);
|
||||
}
|
||||
|
||||
KeyPartitioning { partitions }
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Represents a partitioning of the key space.
|
||||
///
|
||||
/// The only kind of partitioning we do is to partition the key space into
|
||||
/// partitions that are roughly equal in physical size (see KeySpace::partition).
|
||||
/// But this data structure could represent any partitioning.
|
||||
///
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeyPartitioning {
|
||||
pub partitions: Vec<Vec<Range<Key>>>,
|
||||
}
|
||||
|
||||
impl KeyPartitioning {
|
||||
pub fn new() -> Self {
|
||||
KeyPartitioning {
|
||||
partitions: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A helper object, to collect a set of keys and key ranges into a KeySpace
|
||||
/// object. This takes care of merging adjacent keys and key ranges into
|
||||
/// contiguous ranges.
|
||||
///
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeySpaceAccum {
|
||||
accum: Option<Range<Key>>,
|
||||
|
||||
ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
impl KeySpaceAccum {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
accum: None,
|
||||
ranges: Vec::new(),
|
||||
partitions: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,44 +118,12 @@ impl KeyPartitioning {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn repartition(&mut self, target_size: u64) {
|
||||
let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize;
|
||||
pub fn to_keyspace(mut self) -> KeySpace {
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
|
||||
self.partitions = Vec::new();
|
||||
|
||||
let mut current_part = Vec::new();
|
||||
let mut current_part_size: usize = 0;
|
||||
for range in &self.ranges {
|
||||
let this_size = key_range_size(range) as usize;
|
||||
|
||||
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
|
||||
self.partitions.push(current_part);
|
||||
current_part = Vec::new();
|
||||
current_part_size = 0;
|
||||
}
|
||||
|
||||
let mut remain_size = this_size;
|
||||
let mut start = range.start;
|
||||
while remain_size > target_nblocks {
|
||||
let next = start.add(target_nblocks as u32);
|
||||
self.partitions.push(vec![start..next]);
|
||||
start = next;
|
||||
remain_size -= target_nblocks
|
||||
}
|
||||
current_part.push(start..range.end);
|
||||
current_part_size += remain_size;
|
||||
}
|
||||
if !current_part.is_empty() {
|
||||
self.partitions.push(current_part);
|
||||
KeySpace {
|
||||
ranges: self.ranges,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for KeyPartitioning {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1910,6 +1910,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::repo_harness::*;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
@@ -2009,7 +2010,7 @@ mod tests {
|
||||
|
||||
let mut lsn = Lsn(0x10);
|
||||
|
||||
let mut parts = KeyPartitioning::new();
|
||||
let mut keyspace = KeySpaceAccum::new();
|
||||
|
||||
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
|
||||
let mut blknum = 0;
|
||||
@@ -2025,14 +2026,17 @@ mod tests {
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
drop(writer);
|
||||
|
||||
parts.add_key(test_key);
|
||||
keyspace.add_key(test_key);
|
||||
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
parts.repartition(TEST_FILE_SIZE as u64);
|
||||
let parts = keyspace
|
||||
.clone()
|
||||
.to_keyspace()
|
||||
.partition(TEST_FILE_SIZE as u64);
|
||||
tline.hint_partitioning(parts.clone(), lsn)?;
|
||||
|
||||
tline.update_gc_info(Vec::new(), cutoff);
|
||||
@@ -2053,7 +2057,7 @@ mod tests {
|
||||
|
||||
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
|
||||
|
||||
let mut parts = KeyPartitioning::new();
|
||||
let mut keyspace = KeySpaceAccum::new();
|
||||
|
||||
// Track when each page was last modified. Used to assert that
|
||||
// a read sees the latest page version.
|
||||
@@ -2074,10 +2078,10 @@ mod tests {
|
||||
updated[blknum] = lsn;
|
||||
drop(writer);
|
||||
|
||||
parts.add_key(test_key);
|
||||
keyspace.add_key(test_key);
|
||||
}
|
||||
|
||||
parts.repartition(TEST_FILE_SIZE as u64);
|
||||
let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64);
|
||||
tline.hint_partitioning(parts, lsn)?;
|
||||
|
||||
for _ in 0..50 {
|
||||
@@ -2127,7 +2131,7 @@ mod tests {
|
||||
|
||||
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
|
||||
|
||||
let mut parts = KeyPartitioning::new();
|
||||
let mut keyspace = KeySpaceAccum::new();
|
||||
|
||||
// Track when each page was last modified. Used to assert that
|
||||
// a read sees the latest page version.
|
||||
@@ -2148,10 +2152,10 @@ mod tests {
|
||||
updated[blknum] = lsn;
|
||||
drop(writer);
|
||||
|
||||
parts.add_key(test_key);
|
||||
keyspace.add_key(test_key);
|
||||
}
|
||||
|
||||
parts.repartition(TEST_FILE_SIZE as u64);
|
||||
let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64);
|
||||
tline.hint_partitioning(parts, lsn)?;
|
||||
|
||||
let mut tline_id = TIMELINE_ID;
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
//! Clarify that)
|
||||
//!
|
||||
|
||||
use crate::keyspace::{KeyPartitioning, TARGET_FILE_SIZE_BYTES};
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES};
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
@@ -336,9 +336,9 @@ impl<R: Repository> DatadirTimeline<R> {
|
||||
Ok(total_size * pg_constants::BLCKSZ as usize)
|
||||
}
|
||||
|
||||
fn collect_keyspace(&self, lsn: Lsn) -> Result<KeyPartitioning> {
|
||||
fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
let mut result = KeyPartitioning::new();
|
||||
let mut result = KeySpaceAccum::new();
|
||||
|
||||
// Add dbdir
|
||||
result.add_key(DBDIR_KEY);
|
||||
@@ -404,7 +404,7 @@ impl<R: Repository> DatadirTimeline<R> {
|
||||
result.add_key(CONTROLFILE_KEY);
|
||||
result.add_key(CHECKPOINT_KEY);
|
||||
|
||||
Ok(result)
|
||||
Ok(result.to_keyspace())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -801,8 +801,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
|
||||
if last_partitioning == Lsn(0)
|
||||
|| self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8
|
||||
{
|
||||
let mut partitioning = self.tline.collect_keyspace(self.lsn)?;
|
||||
partitioning.repartition(TARGET_FILE_SIZE_BYTES);
|
||||
let keyspace = self.tline.collect_keyspace(self.lsn)?;
|
||||
let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES);
|
||||
self.tline.tline.hint_partitioning(partitioning, self.lsn)?;
|
||||
self.tline.last_partitioning.store(self.lsn);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user