mirror of https://github.com/neondatabase/neon.git

Compare commits: page_cache...partial_im (9 commits; only the SHA1 column survived extraction)

| SHA1 |
|---|
| 10b90506a0 |
| 5ee4524caa |
| c5245a9e4f |
| 9f10195d7b |
| 51aa53ab90 |
| 2359106a9d |
| 885033ad42 |
| 487ec20085 |
| 898937d500 |
@@ -284,7 +284,7 @@ impl PostgresNode {
         conf.append("max_wal_senders", "10");
         // wal_log_hints is mandatory when running against pageserver (see gh issue#192)
         // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
-        conf.append("wal_log_hints", "on");
+        conf.append("wal_log_hints", "off");
         conf.append("max_replication_slots", "10");
         conf.append("hot_standby", "on");
         conf.append("shared_buffers", "1MB");
@@ -810,7 +810,9 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>
         .get_timeline(timeline_id)
         .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
         .map_err(ApiError::NotFound)?;
-    timeline.compact().map_err(ApiError::InternalServerError)?;
+    timeline
+        .reconstruct()
+        .map_err(ApiError::InternalServerError)?;
 
     json_response(StatusCode::OK, ())
 }
@@ -310,7 +310,7 @@ impl Tenant {
 
         for (timeline_id, timeline) in &timelines_to_compact {
             let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered();
-            timeline.compact()?;
+            timeline.reconstruct()?;
         }
 
         Ok(())
@@ -1741,7 +1741,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.compact()?;
+        tline.reconstruct()?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
@@ -1749,7 +1749,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.compact()?;
+        tline.reconstruct()?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
@@ -1757,7 +1757,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.compact()?;
+        tline.reconstruct()?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
@@ -1765,7 +1765,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.compact()?;
+        tline.reconstruct()?;
 
         assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
         assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
@@ -1813,7 +1813,7 @@ mod tests {
 
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
             tline.checkpoint(CheckpointConfig::Forced)?;
-            tline.compact()?;
+            tline.reconstruct()?;
             tline.gc()?;
         }
 
@@ -1883,7 +1883,7 @@ mod tests {
             let cutoff = tline.get_last_record_lsn();
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
             tline.checkpoint(CheckpointConfig::Forced)?;
-            tline.compact()?;
+            tline.reconstruct()?;
             tline.gc()?;
         }
 
@@ -1962,7 +1962,7 @@ mod tests {
             let cutoff = tline.get_last_record_lsn();
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
             tline.checkpoint(CheckpointConfig::Forced)?;
-            tline.compact()?;
+            tline.reconstruct()?;
             tline.gc()?;
         }
@@ -95,6 +95,9 @@ impl From<&DeltaLayer> for Summary {
 // Flag indicating that this version initializes the page
 const WILL_INIT: u64 = 1;
 
+// Flag indicating page image
+const IS_IMAGE: u64 = 2;
+
 ///
 /// Struct representing reference to BLOB in layers. Reference contains BLOB
 /// offset, and for WAL records it also contains `will_init` flag. The flag
@@ -109,15 +112,22 @@ impl BlobRef {
         (self.0 & WILL_INIT) != 0
     }
 
+    pub fn is_image(&self) -> bool {
+        (self.0 & IS_IMAGE) != 0
+    }
+
     pub fn pos(&self) -> u64 {
-        self.0 >> 1
+        self.0 >> 2
     }
 
-    pub fn new(pos: u64, will_init: bool) -> BlobRef {
-        let mut blob_ref = pos << 1;
+    pub fn new(pos: u64, will_init: bool, is_image: bool) -> BlobRef {
+        let mut blob_ref = pos << 2;
         if will_init {
             blob_ref |= WILL_INIT;
         }
+        if is_image {
+            blob_ref |= IS_IMAGE;
+        }
         BlobRef(blob_ref)
     }
 }
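To make the new encoding concrete, here is a self-contained round-trip check of the bit-packing scheme this hunk introduces: the low two bits of a `BlobRef` now carry `WILL_INIT` and `IS_IMAGE`, so the blob offset is shifted by 2 instead of 1. This is a sketch mirroring the patch above, not a test from the repository:

```rust
const WILL_INIT: u64 = 1;
const IS_IMAGE: u64 = 2;

struct BlobRef(u64);

impl BlobRef {
    fn new(pos: u64, will_init: bool, is_image: bool) -> BlobRef {
        let mut v = pos << 2; // low two bits are reserved for the flags
        if will_init {
            v |= WILL_INIT;
        }
        if is_image {
            v |= IS_IMAGE;
        }
        BlobRef(v)
    }
    fn pos(&self) -> u64 {
        self.0 >> 2
    }
    fn will_init(&self) -> bool {
        (self.0 & WILL_INIT) != 0
    }
    fn is_image(&self) -> bool {
        (self.0 & IS_IMAGE) != 0
    }
}

fn main() {
    // The offset survives the shift and the two flags are independent.
    let r = BlobRef::new(0x1234, true, true);
    assert_eq!(r.pos(), 0x1234);
    assert!(r.will_init() && r.is_image());
}
```

The cost of the extra flag is one bit of addressable offset: positions now top out at 2^62 - 1 rather than 2^63 - 1, still far beyond any realistic layer file size.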
@@ -314,13 +324,13 @@ impl Layer for DeltaLayer {
         }
     }
 
-    fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
+    fn key_iter<'a>(&'a self, skip_images: bool) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
         let inner = match self.load() {
             Ok(inner) => inner,
             Err(e) => panic!("Failed to load a delta layer: {e:?}"),
         };
 
-        match DeltaKeyIter::new(inner) {
+        match DeltaKeyIter::new(inner, skip_images) {
             Ok(iter) => Box::new(iter),
             Err(e) => panic!("Layer index is corrupted: {e:?}"),
         }
@@ -414,6 +424,30 @@ impl Layer for DeltaLayer {
 
         Ok(())
     }
 
+    fn contains(&self, key: &Key) -> Result<bool> {
+        // Open the file and lock the metadata in memory
+        let inner = self.load()?;
+
+        // Seek to the first index entry for this key (at LSN 0) and check
+        // whether an entry with the same key exists.
+        let file = inner.file.as_ref().unwrap();
+        let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            inner.index_start_blk,
+            inner.index_root_blk,
+            file,
+        );
+        let search_key = DeltaKey::from_key_lsn(key, Lsn(0));
+        let mut found = false;
+        reader.visit(
+            &search_key.0,
+            VisitDirection::Forwards,
+            |delta_key, _val| {
+                found = DeltaKey::extract_key_from_buf(delta_key) == *key;
+                false
+            },
+        )?;
+        Ok(found)
+    }
 }
 
 impl DeltaLayer {
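The new `contains` probes the on-disk B-tree at `(key, Lsn(0))` and lets the visit callback return `false`, so the traversal stops after the first entry at or past the search key. The same idea over a plain `std::collections::BTreeMap` looks like this (purely illustrative; `Key` and `Lsn` are stand-in type aliases here, not the pageserver's types):

```rust
use std::collections::BTreeMap;

type Key = u64; // stand-in for the pageserver's Key type
type Lsn = u64; // stand-in for utils::lsn::Lsn

// The delta index is ordered by (key, lsn). Seeking to (key, 0) and looking
// only at the first entry at or after it tells us whether any version of
// `key` exists - exactly what the visit callback above does by returning
// `false` after the first record.
fn contains(index: &BTreeMap<(Key, Lsn), u64>, key: Key) -> bool {
    index
        .range((key, 0)..)
        .next()
        .map_or(false, |(&(k, _lsn), _blob)| k == key)
}

fn main() {
    let mut index = BTreeMap::new();
    index.insert((10, 100), 0u64);
    index.insert((10, 200), 1);
    index.insert((42, 150), 2);
    assert!(contains(&index, 10));
    assert!(contains(&index, 42));
    assert!(!contains(&index, 11)); // first entry >= (11, 0) has key 42
}
```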
@@ -671,7 +705,13 @@ impl DeltaLayerWriter {
     /// The values must be appended in key, lsn order.
     ///
     pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
-        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
+        self.put_value_bytes(
+            key,
+            lsn,
+            &Value::ser(&val)?,
+            val.will_init(),
+            val.is_image(),
+        )
     }
 
     pub fn put_value_bytes(
@@ -680,12 +720,12 @@ impl DeltaLayerWriter {
         lsn: Lsn,
         val: &[u8],
         will_init: bool,
+        is_image: bool,
     ) -> Result<()> {
         assert!(self.lsn_range.start <= lsn);
 
         let off = self.blob_writer.write_blob(val)?;
 
-        let blob_ref = BlobRef::new(off, will_init);
+        let blob_ref = BlobRef::new(off, will_init, is_image);
 
         let delta_key = DeltaKey::from_key_lsn(&key, lsn);
         self.tree.append(&delta_key.0, blob_ref.0)?;
@@ -874,7 +914,7 @@ impl Iterator for DeltaKeyIter {
 }
 
 impl<'a> DeltaKeyIter {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>, skip_images: bool) -> Result<Self> {
         let file = inner.file.as_ref().unwrap();
         let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
             inner.index_start_blk,
@@ -883,29 +923,33 @@ impl<'a> DeltaKeyIter {
         );
 
         let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
+        let mut last_pos = 0u64;
+        let mut last_delta: Option<DeltaKey> = None;
         tree_reader.visit(
             &[0u8; DELTA_KEY_SIZE],
             VisitDirection::Forwards,
             |key, value| {
-                let delta_key = DeltaKey::from_slice(key);
-                let pos = BlobRef(value).pos();
-                if let Some(last) = all_keys.last_mut() {
-                    if last.0.key() == delta_key.key() {
-                        return true;
-                    } else {
-                        // subtract offset of new key BLOB and first blob of this key
-                        // to get total size of values associated with this key
-                        let first_pos = last.1;
-                        last.1 = pos - first_pos;
+                let blob_ref = BlobRef(value);
+                if !blob_ref.is_image() || !skip_images {
+                    let next_delta = DeltaKey::from_slice(key);
+                    let pos = blob_ref.pos();
+                    if let Some(prev_delta) = last_delta.take() {
+                        if prev_delta.key() == next_delta.key() {
+                            last_delta = Some(next_delta);
+                            return true;
+                        }
+                        all_keys.push((prev_delta, pos - last_pos));
                     }
+                    last_delta = Some(next_delta);
+                    last_pos = pos;
                 }
-                all_keys.push((delta_key, pos));
                 true
             },
         )?;
-        if let Some(last) = all_keys.last_mut() {
+        if let Some(prev_delta) = last_delta.take() {
             // Last key occupies all space till end of layer
-            last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
+            let file_size = std::fs::metadata(&file.file.path)?.len();
+            all_keys.push((prev_delta, file_size - last_pos));
         }
         let iter = DeltaKeyIter {
             all_keys,
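The rewritten `DeltaKeyIter::new` derives the total bytes occupied by each key's values from consecutive blob offsets: when the key changes, the previous key's size is the new key's first offset minus the previous key's first offset, and the final key extends to the end of the layer file. A compact sketch of just that accounting over plain data (the `key_sizes` helper and its types are hypothetical, but the arithmetic is the same):

```rust
// entries: (key, blob_offset) pairs in index order, offsets ascending.
// Returns (key, total_size_of_its_values).
fn key_sizes(entries: &[(u64, u64)], file_size: u64) -> Vec<(u64, u64)> {
    let mut out: Vec<(u64, u64)> = Vec::new();
    let mut last: Option<(u64, u64)> = None; // (key, its first offset)
    for &(key, pos) in entries {
        match last {
            // Same key again: its size is not known yet, keep scanning.
            Some((prev_key, _)) if prev_key == key => continue,
            // Key changed: previous key spans up to the new key's first blob.
            Some((prev_key, prev_pos)) => out.push((prev_key, pos - prev_pos)),
            None => {}
        }
        last = Some((key, pos));
    }
    if let Some((prev_key, prev_pos)) = last {
        // Last key occupies all space till the end of the layer file.
        out.push((prev_key, file_size - prev_pos));
    }
    out
}

fn main() {
    // Key 1's blobs span offsets [0, 300); key 2's span [300, 1000).
    let entries = [(1, 0), (1, 120), (2, 300), (2, 500)];
    assert_eq!(key_sizes(&entries, 1000), vec![(1, 300), (2, 700)]);
}
```

Note one consequence of `skip_images`: filtered-out image blobs still occupy file space between the offsets, so the reported sizes are an upper bound on the delta bytes alone.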
@@ -223,6 +223,10 @@ impl Layer for ImageLayer {
 
         Ok(())
     }
+
+    fn contains(&self, key: &Key) -> Result<bool> {
+        Ok(self.get_key_range().contains(key))
+    }
 }
 
 impl ImageLayer {
@@ -235,6 +235,11 @@ impl Layer for InMemoryLayer {
 
         Ok(())
     }
+
+    fn contains(&self, key: &Key) -> Result<bool> {
+        let inner = self.inner.read().unwrap();
+        Ok(inner.index.get(key).is_some())
+    }
 }
 
 impl InMemoryLayer {
@@ -358,8 +363,14 @@ impl InMemoryLayer {
             // Write all page versions
             for (lsn, pos) in vec_map.as_slice() {
                 cursor.read_blob_into_buf(*pos, &mut buf)?;
-                let will_init = Value::des(&buf)?.will_init();
-                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
+                let value = Value::des(&buf)?;
+                delta_layer_writer.put_value_bytes(
+                    key,
+                    *lsn,
+                    &buf,
+                    value.will_init(),
+                    value.is_image(),
+                )?;
             }
         }
@@ -15,19 +15,25 @@ use crate::repository::Key;
 use crate::tenant::inmemory_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::tenant::storage_layer::{range_eq, range_overlaps};
-use amplify_num::i256;
 use anyhow::Result;
-use num_traits::identities::{One, Zero};
-use num_traits::{Bounded, Num, Signed};
-use rstar::{RTree, RTreeObject, AABB};
-use std::cmp::Ordering;
-use std::collections::VecDeque;
+use std::collections::{BTreeMap, VecDeque};
 use std::ops::Range;
-use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
 use std::sync::Arc;
 use tracing::*;
 use utils::lsn::Lsn;
 
+#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq)]
+struct BTreeKey {
+    lsn: Lsn,
+    seq: usize,
+}
+
+impl BTreeKey {
+    fn new(lsn: Lsn) -> BTreeKey {
+        BTreeKey { lsn, seq: 0 }
+    }
+}
+
 ///
 /// LayerMap tracks what layers exist on a timeline.
 ///
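The derived `Ord` on `BTreeKey` compares `lsn` first and `seq` second, so layers are ordered by start LSN with the insertion sequence number breaking ties between layers that start at the same LSN. Because `BTreeKey::new` sets `seq` to 0, a range query `BTreeKey::new(lo)..BTreeKey::new(hi)` picks up every entry with `lo <= lsn < hi`, and walking it with `next_back` visits layers newest-first, which the rewritten search code below relies on. A minimal demonstration, with `Lsn` simplified to a newtype over `u64`:

```rust
use std::collections::BTreeMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64); // simplified stand-in for utils::lsn::Lsn

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct BTreeKey {
    lsn: Lsn,   // compared first by the derived Ord
    seq: usize, // tie-breaker: later insertions sort after earlier ones
}

impl BTreeKey {
    fn new(lsn: Lsn) -> BTreeKey {
        BTreeKey { lsn, seq: 0 }
    }
}

fn main() {
    let mut map: BTreeMap<BTreeKey, &str> = BTreeMap::new();
    map.insert(BTreeKey { lsn: Lsn(10), seq: 0 }, "delta@10");
    map.insert(BTreeKey { lsn: Lsn(10), seq: 1 }, "image@10"); // same LSN, inserted later
    map.insert(BTreeKey { lsn: Lsn(30), seq: 2 }, "delta@30");
    map.insert(BTreeKey { lsn: Lsn(50), seq: 3 }, "delta@50");

    // All layers starting below LSN 40, newest first:
    let mut iter = map.range(BTreeKey::new(Lsn(0))..BTreeKey::new(Lsn(40)));
    let newest_first: Vec<&str> =
        std::iter::from_fn(|| iter.next_back().map(|(_k, v)| *v)).collect();
    assert_eq!(newest_first, ["delta@30", "image@10", "delta@10"]);
}
```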
@@ -53,163 +59,11 @@ pub struct LayerMap {
     pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
 
     /// All the historic layers are kept here
-    historic_layers: RTree<LayerRTreeObject>,
+    historic_layers: BTreeMap<BTreeKey, Arc<dyn Layer>>,
+    layers_seqno: usize,
 
-    /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
-    /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
-    l0_delta_layers: Vec<Arc<dyn Layer>>,
-}
-
-struct LayerRTreeObject {
-    layer: Arc<dyn Layer>,
-}
-
-// Representation of Key as numeric type.
-// We can not use native implementation of i128, because rstar::RTree
-// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi).
-// Overflow will cause panic in debug mode and incorrect area calculation in release mode,
-// which leads to non-optimally balanced R-Tree (but doesn't affect correctness of R-Tree work).
-// By using i256 as the type, even though all the actual values would fit in i128, we can be
-// sure that multiplication doesn't overflow.
-//
-
-#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)]
-struct IntKey(i256);
-
-impl Copy for IntKey {}
-
-impl IntKey {
-    fn from(i: i128) -> Self {
-        IntKey(i256::from(i))
-    }
-}
-
-impl Bounded for IntKey {
-    fn min_value() -> Self {
-        IntKey(i256::MIN)
-    }
-    fn max_value() -> Self {
-        IntKey(i256::MAX)
-    }
-}
-
-impl Signed for IntKey {
-    fn is_positive(&self) -> bool {
-        self.0 > i256::ZERO
-    }
-    fn is_negative(&self) -> bool {
-        self.0 < i256::ZERO
-    }
-    fn signum(&self) -> Self {
-        match self.0.cmp(&i256::ZERO) {
-            Ordering::Greater => IntKey(i256::ONE),
-            Ordering::Less => IntKey(-i256::ONE),
-            Ordering::Equal => IntKey(i256::ZERO),
-        }
-    }
-    fn abs(&self) -> Self {
-        IntKey(self.0.abs())
-    }
-    fn abs_sub(&self, other: &Self) -> Self {
-        if self.0 <= other.0 {
-            IntKey(i256::ZERO)
-        } else {
-            IntKey(self.0 - other.0)
-        }
-    }
-}
-
-impl Neg for IntKey {
-    type Output = Self;
-    fn neg(self) -> Self::Output {
-        IntKey(-self.0)
-    }
-}
-
-impl Rem for IntKey {
-    type Output = Self;
-    fn rem(self, rhs: Self) -> Self::Output {
-        IntKey(self.0 % rhs.0)
-    }
-}
-
-impl Div for IntKey {
-    type Output = Self;
-    fn div(self, rhs: Self) -> Self::Output {
-        IntKey(self.0 / rhs.0)
-    }
-}
-
-impl Add for IntKey {
-    type Output = Self;
-    fn add(self, rhs: Self) -> Self::Output {
-        IntKey(self.0 + rhs.0)
-    }
-}
-
-impl Sub for IntKey {
-    type Output = Self;
-    fn sub(self, rhs: Self) -> Self::Output {
-        IntKey(self.0 - rhs.0)
-    }
-}
-
-impl Mul for IntKey {
-    type Output = Self;
-    fn mul(self, rhs: Self) -> Self::Output {
-        IntKey(self.0 * rhs.0)
-    }
-}
-
-impl One for IntKey {
-    fn one() -> Self {
-        IntKey(i256::ONE)
-    }
-}
-
-impl Zero for IntKey {
-    fn zero() -> Self {
-        IntKey(i256::ZERO)
-    }
-    fn is_zero(&self) -> bool {
-        self.0 == i256::ZERO
-    }
-}
-
-impl Num for IntKey {
-    type FromStrRadixErr = <i128 as Num>::FromStrRadixErr;
-    fn from_str_radix(str: &str, radix: u32) -> Result<Self, Self::FromStrRadixErr> {
-        Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?)))
-    }
-}
-
-impl PartialEq for LayerRTreeObject {
-    fn eq(&self, other: &Self) -> bool {
-        // FIXME: ptr_eq might fail to return true for 'dyn'
-        // references. Clippy complains about this. In practice it
-        // seems to work, the assertion below would be triggered
-        // otherwise but this ought to be fixed.
-        #[allow(clippy::vtable_address_comparisons)]
-        Arc::ptr_eq(&self.layer, &other.layer)
-    }
-}
-
-impl RTreeObject for LayerRTreeObject {
-    type Envelope = AABB<[IntKey; 2]>;
-    fn envelope(&self) -> Self::Envelope {
-        let key_range = self.layer.get_key_range();
-        let lsn_range = self.layer.get_lsn_range();
-        AABB::from_corners(
-            [
-                IntKey::from(key_range.start.to_i128()),
-                IntKey::from(lsn_range.start.0 as i128),
-            ],
-            [
-                IntKey::from(key_range.end.to_i128() - 1),
-                IntKey::from(lsn_range.end.0 as i128 - 1),
-            ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive
-        )
-    }
-}
+    /// Latest stored delta layer
+    latest_delta_layer: Option<Arc<dyn Layer>>,
 }
 
 /// Return value of LayerMap::search
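For reference, the overflow the deleted comment describes is easy to reproduce: an R-tree area term Xi*Yi with both factors near 2^64 needs more than 127 bits, so plain i128 multiplication can wrap. A two-line check (an independent demonstration, not pageserver code):

```rust
fn main() {
    // A key spans up to ~2^64 and an LSN distance can too; their product
    // needs 128+ bits, so i128 multiplication can overflow.
    let x: i128 = 1 << 64;
    let y: i128 = 1 << 64;
    assert_eq!(x.checked_mul(y), None); // plain `x * y` would panic in debug builds
    // The removed IntKey wrapper avoided this by computing in i256; the new
    // BTreeMap-based LayerMap sidesteps the problem entirely.
}
```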
@@ -234,23 +88,17 @@ impl LayerMap {
         // linear search
         // Find the latest image layer that covers the given key
         let mut latest_img: Option<Arc<dyn Layer>> = None;
-        let mut latest_img_lsn: Option<Lsn> = None;
-        let envelope = AABB::from_corners(
-            [IntKey::from(key.to_i128()), IntKey::from(0i128)],
-            [
-                IntKey::from(key.to_i128()),
-                IntKey::from(end_lsn.0 as i128 - 1),
-            ],
-        );
-        for e in self
+        let mut latest_img_lsn = Lsn(0);
+        let mut iter = self
             .historic_layers
-            .locate_in_envelope_intersecting(&envelope)
-        {
-            let l = &e.layer;
+            .range(BTreeKey::new(Lsn(0))..BTreeKey::new(end_lsn));
+        while let Some((_key, l)) = iter.next_back() {
             if l.is_incremental() {
                 continue;
             }
-            assert!(l.get_key_range().contains(&key));
+            if !l.get_key_range().contains(&key) {
+                continue;
+            }
             let img_lsn = l.get_lsn_range().start;
             assert!(img_lsn < end_lsn);
             if Lsn(img_lsn.0 + 1) == end_lsn {
@@ -260,23 +108,23 @@ impl LayerMap {
                     lsn_floor: img_lsn,
                 }));
             }
-            if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
-                latest_img = Some(Arc::clone(l));
-                latest_img_lsn = Some(img_lsn);
-            }
+            latest_img = Some(Arc::clone(l));
+            latest_img_lsn = img_lsn;
+            break;
         }
 
         // Search the delta layers
         let mut latest_delta: Option<Arc<dyn Layer>> = None;
-        for e in self
+        let mut iter = self
             .historic_layers
-            .locate_in_envelope_intersecting(&envelope)
-        {
-            let l = &e.layer;
+            .range(BTreeKey::new(Lsn(0))..BTreeKey::new(end_lsn));
+        while let Some((_key, l)) = iter.next_back() {
             if !l.is_incremental() {
                 continue;
             }
-            assert!(l.get_key_range().contains(&key));
+            if !l.get_key_range().contains(&key) {
+                continue;
+            }
             if l.get_lsn_range().start >= end_lsn {
                 info!(
                     "Candidate delta layer {}..{} is too new for lsn {}",
@@ -286,6 +134,9 @@ impl LayerMap {
                 );
             }
             assert!(l.get_lsn_range().start < end_lsn);
+            if l.get_lsn_range().end <= latest_img_lsn {
+                continue;
+            }
             if l.get_lsn_range().end >= end_lsn {
                 // this layer contains the requested point in the key/lsn space.
                 // No need to search any further
@@ -311,10 +162,7 @@ impl LayerMap {
                 "found (old) layer {} for request on {key} at {end_lsn}",
                 l.filename().display(),
             );
-            let lsn_floor = std::cmp::max(
-                Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
-                l.get_lsn_range().start,
-            );
+            let lsn_floor = std::cmp::max(Lsn(latest_img_lsn.0 + 1), l.get_lsn_range().start);
             Ok(Some(SearchResult {
                 lsn_floor,
                 layer: l,
@@ -322,7 +170,7 @@ impl LayerMap {
         } else if let Some(l) = latest_img {
             trace!("found img layer and no deltas for request on {key} at {end_lsn}");
             Ok(Some(SearchResult {
-                lsn_floor: latest_img_lsn.unwrap(),
+                lsn_floor: latest_img_lsn,
                 layer: l,
             }))
         } else {
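The simplified `lsn_floor` line is the heart of the search contract: page reconstruction walks deltas backwards until `lsn_floor`, so the floor must sit just above the newest image (which already materializes the page) unless the delta layer itself starts later. A worked example of the same expression, with `latest_img_lsn` now a plain `Lsn` defaulting to `Lsn(0)` instead of an `Option` (types simplified for illustration):

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64); // stand-in for utils::lsn::Lsn

// Same expression as the patched code: max(latest_img_lsn + 1, delta_start).
fn lsn_floor(latest_img_lsn: Lsn, delta_start: Lsn) -> Lsn {
    std::cmp::max(Lsn(latest_img_lsn.0 + 1), delta_start)
}

fn main() {
    // Image at LSN 100, delta layer covering 80..120: replay stops at 101,
    // because the image already contains everything up to 100.
    assert_eq!(lsn_floor(Lsn(100), Lsn(80)), Lsn(101));
    // No image found (latest_img_lsn stays Lsn(0)): the floor is the delta
    // layer's own start.
    assert_eq!(lsn_floor(Lsn(0), Lsn(80)), Lsn(80));
}
```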
@@ -336,9 +184,28 @@ impl LayerMap {
     ///
     pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
         if layer.get_key_range() == (Key::MIN..Key::MAX) {
-            self.l0_delta_layers.push(layer.clone());
+            self.latest_delta_layer = Some(layer.clone());
+        } else if !layer.is_incremental() {
+            // If the latest delta layer is followed by image layers,
+            // reset it, preventing generation of a partial image layer.
+            if let Some(latest_delta) = &self.latest_delta_layer {
+                // Maybe it is more correct to use contains() rather than intersects(),
+                // but one delta layer can be covered by several image layers.
+                let kr1 = layer.get_key_range();
+                let kr2 = latest_delta.get_key_range();
+                if range_overlaps(&kr1, &kr2) {
+                    self.latest_delta_layer = None;
+                }
+            }
         }
-        self.historic_layers.insert(LayerRTreeObject { layer });
+        self.historic_layers.insert(
+            BTreeKey {
+                lsn: layer.get_lsn_range().start,
+                seq: self.layers_seqno,
+            },
+            layer,
+        );
+        self.layers_seqno += 1;
         NUM_ONDISK_LAYERS.inc();
     }
@@ -349,21 +216,33 @@ impl LayerMap {
     ///
     pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
         if layer.get_key_range() == (Key::MIN..Key::MAX) {
-            let len_before = self.l0_delta_layers.len();
-
-            // FIXME: ptr_eq might fail to return true for 'dyn'
-            // references. Clippy complains about this. In practice it
-            // seems to work, the assertion below would be triggered
-            // otherwise but this ought to be fixed.
-            #[allow(clippy::vtable_address_comparisons)]
-            self.l0_delta_layers
-                .retain(|other| !Arc::ptr_eq(other, &layer));
-            assert_eq!(self.l0_delta_layers.len(), len_before - 1);
+            if let Some(latest_layer) = &self.latest_delta_layer {
+                #[allow(clippy::vtable_address_comparisons)]
+                if Arc::ptr_eq(&layer, latest_layer) {
+                    self.latest_delta_layer = None;
+                }
+            }
         }
-        assert!(self
-            .historic_layers
-            .remove(&LayerRTreeObject { layer })
-            .is_some());
+        let len_before = self.historic_layers.len();
+        #[allow(clippy::vtable_address_comparisons)]
+        self.historic_layers
+            .retain(|_key, other| !Arc::ptr_eq(other, &layer));
+        if self.historic_layers.len() != len_before - 1 {
+            assert!(self.historic_layers.len() == len_before);
+            error!(
+                "Failed to remove {} layer: {}..{}__{}..{}",
+                if layer.is_incremental() {
+                    "incremental"
+                } else {
+                    "image"
+                },
+                layer.get_key_range().start,
+                layer.get_key_range().end,
+                layer.get_lsn_range().start,
+                layer.get_lsn_range().end
+            );
+        }
+        assert!(self.historic_layers.len() == len_before - 1);
         NUM_ONDISK_LAYERS.dec();
     }
@@ -380,21 +259,10 @@ impl LayerMap {
 
         loop {
             let mut made_progress = false;
-            let envelope = AABB::from_corners(
-                [
-                    IntKey::from(range_remain.start.to_i128()),
-                    IntKey::from(lsn_range.start.0 as i128),
-                ],
-                [
-                    IntKey::from(range_remain.end.to_i128() - 1),
-                    IntKey::from(lsn_range.end.0 as i128 - 1),
-                ],
-            );
-            for e in self
+            for (_key, l) in self
                 .historic_layers
-                .locate_in_envelope_intersecting(&envelope)
+                .range(BTreeKey::new(lsn_range.start)..BTreeKey::new(lsn_range.end))
             {
-                let l = &e.layer;
                 if l.is_incremental() {
                     continue;
                 }
@@ -417,39 +285,30 @@ impl LayerMap {
     }
 
     pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
-        self.historic_layers.iter().map(|e| e.layer.clone())
+        self.historic_layers
+            .iter()
+            .map(|(_key, layer)| layer.clone())
     }
 
     /// Find the last image layer that covers 'key', ignoring any image layers
     /// newer than 'lsn'.
     fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
-        let mut candidate_lsn = Lsn(0);
-        let mut candidate = None;
-        let envelope = AABB::from_corners(
-            [IntKey::from(key.to_i128()), IntKey::from(0)],
-            [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)],
-        );
-        for e in self
+        let mut iter = self
             .historic_layers
-            .locate_in_envelope_intersecting(&envelope)
-        {
-            let l = &e.layer;
+            .range(BTreeKey::new(Lsn(0))..BTreeKey::new(lsn + 1));
+        while let Some((_key, l)) = iter.next_back() {
             if l.is_incremental() {
                 continue;
             }
-
-            assert!(l.get_key_range().contains(&key));
-            let this_lsn = l.get_lsn_range().start;
-            assert!(this_lsn <= lsn);
-            if this_lsn < candidate_lsn {
-                // our previous candidate was better
+            if !l.get_key_range().contains(&key) {
                 continue;
            }
-            candidate_lsn = this_lsn;
-            candidate = Some(Arc::clone(l));
+            let this_lsn = l.get_lsn_range().start;
+            assert!(this_lsn <= lsn);
+            return Some(Arc::clone(l));
         }
-
-        candidate
+        None
     }
 
     ///
@@ -466,18 +325,10 @@ impl LayerMap {
         lsn: Lsn,
     ) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
         let mut points = vec![key_range.start];
-        let envelope = AABB::from_corners(
-            [IntKey::from(key_range.start.to_i128()), IntKey::from(0)],
-            [
-                IntKey::from(key_range.end.to_i128()),
-                IntKey::from(lsn.0 as i128),
-            ],
-        );
-        for e in self
+        for (_lsn, l) in self
             .historic_layers
-            .locate_in_envelope_intersecting(&envelope)
+            .range(BTreeKey::new(Lsn(0))..BTreeKey::new(lsn + 1))
         {
-            let l = &e.layer;
             assert!(l.get_lsn_range().start <= lsn);
             let range = l.get_key_range();
             if key_range.contains(&range.start) {
@@ -514,26 +365,17 @@ impl LayerMap {
         if lsn_range.start >= lsn_range.end {
             return Ok(0);
         }
-        let envelope = AABB::from_corners(
-            [
-                IntKey::from(key_range.start.to_i128()),
-                IntKey::from(lsn_range.start.0 as i128),
-            ],
-            [
-                IntKey::from(key_range.end.to_i128() - 1),
-                IntKey::from(lsn_range.end.0 as i128 - 1),
-            ],
-        );
-        for e in self
+        for (_lsn, l) in self
             .historic_layers
-            .locate_in_envelope_intersecting(&envelope)
+            .range(BTreeKey::new(lsn_range.start)..BTreeKey::new(lsn_range.end))
         {
-            let l = &e.layer;
             if !l.is_incremental() {
                 continue;
             }
+            if !range_overlaps(&l.get_key_range(), key_range) {
+                continue;
+            }
             assert!(range_overlaps(&l.get_lsn_range(), lsn_range));
-            assert!(range_overlaps(&l.get_key_range(), key_range));
 
             // We ignore level0 delta layers. Unless the whole keyspace fits
             // into one partition
@@ -549,8 +391,8 @@ impl LayerMap {
     }
 
     /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
-        Ok(self.l0_delta_layers.clone())
+    pub fn get_latest_delta_layer(&mut self) -> Option<Arc<dyn Layer>> {
+        self.latest_delta_layer.take()
     }
 
     /// debugging function to print out the contents of the layer map
@@ -569,8 +411,8 @@ impl LayerMap {
         }
 
         println!("historic_layers:");
-        for e in self.historic_layers.iter() {
-            e.layer.dump(verbose)?;
+        for (_key, layer) in self.historic_layers.iter() {
+            layer.dump(verbose)?;
         }
         println!("End dump LayerMap");
         Ok(())
@@ -139,9 +139,9 @@ pub trait Layer: Send + Sync {
     /// Iterate through all keys and values stored in the layer
     fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
 
-    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
-    /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
+    /// Iterate through all keys stored in the layer. Returns key, lsn and value size.
+    /// It is used only for reconstruction and so is currently implemented only for DeltaLayer
+    fn key_iter(&self, _skip_images: bool) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
         panic!("Not implemented")
     }
 
@@ -150,4 +150,7 @@ pub trait Layer: Send + Sync {
 
     /// Dump summary of the contents of the layer to stdout
     fn dump(&self, verbose: bool) -> Result<()>;
+
+    // Check if layer contains a particular key
+    fn contains(&self, key: &Key) -> Result<bool>;
 }
@@ -3,7 +3,6 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
 use fail::fail_point;
-use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use tokio::task::spawn_blocking;
 use tracing::*;
@@ -119,7 +118,7 @@ pub struct Timeline {
 
     /// Layer removal lock.
     /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
-    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
+    /// This lock is acquired in [`Timeline::gc`], [`Timeline::reconstruct`],
     /// and [`Tenant::delete_timeline`].
     layer_removal_cs: Mutex<()>,
@@ -469,7 +468,7 @@ impl Timeline {
             CheckpointConfig::Forced => {
                 self.freeze_inmem_layer(false);
                 self.flush_frozen_layers(true)?;
-                self.compact()
+                self.reconstruct()
             }
         }
     }
@@ -510,13 +509,6 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
     }
 
-    fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap();
-        tenant_conf
-            .compaction_threshold
-            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
-    }
-
     fn get_image_creation_threshold(&self) -> usize {
         let tenant_conf = self.tenant_conf.read().unwrap();
         tenant_conf
@@ -597,7 +589,7 @@ impl Timeline {
             last_received_wal: Mutex::new(None),
             rel_size_cache: RwLock::new(HashMap::new()),
         };
-        result.repartition_threshold = result.get_checkpoint_distance() / 10;
+        result.repartition_threshold = result.get_checkpoint_distance() * 3;
         result
     }
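The effect of this one-line change is large. If the default `checkpoint_distance` is 256 MiB (an assumption about the default tenant config at the time, not something stated in this diff), the repartition threshold moves from roughly 25.6 MiB of WAL to 768 MiB, i.e. repartitioning about 30x less often:

```rust
fn main() {
    // Assumed default checkpoint distance; check the tenant config for the
    // actual deployed value.
    let checkpoint_distance: u64 = 256 * 1024 * 1024;
    let old_threshold = checkpoint_distance / 10; //  26_843_545 bytes (~25.6 MiB)
    let new_threshold = checkpoint_distance * 3;  // 805_306_368 bytes (768 MiB)
    assert_eq!(new_threshold / old_threshold, 30);
}
```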
@@ -731,7 +723,7 @@ impl Timeline {
     pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
         self.layer_removal_cs
             .try_lock()
-            .map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
+            .map_err(|e| anyhow!("cannot lock reconstruction critical section {e}"))
     }
 
     /// Retrieve current logical size of the timeline.
@@ -1333,17 +1325,17 @@ impl Timeline {
         Ok(new_delta_path)
     }
 
-    pub fn compact(&self) -> anyhow::Result<()> {
+    pub fn reconstruct(&self) -> anyhow::Result<()> {
         let last_record_lsn = self.get_last_record_lsn();
 
         // Last record Lsn could be zero in case the timeline was just created
         if !last_record_lsn.is_valid() {
-            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
+            warn!("Skipping reconstruction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
             return Ok(());
         }
 
         //
-        // High level strategy for compaction / image creation:
+        // High level strategy for reconstruction / image creation:
         //
         // 1. First, calculate the desired "partitioning" of the
         //    currently in-use key space. The goal is to partition the
@@ -1367,13 +1359,13 @@ impl Timeline {
         //    total in the delta file. Or perhaps: if creating an image
         //    file would allow to delete some older files.
         //
-        // 3. After that, we compact all level0 delta files if there
-        //    are too many of them. While compacting, we also garbage
+        // 3. After that, we reconstruct all level0 delta files if there
+        //    are too many of them. While reconstructing, we also garbage
         //    collect any page versions that are no longer needed because
        //    of the new image layers we created in step 2.
         //
         // TODO: This high level strategy hasn't been implemented yet.
-        // Below are functions compact_level0() and create_image_layers()
+        // Below are functions reconstruct_level0() and create_image_layers()
         // but they are a bit ad hoc and don't quite work like it's explained
         // above. Rewrite it.
         let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
@@ -1400,21 +1392,21 @@ impl Timeline {
                         None,
                     );
                 }
-
-                // 3. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size)?;
-                timer.stop_and_record();
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
                 // as an empty timeline. Also in unit tests, when we use the timeline
                 // as a simple key-value store, ignoring the datadir layout. Log the
                 // error but continue.
-                error!("could not compact, repartitioning keyspace failed: {err:?}");
+                error!("could not reconstruct, repartitioning keyspace failed: {err:?}");
             }
         };
 
+        // 3. Reconstruct
+        let timer = self.metrics.reconstruct_time_histo.start_timer();
+        self.reconstruct_level0(target_file_size)?;
+        timer.stop_and_record();
+
         Ok(())
     }
@@ -1514,7 +1506,7 @@ impl Timeline {
         // We must also fsync the timeline dir to ensure the directory entries for
         // new layer files are durable
         //
-        // Compaction creates multiple image layers. It would be better to create them all
+        // Reconstruction creates multiple image layers. It would be better to create them all
         // and fsync them all in parallel.
         let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone());
         all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
@@ -1534,230 +1526,46 @@ impl Timeline {
     }
 
     ///
-    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
-    /// Level 1 files.
+    /// Collect a bunch of Level 0 layer files, and reconstruct and reshuffle them
+    /// as Level 1 files.
     ///
-    fn compact_level0(&self, target_file_size: u64) -> Result<()> {
-        let layers = self.layers.read().unwrap();
-        let mut level0_deltas = layers.get_level0_deltas()?;
-
-        // Only compact if enough layers have accumulated.
-        if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
-            return Ok(());
-        }
-
-        // Gather the files to compact in this iteration.
-        //
-        // Start with the oldest Level 0 delta file, and collect any other
-        // level 0 files that form a contiguous sequence, such that the end
-        // LSN of previous file matches the start LSN of the next file.
-        //
-        // Note that if the files don't form such a sequence, we might
-        // "compact" just a single file. That's a bit pointless, but it allows
-        // us to get rid of the level 0 file, and compact the other files on
-        // the next iteration. This could probably be made smarter, but such
-        // "gaps" in the sequence of level 0 files should only happen in case
-        // of a crash, partial download from cloud storage, or something like
-        // that, so it's not a big deal in practice.
-        level0_deltas.sort_by_key(|l| l.get_lsn_range().start);
-        let mut level0_deltas_iter = level0_deltas.iter();
-
-        let first_level0_delta = level0_deltas_iter.next().unwrap();
-        let mut prev_lsn_end = first_level0_delta.get_lsn_range().end;
-        let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)];
-        for l in level0_deltas_iter {
-            let lsn_range = l.get_lsn_range();
-
-            if lsn_range.start != prev_lsn_end {
-                break;
-            }
-            deltas_to_compact.push(Arc::clone(l));
-            prev_lsn_end = lsn_range.end;
-        }
-        let lsn_range = Range {
-            start: deltas_to_compact.first().unwrap().get_lsn_range().start,
-            end: deltas_to_compact.last().unwrap().get_lsn_range().end,
-        };
-
-        info!(
-            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
-            lsn_range.start,
-            lsn_range.end,
-            deltas_to_compact.len(),
-            level0_deltas.len()
-        );
-        for l in deltas_to_compact.iter() {
-            info!("compact includes {}", l.filename().display());
-        }
-        // We don't need the original list of layers anymore. Drop it so that
-        // we don't accidentally use it later in the function.
-        drop(level0_deltas);
-
-        // This iterator walks through all key-value pairs from all the layers
-        // we're compacting, in key, LSN order.
-        let all_values_iter = deltas_to_compact
-            .iter()
-            .map(|l| l.iter())
-            .kmerge_by(|a, b| {
-                if let Ok((a_key, a_lsn, _)) = a {
-                    if let Ok((b_key, b_lsn, _)) = b {
-                        match a_key.cmp(b_key) {
-                            Ordering::Less => true,
-                            Ordering::Equal => a_lsn <= b_lsn,
-                            Ordering::Greater => false,
-                        }
-                    } else {
-                        false
-                    }
-                } else {
-                    true
-                }
-            });
-
-        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = deltas_to_compact
-            .iter()
-            .map(|l| l.key_iter())
-            .kmerge_by(|a, b| {
-                let (a_key, a_lsn, _) = a;
-                let (b_key, b_lsn, _) = b;
-                match a_key.cmp(b_key) {
-                    Ordering::Less => true,
-                    Ordering::Equal => a_lsn <= b_lsn,
-                    Ordering::Greater => false,
-                }
-            });
-
-        // Merge the contents of all the input delta layers into a new set
-        // of delta layers, based on the current partitioning.
-        //
-        // We split the new delta layers on the key dimension. We iterate through
-        // the key space, and for each key, check if including the next key to the
-        // current output layer we're building would cause the layer to become too
-        // large. If so, dump the current output layer and start a new one.
-        // It's possible that there is a single key with so many page versions that
-        // storing all of them in a single layer file would be too large. In that
-        // case, we also split on the LSN dimension.
-        //
-        // LSN
-        //  ^
-        //  |
-        //  | +-----------+            +--+--+--+--+
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+     ==>    |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            +--+--+--+--+
-        //  |
-        //  +--------------> key
-        //
-        //
-        // If one key (X) has a lot of page versions:
-        //
-        // LSN
-        //  ^
-        //  |                                 (X)
-        //  | +-----------+            +--+--+--+--+
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  +--+  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+     ==>    |  |  |  |  |
-        //  | |           |            |  |  +--+  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            +--+--+--+--+
-        //  |
-        //  +--------------> key
-        //
-        // TODO: this actually divides the layers into fixed-size chunks, not
-        // based on the partitioning.
-        //
-        // TODO: we should also opportunistically materialize and
-        // garbage collect what we can.
-        let mut new_layers = Vec::new();
-        let mut prev_key: Option<Key> = None;
-        let mut writer: Option<DeltaLayerWriter> = None;
-        let mut key_values_total_size = 0u64;
-        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
-        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for x in all_values_iter {
-            let (key, lsn, value) = x?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
-                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
-                        if dup_end_lsn.is_valid() {
-                            // We are writing a segment with duplicates:
-                            // place all remaining values of this key in a separate segment
-                            dup_start_lsn = dup_end_lsn; // new segment starts where the old one stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
-                        }
-                        break;
-                    }
-                    key_values_total_size += next_size;
-                    // Check if it is time to split segment: if total keys size is larger than target file size.
-                    // We need to avoid generation of empty segments if next_size > target_file_size.
-                    if key_values_total_size > target_file_size && lsn != next_lsn {
-                        // Split key between multiple layers: such layer can contain only single key
-                        dup_start_lsn = if dup_end_lsn.is_valid() {
-                            dup_end_lsn // new segment with duplicates starts where old one stops
-                        } else {
-                            lsn // start with the first LSN for this key
-                        };
-                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
-                        break;
-                    }
-                }
-                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
-                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
-                    dup_start_lsn = dup_end_lsn;
-                    dup_end_lsn = lsn_range.end;
-                }
-                if writer.is_some() {
-                    let written_size = writer.as_mut().unwrap().size();
-                    // check if key causes layer overflow...
-                    if is_dup_layer
-                        || dup_end_lsn.is_valid()
-                        || written_size + key_values_total_size > target_file_size
-                    {
-                        // ... if so, flush previous layer and prepare to write a new one
-                        new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
-                    }
-                }
-                // Remember size of key value because at next iteration we will access next item
-                key_values_total_size = next_key_size;
-            }
-            if writer.is_none() {
-                // Create writer if not initialized yet
-                writer = Some(DeltaLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_id,
-                    key,
-                    if dup_end_lsn.is_valid() {
-                        // this is a layer containing slice of values of the same key
-                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                        dup_start_lsn..dup_end_lsn
-                    } else {
-                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                        lsn_range.clone()
-                    },
-                )?);
-            }
-            writer.as_mut().unwrap().put_value(key, lsn, value)?;
-            prev_key = Some(key);
-        }
+    fn reconstruct_level0(&self, target_file_size: u64) -> Result<()> {
+        let mut layers = self.layers.write().unwrap();
+        let latest_delta_layer = layers.get_latest_delta_layer();
+        drop(layers);
+
+        let mut writer: Option<DeltaLayerWriter> = None;
+        let mut new_layers = Vec::new();
+        let mut last_key: Option<Key> = None;
+        if let Some(last_delta_layer) = latest_delta_layer {
+            let end_lsn = last_delta_layer.get_lsn_range().end;
+            let lsn_range = end_lsn..end_lsn + 1;
+            for (key, lsn, _) in last_delta_layer.key_iter(true) {
+                let value = self.get(key, lsn)?;
+                if let Some(curr_writer) = &writer {
+                    if curr_writer.size() > target_file_size {
+                        new_layers.push(writer.take().unwrap().finish(key)?);
+                        writer = None;
+                    }
+                }
+                if writer.is_none() {
+                    writer = Some(DeltaLayerWriter::new(
+                        self.conf,
+                        self.timeline_id,
+                        self.tenant_id,
+                        key,
+                        lsn_range.clone(),
+                    )?);
+                }
+                writer
+                    .as_mut()
+                    .unwrap()
+                    .put_value(key, end_lsn, Value::Image(value))?;
+                last_key = Some(key);
+            }
+        }
         if let Some(writer) = writer {
-            new_layers.push(writer.finish(prev_key.unwrap().next())?);
+            new_layers.push(writer.finish(last_key.unwrap().next())?);
         }
 
         // Sync layers
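Stripped of pageserver types, the new `reconstruct_level0` body is a classic roll-the-writer loop: materialize each key at the layer's end LSN, and cut a new output file whenever the current one crosses `target_file_size`. A toy rendition of just that control flow (all types here are hypothetical stand-ins, with `materialize` playing the role of `Timeline::get`):

```rust
// Toy stand-in for DeltaLayerWriter.
struct Writer {
    bytes: u64,
    items: Vec<(u64, Vec<u8>)>,
}

impl Writer {
    fn new() -> Writer {
        Writer { bytes: 0, items: Vec::new() }
    }
    fn put(&mut self, key: u64, img: Vec<u8>) {
        self.bytes += img.len() as u64;
        self.items.push((key, img));
    }
    fn size(&self) -> u64 {
        self.bytes
    }
}

fn reconstruct(
    keys: &[u64],
    target_file_size: u64,
    materialize: impl Fn(u64) -> Vec<u8>, // plays the role of Timeline::get(key, lsn)
) -> Vec<Writer> {
    let mut finished = Vec::new();
    let mut writer: Option<Writer> = None;
    for &key in keys {
        let img = materialize(key);
        // Roll to a new output layer once the current one is big enough.
        if writer.as_ref().map_or(false, |w| w.size() > target_file_size) {
            finished.push(writer.take().unwrap());
        }
        if writer.is_none() {
            writer = Some(Writer::new());
        }
        writer.as_mut().unwrap().put(key, img);
    }
    if let Some(w) = writer {
        finished.push(w);
    }
    finished
}

fn main() {
    let layers = reconstruct(&[1, 2, 3, 4, 5], 16, |_k| vec![0u8; 10]);
    // 10 bytes per image, roll after exceeding 16: layers of 2, 2 and 1 keys.
    assert_eq!(layers.iter().map(|w| w.items.len()).collect::<Vec<_>>(), vec![2, 2, 1]);
}
```

Because every value written is a full `Value::Image` at a single LSN just past the source layer, the output layers never need older history for these keys, which is what lets the patch drop the old kmerge-based compaction entirely.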
@@ -1787,23 +1595,6 @@ impl Timeline {
             new_layer_paths.insert(new_delta_path);
             layers.insert_historic(Arc::new(l));
         }
-
-        // Now that we have reshuffled the data to a set of new delta layers, we can
-        // delete the old ones
-        let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len());
-        drop(all_keys_iter);
-        for l in deltas_to_compact {
-            if let Some(path) = l.local_path() {
-                self.metrics
-                    .current_physical_size_gauge
-                    .sub(path.metadata()?.len());
-                layer_paths_do_delete.insert(path);
-            }
-            l.delete()?;
-            layers.remove_historic(l);
-        }
         drop(layers);
 
         if self.upload_layers.load(atomic::Ordering::Relaxed) {
             storage_sync::schedule_layer_upload(
                 self.tenant_id,
@@ -1811,11 +1602,6 @@ impl Timeline {
                 new_layer_paths,
                 None,
             );
-            storage_sync::schedule_layer_delete(
-                self.tenant_id,
-                self.timeline_id,
-                layer_paths_do_delete,
-            );
         }
 
         Ok(())
@@ -1823,10 +1609,10 @@ impl Timeline {
 
     /// Update information about which layer files need to be retained on
     /// garbage collection. This is separate from actually performing the GC,
-    /// and is updated more frequently, so that compaction can remove obsolete
+    /// and is updated more frequently, so that reconstruction can remove obsolete
     /// page versions more aggressively.
     ///
-    /// TODO: that's wishful thinking, compaction doesn't actually do that
+    /// TODO: that's wishful thinking, reconstruction doesn't actually do that
     /// currently.
     ///
     /// The caller specifies how much history is needed with the 3 arguments: