mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 05:22:56 +00:00
fix(pageserver): consider partial compaction layer map in layer check (#10044)
## Problem In https://github.com/neondatabase/neon/pull/9897 we temporarily disabled the layer valid check because the current one only considers the end result of all compaction algorithms, but partial gc-compaction would temporarily produce an "invalid" layer map. part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes Allow LSN splits to overlap in the slow path check. Currently, the valid check is only used in storage scrubber (background job) and during gc-compaction (without taking layer lock). Therefore, it's fine for such checks to be a little bit inefficient but more accurate. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
This commit is contained in:
@@ -1,12 +1,15 @@
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use itertools::Itertools;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
|
||||
use super::storage_layer::LayerName;
|
||||
|
||||
/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
|
||||
///
|
||||
/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
|
||||
/// The function implements a fast path check and a slow path check.
|
||||
///
|
||||
/// The fast path checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
|
||||
///
|
||||
/// ```plain
|
||||
/// | | | |
|
||||
@@ -25,31 +28,47 @@ use super::storage_layer::LayerName;
|
||||
/// | | | 4 | | |
|
||||
///
|
||||
/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
|
||||
///
|
||||
/// However, if a partial compaction is still going on, it is possible that we get a layer map not satisfying the above condition.
|
||||
/// Therefore, we fallback to simply check if any of the two delta layers overlap. (See "A slow path...")
|
||||
pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
let mut all_delta_layers = Vec::new();
|
||||
for name in metadata {
|
||||
if let LayerName::Delta(layer) = name {
|
||||
if layer.key_range.start.next() != layer.key_range.end {
|
||||
all_delta_layers.push(layer.clone());
|
||||
}
|
||||
all_delta_layers.push(layer.clone());
|
||||
}
|
||||
}
|
||||
for layer in &all_delta_layers {
|
||||
let lsn_range = &layer.lsn_range;
|
||||
lsn_split_point.insert(lsn_range.start);
|
||||
lsn_split_point.insert(lsn_range.end);
|
||||
if layer.key_range.start.next() != layer.key_range.end {
|
||||
let lsn_range = &layer.lsn_range;
|
||||
lsn_split_point.insert(lsn_range.start);
|
||||
lsn_split_point.insert(lsn_range.end);
|
||||
}
|
||||
}
|
||||
for layer in &all_delta_layers {
|
||||
for (idx, layer) in all_delta_layers.iter().enumerate() {
|
||||
if layer.key_range.start.next() == layer.key_range.end {
|
||||
continue;
|
||||
}
|
||||
let lsn_range = layer.lsn_range.clone();
|
||||
let intersects = lsn_split_point.range(lsn_range).collect_vec();
|
||||
if intersects.len() > 1 {
|
||||
let err = format!(
|
||||
"layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
|
||||
layer,
|
||||
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
|
||||
);
|
||||
return Some(err);
|
||||
// A slow path to check if the layer intersects with any other delta layer.
|
||||
for (other_idx, other_layer) in all_delta_layers.iter().enumerate() {
|
||||
if other_idx == idx {
|
||||
// do not check self intersects with self
|
||||
continue;
|
||||
}
|
||||
if overlaps_with(&layer.lsn_range, &other_layer.lsn_range)
|
||||
&& overlaps_with(&layer.key_range, &other_layer.key_range)
|
||||
{
|
||||
let err = format!(
|
||||
"layer violates the layer map LSN split assumption: layer {} intersects with layer {}",
|
||||
layer, other_layer
|
||||
);
|
||||
return Some(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
|
||||
@@ -29,6 +29,7 @@ use utils::id::TimelineId;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::batch_split_writer::{
|
||||
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
|
||||
@@ -2156,15 +2157,14 @@ impl Timeline {
|
||||
|
||||
// Step 1: construct a k-merge iterator over all layers.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
// disable the check for now because we need to adjust the check for partial compactions, will enable later.
|
||||
// let layer_names = job_desc
|
||||
// .selected_layers
|
||||
// .iter()
|
||||
// .map(|layer| layer.layer_desc().layer_name())
|
||||
// .collect_vec();
|
||||
// if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
// warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
|
||||
// }
|
||||
let layer_names = job_desc
|
||||
.selected_layers
|
||||
.iter()
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err);
|
||||
}
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = job_desc
|
||||
.selected_layers
|
||||
@@ -2546,8 +2546,36 @@ impl Timeline {
|
||||
);
|
||||
|
||||
// Step 3: Place back to the layer map.
|
||||
|
||||
// First, do a sanity check to ensure the newly-created layer map does not contain overlaps.
|
||||
let all_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
|
||||
let mut final_layers = all_layers
|
||||
.iter()
|
||||
.map(|layer| layer.layer_name())
|
||||
.collect::<HashSet<_>>();
|
||||
for layer in &layer_selection {
|
||||
final_layers.remove(&layer.layer_desc().layer_name());
|
||||
}
|
||||
for layer in &compact_to {
|
||||
final_layers.insert(layer.layer_desc().layer_name());
|
||||
}
|
||||
let final_layers = final_layers.into_iter().collect_vec();
|
||||
|
||||
// TODO: move this check before we call `finish` on image layer writers. However, this will require us to get the layer name before we finish
|
||||
// the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
|
||||
// in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
|
||||
if let Some(err) = check_valid_layermap(&final_layers) {
|
||||
bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err);
|
||||
}
|
||||
|
||||
// Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
|
||||
// operate on L1 layers.
|
||||
{
|
||||
// TODO: sanity check if the layer map is valid (i.e., should not have overlaps)
|
||||
let mut guard = self.layers.write().await;
|
||||
guard
|
||||
.open_mut()?
|
||||
|
||||
Reference in New Issue
Block a user