mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 15:10:44 +00:00
Before this patch, GC would call PersistentLayer::delete() on every GC'ed layer. RemoteLayer::delete() returned Ok(()) unconditionally. GC would then proceed by decrementing the resident size metric, even though the layer is a RemoteLayer. This patch makes the following changes: - Rename PersistentLayer::delete() to delete_resident_layer_file(). That name is unambiguous. - Make RemoteLayer::delete_resident_layer_file return an Err(). We would have uncovered this bug if we had done that from the start. - Change GC / Timeline::delete_historic_layer check whether the layer is remote or not, and only call delete_resident_layer_file() if it's not remote. This brings us in line with how eviction does it. - Add a regression test. fixes https://github.com/neondatabase/neon/issues/3722
499 lines
17 KiB
Rust
499 lines
17 KiB
Rust
//! Common traits and structs for layers
|
|
|
|
pub mod delta_layer;
|
|
mod filename;
|
|
mod image_layer;
|
|
mod inmemory_layer;
|
|
mod remote_layer;
|
|
|
|
use crate::config::PageServerConf;
|
|
use crate::context::RequestContext;
|
|
use crate::repository::{Key, Value};
|
|
use crate::task_mgr::TaskKind;
|
|
use crate::walrecord::NeonWalRecord;
|
|
use anyhow::Result;
|
|
use bytes::Bytes;
|
|
use either::Either;
|
|
use enum_map::EnumMap;
|
|
use enumset::EnumSet;
|
|
use pageserver_api::models::LayerAccessKind;
|
|
use pageserver_api::models::{
|
|
HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
|
};
|
|
use std::ops::Range;
|
|
use std::path::PathBuf;
|
|
use std::sync::{Arc, Mutex};
|
|
use std::time::{SystemTime, UNIX_EPOCH};
|
|
use utils::history_buffer::HistoryBufferWithDropCounter;
|
|
|
|
use utils::{
|
|
id::{TenantId, TimelineId},
|
|
lsn::Lsn,
|
|
};
|
|
|
|
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
|
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
|
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
|
pub use inmemory_layer::InMemoryLayer;
|
|
pub use remote_layer::RemoteLayer;
|
|
|
|
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
|
where
|
|
T: PartialOrd<T>,
|
|
{
|
|
if a.start < b.start {
|
|
a.end > b.start
|
|
} else {
|
|
b.end > a.start
|
|
}
|
|
}
|
|
|
|
pub fn range_eq<T>(a: &Range<T>, b: &Range<T>) -> bool
|
|
where
|
|
T: PartialEq<T>,
|
|
{
|
|
a.start == b.start && a.end == b.end
|
|
}
|
|
|
|
/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
|
|
///
|
|
/// Before first call, you can fill in 'page_img' if you have an older cached
|
|
/// version of the page available. That can save work in
|
|
/// 'get_value_reconstruct_data', as it can stop searching for page versions
|
|
/// when all the WAL records going back to the cached image have been collected.
|
|
///
|
|
/// When get_value_reconstruct_data returns Complete, 'img' is set to an image
|
|
/// of the page, or the oldest WAL record in 'records' is a will_init-type
|
|
/// record that initializes the page without requiring a previous image.
|
|
///
|
|
/// If 'get_page_reconstruct_data' returns Continue, some 'records' may have
|
|
/// been collected, but there are more records outside the current layer. Pass
|
|
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
|
|
/// call, to collect more records.
|
|
///
|
|
#[derive(Debug)]
|
|
pub struct ValueReconstructState {
|
|
pub records: Vec<(Lsn, NeonWalRecord)>,
|
|
pub img: Option<(Lsn, Bytes)>,
|
|
}
|
|
|
|
/// Return value from Layer::get_page_reconstruct_data
|
|
#[derive(Clone, Copy, Debug)]
|
|
pub enum ValueReconstructResult {
|
|
/// Got all the data needed to reconstruct the requested page
|
|
Complete,
|
|
/// This layer didn't contain all the required data, the caller should look up
|
|
/// the predecessor layer at the returned LSN and collect more data from there.
|
|
Continue,
|
|
|
|
/// This layer didn't contain data needed to reconstruct the page version at
|
|
/// the returned LSN. This is usually considered an error, but might be OK
|
|
/// in some circumstances.
|
|
Missing,
|
|
}
|
|
|
|
#[derive(Debug)]
|
|
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
|
|
|
|
/// This struct holds two instances of [`LayerAccessStatsInner`].
|
|
/// Accesses are recorded to both instances.
|
|
/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
|
|
/// The `for_eviction_policy` is never reset.
|
|
#[derive(Debug, Default, Clone)]
|
|
struct LayerAccessStatsLocked {
|
|
for_scraping_api: LayerAccessStatsInner,
|
|
for_eviction_policy: LayerAccessStatsInner,
|
|
}
|
|
|
|
impl LayerAccessStatsLocked {
|
|
fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
|
|
[&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Default, Clone)]
|
|
struct LayerAccessStatsInner {
|
|
first_access: Option<LayerAccessStatFullDetails>,
|
|
count_by_access_kind: EnumMap<LayerAccessKind, u64>,
|
|
task_kind_flag: EnumSet<TaskKind>,
|
|
last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
|
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy)]
|
|
pub(super) struct LayerAccessStatFullDetails {
|
|
pub(super) when: SystemTime,
|
|
pub(super) task_kind: TaskKind,
|
|
pub(super) access_kind: LayerAccessKind,
|
|
}
|
|
|
|
#[derive(Clone, Copy, strum_macros::EnumString)]
|
|
pub enum LayerAccessStatsReset {
|
|
NoReset,
|
|
JustTaskKindFlags,
|
|
AllStats,
|
|
}
|
|
|
|
fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
|
|
ts.duration_since(UNIX_EPOCH)
|
|
.expect("better to die in this unlikely case than report false stats")
|
|
.as_millis()
|
|
.try_into()
|
|
.expect("64 bits is enough for few more years")
|
|
}
|
|
|
|
impl LayerAccessStatFullDetails {
|
|
fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
|
let Self {
|
|
when,
|
|
task_kind,
|
|
access_kind,
|
|
} = self;
|
|
pageserver_api::models::LayerAccessStatFullDetails {
|
|
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
|
|
task_kind: task_kind.into(), // into static str, powered by strum_macros
|
|
access_kind: *access_kind,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl LayerAccessStats {
|
|
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
|
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
|
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
|
new
|
|
}
|
|
|
|
pub(crate) fn for_new_layer_file() -> Self {
|
|
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
|
new.record_residence_event(
|
|
LayerResidenceStatus::Resident,
|
|
LayerResidenceEventReason::LayerCreate,
|
|
);
|
|
new
|
|
}
|
|
|
|
/// Creates a clone of `self` and records `new_status` in the clone.
|
|
/// The `new_status` is not recorded in `self`
|
|
pub(crate) fn clone_for_residence_change(
|
|
&self,
|
|
new_status: LayerResidenceStatus,
|
|
) -> LayerAccessStats {
|
|
let clone = {
|
|
let inner = self.0.lock().unwrap();
|
|
inner.clone()
|
|
};
|
|
let new = LayerAccessStats(Mutex::new(clone));
|
|
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
|
|
new
|
|
}
|
|
|
|
fn record_residence_event(
|
|
&self,
|
|
status: LayerResidenceStatus,
|
|
reason: LayerResidenceEventReason,
|
|
) {
|
|
let mut locked = self.0.lock().unwrap();
|
|
locked.iter_mut().for_each(|inner| {
|
|
inner
|
|
.last_residence_changes
|
|
.write(LayerResidenceEvent::new(status, reason))
|
|
});
|
|
}
|
|
|
|
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
|
|
let this_access = LayerAccessStatFullDetails {
|
|
when: SystemTime::now(),
|
|
task_kind,
|
|
access_kind,
|
|
};
|
|
|
|
let mut locked = self.0.lock().unwrap();
|
|
locked.iter_mut().for_each(|inner| {
|
|
inner.first_access.get_or_insert(this_access);
|
|
inner.count_by_access_kind[access_kind] += 1;
|
|
inner.task_kind_flag |= task_kind;
|
|
inner.last_accesses.write(this_access);
|
|
})
|
|
}
|
|
|
|
fn as_api_model(
|
|
&self,
|
|
reset: LayerAccessStatsReset,
|
|
) -> pageserver_api::models::LayerAccessStats {
|
|
let mut locked = self.0.lock().unwrap();
|
|
let inner = &mut locked.for_scraping_api;
|
|
let LayerAccessStatsInner {
|
|
first_access,
|
|
count_by_access_kind,
|
|
task_kind_flag,
|
|
last_accesses,
|
|
last_residence_changes,
|
|
} = inner;
|
|
let ret = pageserver_api::models::LayerAccessStats {
|
|
access_count_by_access_kind: count_by_access_kind
|
|
.iter()
|
|
.map(|(kind, count)| (kind, *count))
|
|
.collect(),
|
|
task_kind_access_flag: task_kind_flag
|
|
.iter()
|
|
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
|
.collect(),
|
|
first: first_access.as_ref().map(|a| a.as_api_model()),
|
|
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
|
residence_events_history: last_residence_changes.clone(),
|
|
};
|
|
match reset {
|
|
LayerAccessStatsReset::NoReset => (),
|
|
LayerAccessStatsReset::JustTaskKindFlags => {
|
|
inner.task_kind_flag.clear();
|
|
}
|
|
LayerAccessStatsReset::AllStats => {
|
|
*inner = LayerAccessStatsInner::default();
|
|
}
|
|
}
|
|
ret
|
|
}
|
|
|
|
pub(super) fn most_recent_access_or_residence_event(
|
|
&self,
|
|
) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
|
|
let locked = self.0.lock().unwrap();
|
|
let inner = &locked.for_eviction_policy;
|
|
match inner.last_accesses.recent() {
|
|
Some(a) => Either::Left(*a),
|
|
None => match inner.last_residence_changes.recent() {
|
|
Some(e) => Either::Right(e.clone()),
|
|
None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change event"),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
|
/// required by [`LayerMap`].
|
|
///
|
|
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
|
/// timeline names, because those are known in the context of which the layers
|
|
/// are used in (timeline).
|
|
pub trait Layer: std::fmt::Debug + Send + Sync {
|
|
/// Range of keys that this layer covers
|
|
fn get_key_range(&self) -> Range<Key>;
|
|
|
|
/// Inclusive start bound of the LSN range that this layer holds
|
|
/// Exclusive end bound of the LSN range that this layer holds.
|
|
///
|
|
/// - For an open in-memory layer, this is MAX_LSN.
|
|
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
|
|
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
|
fn get_lsn_range(&self) -> Range<Lsn>;
|
|
|
|
/// Does this layer only contain some data for the key-range (incremental),
|
|
/// or does it contain a version of every page? This is important to know
|
|
/// for garbage collecting old layers: an incremental layer depends on
|
|
/// the previous non-incremental layer.
|
|
fn is_incremental(&self) -> bool;
|
|
|
|
///
|
|
/// Return data needed to reconstruct given page at LSN.
|
|
///
|
|
/// It is up to the caller to collect more data from previous layer and
|
|
/// perform WAL redo, if necessary.
|
|
///
|
|
/// See PageReconstructResult for possible return values. The collected data
|
|
/// is appended to reconstruct_data; the caller should pass an empty struct
|
|
/// on first call, or a struct with a cached older image of the page if one
|
|
/// is available. If this returns ValueReconstructResult::Continue, look up
|
|
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
|
/// collect more data.
|
|
fn get_value_reconstruct_data(
|
|
&self,
|
|
key: Key,
|
|
lsn_range: Range<Lsn>,
|
|
reconstruct_data: &mut ValueReconstructState,
|
|
ctx: &RequestContext,
|
|
) -> Result<ValueReconstructResult>;
|
|
|
|
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
|
|
fn short_id(&self) -> String;
|
|
|
|
/// Dump summary of the contents of the layer to stdout
|
|
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
|
}
|
|
|
|
/// Returned by [`Layer::iter`]
|
|
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
|
|
|
|
/// Returned by [`Layer::key_iter`]
|
|
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
|
|
|
|
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
|
/// range of LSNs.
|
|
///
|
|
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
|
/// layers are used to ingest incoming WAL, and provide fast access to the
|
|
/// recent page versions. On-disk layers are stored as files on disk, and are
|
|
/// immutable. This trait presents the common functionality of in-memory and
|
|
/// on-disk layers.
|
|
///
|
|
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
|
|
/// A delta layer contains all modifications within a range of LSNs and keys.
|
|
/// An image layer is a snapshot of all the data in a key-range, at a single
|
|
/// LSN.
|
|
pub trait PersistentLayer: Layer {
|
|
fn get_tenant_id(&self) -> TenantId;
|
|
|
|
/// Identify the timeline this layer belongs to
|
|
fn get_timeline_id(&self) -> TimelineId;
|
|
|
|
/// File name used for this layer, both in the pageserver's local filesystem
|
|
/// state as well as in the remote storage.
|
|
fn filename(&self) -> LayerFileName;
|
|
|
|
// Path to the layer file in the local filesystem.
|
|
// `None` for `RemoteLayer`.
|
|
fn local_path(&self) -> Option<PathBuf>;
|
|
|
|
/// Iterate through all keys and values stored in the layer
|
|
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
|
|
|
|
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
|
|
/// It is used only for compaction and so is currently implemented only for DeltaLayer
|
|
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
|
panic!("Not implemented")
|
|
}
|
|
|
|
/// Permanently remove this layer from disk.
|
|
fn delete_resident_layer_file(&self) -> Result<()>;
|
|
|
|
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
|
None
|
|
}
|
|
|
|
fn is_remote_layer(&self) -> bool {
|
|
false
|
|
}
|
|
|
|
/// Returns None if the layer file size is not known.
|
|
///
|
|
/// Should not change over the lifetime of the layer object because
|
|
/// current_physical_size is computed as the som of this value.
|
|
fn file_size(&self) -> Option<u64>;
|
|
|
|
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
|
|
|
|
fn access_stats(&self) -> &LayerAccessStats;
|
|
}
|
|
|
|
pub fn downcast_remote_layer(
|
|
layer: &Arc<dyn PersistentLayer>,
|
|
) -> Option<std::sync::Arc<RemoteLayer>> {
|
|
if layer.is_remote_layer() {
|
|
Arc::clone(layer).downcast_remote_layer()
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Holds metadata about a layer without any content. Used mostly for testing.
|
|
///
|
|
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
|
|
/// LayerDescriptor.
|
|
#[derive(Clone, Debug)]
|
|
pub struct LayerDescriptor {
|
|
pub key: Range<Key>,
|
|
pub lsn: Range<Lsn>,
|
|
pub is_incremental: bool,
|
|
pub short_id: String,
|
|
}
|
|
|
|
impl Layer for LayerDescriptor {
|
|
fn get_key_range(&self) -> Range<Key> {
|
|
self.key.clone()
|
|
}
|
|
|
|
fn get_lsn_range(&self) -> Range<Lsn> {
|
|
self.lsn.clone()
|
|
}
|
|
|
|
fn is_incremental(&self) -> bool {
|
|
self.is_incremental
|
|
}
|
|
|
|
fn get_value_reconstruct_data(
|
|
&self,
|
|
_key: Key,
|
|
_lsn_range: Range<Lsn>,
|
|
_reconstruct_data: &mut ValueReconstructState,
|
|
_ctx: &RequestContext,
|
|
) -> Result<ValueReconstructResult> {
|
|
todo!("This method shouldn't be part of the Layer trait")
|
|
}
|
|
|
|
fn short_id(&self) -> String {
|
|
self.short_id.clone()
|
|
}
|
|
|
|
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
|
todo!()
|
|
}
|
|
}
|
|
|
|
impl From<DeltaFileName> for LayerDescriptor {
|
|
fn from(value: DeltaFileName) -> Self {
|
|
let short_id = value.to_string();
|
|
LayerDescriptor {
|
|
key: value.key_range,
|
|
lsn: value.lsn_range,
|
|
is_incremental: true,
|
|
short_id,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<ImageFileName> for LayerDescriptor {
|
|
fn from(value: ImageFileName) -> Self {
|
|
let short_id = value.to_string();
|
|
let lsn = value.lsn_as_range();
|
|
LayerDescriptor {
|
|
key: value.key_range,
|
|
lsn,
|
|
is_incremental: false,
|
|
short_id,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<LayerFileName> for LayerDescriptor {
|
|
fn from(value: LayerFileName) -> Self {
|
|
match value {
|
|
LayerFileName::Delta(d) => Self::from(d),
|
|
LayerFileName::Image(i) => Self::from(i),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Helper enum to hold a PageServerConf, or a path
|
|
///
|
|
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
|
|
/// global config, and paths to layer files are constructed using the tenant/timeline
|
|
/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
|
|
/// struct for a file on disk, without having a page server running, so that we have no
|
|
/// config. In that case, we use the Path variant to hold the full path to the file on
|
|
/// disk.
|
|
enum PathOrConf {
|
|
Path(PathBuf),
|
|
Conf(&'static PageServerConf),
|
|
}
|
|
|
|
/// Range wrapping newtype, which uses display to render Debug.
|
|
///
|
|
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
|
|
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
|
|
|
|
impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
write!(f, "{}..{}", self.0.start, self.0.end)
|
|
}
|
|
}
|