Files
neon/pageserver/src/tenant/storage_layer.rs
Alex Chi Z 6810d2aa53 feat(pageserver): do not read past image layers for vectored get (#7773)
## Problem

Part of https://github.com/neondatabase/neon/issues/7462

On metadata keyspace, vectored get will not stop if a key is not found,
and will read past the image layer. However, the semantics is different
from single get, because if a key does not exist in the image layer, it
means that the key does not exist in the past, or have been deleted.
This pull request fixed it by recording image layer coverage during the
vectored get process and stop when the full keyspace is covered by an
image layer. A corresponding test case is added to ensure generating
image layer reduces the number of delta layers.

This optimization (or bug fix) also applies to rel block keyspaces. If a
key is missing, we can know it's missing once the first image layer is
reached. Page server will not attempt to read lower layers, which
potentially incurs layer downloads + evictions.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-20 14:24:18 -04:00

726 lines
25 KiB
Rust

//! Common traits and structs for layers
pub mod delta_layer;
pub mod image_layer;
pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use enum_map::EnumMap;
use enumset::EnumSet;
use once_cell::sync::Lazy;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::models::{
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::borrow::Cow;
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
use utils::rate_limit::RateLimit;
use utils::{id::TimelineId, lsn::Lsn};
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
use self::inmemory_layer::InMemoryLayerFileId;
use super::timeline::GetVectoredError;
use super::PageReconstructError;
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
{
if a.start < b.start {
a.end > b.start
} else {
b.end > a.start
}
}
/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
///
/// Before first call, you can fill in 'page_img' if you have an older cached
/// version of the page available. That can save work in
/// 'get_value_reconstruct_data', as it can stop searching for page versions
/// when all the WAL records going back to the cached image have been collected.
///
/// When get_value_reconstruct_data returns Complete, 'img' is set to an image
/// of the page, or the oldest WAL record in 'records' is a will_init-type
/// record that initializes the page without requiring a previous image.
///
/// If 'get_page_reconstruct_data' returns Continue, some 'records' may have
/// been collected, but there are more records outside the current layer. Pass
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug, Default)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) enum ValueReconstructSituation {
Complete,
#[default]
Continue,
}
/// Reconstruct data accumulated for a single key during a vectored get
#[derive(Debug, Default, Clone)]
pub(crate) struct VectoredValueReconstructState {
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
pub(crate) img: Option<(Lsn, Bytes)>,
situation: ValueReconstructSituation,
}
impl VectoredValueReconstructState {
fn get_cached_lsn(&self) -> Option<Lsn> {
self.img.as_ref().map(|img| img.0)
}
}
impl From<VectoredValueReconstructState> for ValueReconstructState {
fn from(mut state: VectoredValueReconstructState) -> Self {
// walredo expects the records to be descending in terms of Lsn
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
ValueReconstructState {
records: state.records,
img: state.img,
}
}
}
/// Bag of data accumulated during a vectored get..
pub(crate) struct ValuesReconstructState {
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
/// should not expect to get anything from this hashmap.
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
/// The keys which are already retrieved
keys_done: KeySpaceRandomAccum,
/// The keys covered by the image layers
keys_with_image_coverage: Option<Range<Key>>,
// Statistics that are still accessible as a caller of `get_vectored_impl`.
layers_visited: u32,
delta_layers_visited: u32,
}
impl ValuesReconstructState {
pub(crate) fn new() -> Self {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),
keys_with_image_coverage: None,
layers_visited: 0,
delta_layers_visited: 0,
}
}
/// Associate a key with the error which it encountered and mark it as done
pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
let previous = self.keys.insert(key, Err(err));
if let Some(Ok(state)) = previous {
if state.situation == ValueReconstructSituation::Continue {
self.keys_done.add_key(key);
}
}
}
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
self.layers_visited += 1;
if let ReadableLayer::PersistentLayer(layer) = layer {
if layer.layer_desc().is_delta() {
self.delta_layers_visited += 1;
}
}
}
pub(crate) fn get_delta_layers_visited(&self) -> u32 {
self.delta_layers_visited
}
pub(crate) fn get_layers_visited(&self) -> u32 {
self.layers_visited
}
/// This function is called after reading a keyspace from a layer.
/// It checks if the read path has now moved past the cached Lsn for any keys.
///
/// Implementation note: We intentionally iterate over the keys for which we've
/// already collected some reconstruct data. This avoids scaling complexity with
/// the size of the search space.
pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
for (key, value) in self.keys.iter_mut() {
if !keyspace.contains(key) {
continue;
}
if let Ok(state) = value {
if state.situation != ValueReconstructSituation::Complete
&& state.get_cached_lsn() >= Some(advanced_to)
{
state.situation = ValueReconstructSituation::Complete;
self.keys_done.add_key(*key);
}
}
}
}
/// On hitting image layer, we can mark all keys in this range as done, because
/// if the image layer does not contain a key, it is deleted/never added.
pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
assert_eq!(
prev_val, None,
"should consume the keyspace before the next iteration"
);
}
/// Update the state collected for a given key.
/// Returns true if this was the last value needed for the key and false otherwise.
///
/// If the key is done after the update, mark it as such.
pub(crate) fn update_key(
&mut self,
key: &Key,
lsn: Lsn,
value: Value,
) -> ValueReconstructSituation {
let state = self
.keys
.entry(*key)
.or_insert(Ok(VectoredValueReconstructState::default()));
if let Ok(state) = state {
let key_done = match state.situation {
ValueReconstructSituation::Complete => unreachable!(),
ValueReconstructSituation::Continue => match value {
Value::Image(img) => {
state.img = Some((lsn, img));
true
}
Value::WalRecord(rec) => {
debug_assert!(
Some(lsn) > state.get_cached_lsn(),
"Attempt to collect a record below cached LSN for walredo: {} < {}",
lsn,
state
.get_cached_lsn()
.expect("Assertion can only fire if a cached lsn is present")
);
let will_init = rec.will_init();
state.records.push((lsn, rec));
will_init
}
},
};
if key_done && state.situation == ValueReconstructSituation::Continue {
state.situation = ValueReconstructSituation::Complete;
self.keys_done.add_key(*key);
}
state.situation
} else {
ValueReconstructSituation::Complete
}
}
/// Returns the Lsn at which this key is cached if one exists.
/// The read path should go no further than this Lsn for the given key.
pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
self.keys
.get(key)
.and_then(|k| k.as_ref().ok())
.and_then(|state| state.get_cached_lsn())
}
/// Returns the key space describing the keys that have
/// been marked as completed since the last call to this function.
/// Returns individual keys done, and the image layer coverage.
pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
(
self.keys_done.consume_keyspace(),
self.keys_with_image_coverage.take(),
)
}
}
impl Default for ValuesReconstructState {
fn default() -> Self {
Self::new()
}
}
/// A key that uniquely identifies a layer in a timeline
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub(crate) enum LayerId {
PersitentLayerId(PersistentLayerKey),
InMemoryLayerId(InMemoryLayerFileId),
}
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
#[derive(Debug)]
pub(crate) enum ReadableLayer {
PersistentLayer(Layer),
InMemoryLayer(Arc<InMemoryLayer>),
}
/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct ReadDesc {
/// An id used to resolve the readable layer within the fringe
layer_id: LayerId,
/// Lsn range for the read, used for selecting the next read
lsn_range: Range<Lsn>,
}
/// Data structure which maintains a fringe of layers for the
/// read path. The fringe is the set of layers which intersects
/// the current keyspace that the search is descending on.
/// Each layer tracks the keyspace that intersects it.
///
/// The fringe must appear sorted by Lsn. Hence, it uses
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
layers: HashMap<LayerId, LayerKeyspace>,
}
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpace,
}
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
planned_reads_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
let read_desc = match self.planned_reads_by_lsn.pop() {
Some(desc) => desc,
None => return None,
};
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((
_,
LayerKeyspace {
layer,
target_keyspace,
},
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
None => unreachable!("fringe internals are always consistent"),
}
}
pub(crate) fn update(
&mut self,
layer: ReadableLayer,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
) {
let layer_id = layer.id();
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.merge(&keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range,
layer_id: layer_id.clone(),
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: keyspace,
});
}
}
}
}
impl Default for LayerFringe {
fn default() -> Self {
Self::new()
}
}
impl Ord for ReadDesc {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
if ord == std::cmp::Ordering::Equal {
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
} else {
ord
}
}
}
impl PartialOrd for ReadDesc {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ReadDesc {
fn eq(&self, other: &Self) -> bool {
self.lsn_range == other.lsn_range
}
}
impl Eq for ReadDesc {}
impl ReadableLayer {
pub(crate) fn id(&self) -> LayerId {
match self {
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
}
}
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
match self {
ReadableLayer::PersistentLayer(layer) => {
layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
.await
}
ReadableLayer::InMemoryLayer(layer) => {
layer
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
.await
}
}
}
}
/// Return value from [`Layer::get_value_reconstruct_data`]
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue,
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing,
}
#[derive(Debug)]
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
/// This struct holds two instances of [`LayerAccessStatsInner`].
/// Accesses are recorded to both instances.
/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
/// The `for_eviction_policy` is never reset.
#[derive(Debug, Default, Clone)]
struct LayerAccessStatsLocked {
for_scraping_api: LayerAccessStatsInner,
for_eviction_policy: LayerAccessStatsInner,
}
impl LayerAccessStatsLocked {
fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
[&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
}
}
#[derive(Debug, Default, Clone)]
struct LayerAccessStatsInner {
first_access: Option<LayerAccessStatFullDetails>,
count_by_access_kind: EnumMap<LayerAccessKind, u64>,
task_kind_flag: EnumSet<TaskKind>,
last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
}
#[derive(Debug, Clone, Copy)]
pub(crate) struct LayerAccessStatFullDetails {
pub(crate) when: SystemTime,
pub(crate) task_kind: TaskKind,
pub(crate) access_kind: LayerAccessKind,
}
#[derive(Clone, Copy, strum_macros::EnumString)]
pub enum LayerAccessStatsReset {
NoReset,
JustTaskKindFlags,
AllStats,
}
fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
ts.duration_since(UNIX_EPOCH)
.expect("better to die in this unlikely case than report false stats")
.as_millis()
.try_into()
.expect("64 bits is enough for few more years")
}
impl LayerAccessStatFullDetails {
fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
let Self {
when,
task_kind,
access_kind,
} = self;
pageserver_api::models::LayerAccessStatFullDetails {
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
access_kind: *access_kind,
}
}
}
impl LayerAccessStats {
/// Create an empty stats object.
///
/// The caller is responsible for recording a residence event
/// using [`record_residence_event`] before calling `latest_activity`.
/// If they don't, [`latest_activity`] will return `None`.
///
/// [`record_residence_event`]: Self::record_residence_event
/// [`latest_activity`]: Self::latest_activity
pub(crate) fn empty_will_record_residence_event_later() -> Self {
LayerAccessStats(Mutex::default())
}
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
///
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
new
}
/// Record a change in layer residency.
///
/// Recording the event must happen while holding the layer map lock to
/// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs)
/// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`.
///
/// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock,
/// the following race could happen:
///
/// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp.
/// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
///
pub(crate) fn record_residence_event(
&self,
status: LayerResidenceStatus,
reason: LayerResidenceEventReason,
) {
let mut locked = self.0.lock().unwrap();
locked.iter_mut().for_each(|inner| {
inner
.last_residence_changes
.write(LayerResidenceEvent::new(status, reason))
});
}
fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
return;
}
let this_access = LayerAccessStatFullDetails {
when: SystemTime::now(),
task_kind: ctx.task_kind(),
access_kind,
};
let mut locked = self.0.lock().unwrap();
locked.iter_mut().for_each(|inner| {
inner.first_access.get_or_insert(this_access);
inner.count_by_access_kind[access_kind] += 1;
inner.task_kind_flag |= ctx.task_kind();
inner.last_accesses.write(this_access);
})
}
fn as_api_model(
&self,
reset: LayerAccessStatsReset,
) -> pageserver_api::models::LayerAccessStats {
let mut locked = self.0.lock().unwrap();
let inner = &mut locked.for_scraping_api;
let LayerAccessStatsInner {
first_access,
count_by_access_kind,
task_kind_flag,
last_accesses,
last_residence_changes,
} = inner;
let ret = pageserver_api::models::LayerAccessStats {
access_count_by_access_kind: count_by_access_kind
.iter()
.map(|(kind, count)| (kind, *count))
.collect(),
task_kind_access_flag: task_kind_flag
.iter()
.map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
.collect(),
first: first_access.as_ref().map(|a| a.as_api_model()),
accesses_history: last_accesses.map(|m| m.as_api_model()),
residence_events_history: last_residence_changes.clone(),
};
match reset {
LayerAccessStatsReset::NoReset => (),
LayerAccessStatsReset::JustTaskKindFlags => {
inner.task_kind_flag.clear();
}
LayerAccessStatsReset::AllStats => {
*inner = LayerAccessStatsInner::default();
}
}
ret
}
/// Get the latest access timestamp, falling back to latest residence event, further falling
/// back to `SystemTime::now` for a usable timestamp for eviction.
pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
self.latest_activity().unwrap_or_else(SystemTime::now)
}
/// Get the latest access timestamp, falling back to latest residence event.
///
/// This function can only return `None` if there has not yet been a call to the
/// [`record_residence_event`] method. That would generally be considered an
/// implementation error. This function logs a rate-limited warning in that case.
///
/// TODO: use type system to avoid the need for `fallback`.
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
/// could be used to enforce that a residence event is recorded
/// before a layer is added to the layer map. We could also have
/// a layer wrapper type that holds the LayerAccessStats, and ensure
/// that that type can only be produced by inserting into the layer map.
///
/// [`record_residence_event`]: Self::record_residence_event
fn latest_activity(&self) -> Option<SystemTime> {
let locked = self.0.lock().unwrap();
let inner = &locked.for_eviction_policy;
match inner.last_accesses.recent() {
Some(a) => Some(a.when),
None => match inner.last_residence_changes.recent() {
Some(e) => Some(e.timestamp),
None => {
static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
let mut guard = WARN_RATE_LIMIT.lock().unwrap();
guard.0 += 1;
let occurences = guard.0;
guard.1.call(move || {
warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
});
None
}
},
}
}
}
/// Get a layer descriptor from a layer.
pub trait AsLayerDesc {
/// Get the layer descriptor.
fn layer_desc(&self) -> &PersistentLayerDesc;
}
pub mod tests {
use pageserver_api::shard::TenantShardId;
use super::*;
impl From<DeltaLayerName> for PersistentLayerDesc {
fn from(value: DeltaLayerName) -> Self {
PersistentLayerDesc::new_delta(
TenantShardId::from([0; 18]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn_range,
233,
)
}
}
impl From<ImageLayerName> for PersistentLayerDesc {
fn from(value: ImageLayerName) -> Self {
PersistentLayerDesc::new_img(
TenantShardId::from([0; 18]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn,
233,
)
}
}
impl From<LayerName> for PersistentLayerDesc {
fn from(value: LayerName) -> Self {
match value {
LayerName::Delta(d) => Self::from(d),
LayerName::Image(i) => Self::from(i),
}
}
}
}
/// Range wrapping newtype, which uses display to render Debug.
///
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}..{}", self.0.start, self.0.end)
}
}