mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 03:50:37 +00:00
pageserver: drop the layer map lock after planning reads (#7215)
## Problem The vectored read path holds the layer map lock while visiting a timeline. ## Summary of changes * Rework the fringe order to hold `Layer` on `Arc<InMemoryLayer>` handles instead of descriptions that are resolved by the layer map at the time of read. Note that previously `get_values_reconstruct_data` was implemented for the layer description which already knew the lsn range for the read. Now it is implemented on the new `ReadableLayer` handle and needs to get the lsn range as an argument. * Drop the layer map lock after updating the fringe. Related https://github.com/neondatabase/neon/issues/6833
This commit is contained in:
@@ -72,6 +72,10 @@ impl EphemeralFile {
|
||||
self.len
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> page_cache::FileId {
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
|
||||
@@ -346,35 +346,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub enum InMemoryLayerHandle {
|
||||
Open {
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
Frozen {
|
||||
idx: usize,
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
impl InMemoryLayerHandle {
|
||||
pub fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
|
||||
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_end_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
|
||||
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
@@ -576,41 +547,18 @@ impl LayerMap {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
/// Get a handle for the first in memory layer that matches the provided predicate.
|
||||
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
|
||||
///
|
||||
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
|
||||
/// the same exclusive region established by holding the layer manager lock.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
|
||||
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
|
||||
where
|
||||
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
|
||||
{
|
||||
if let Some(open) = &self.open_layer {
|
||||
if pred(open) {
|
||||
return Some(InMemoryLayerHandle::Open {
|
||||
lsn_floor: open.get_lsn_range().start,
|
||||
end_lsn: open.get_lsn_range().end,
|
||||
});
|
||||
return Some(open.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let pos = self.frozen_layers.iter().rev().position(pred);
|
||||
pos.map(|rev_idx| {
|
||||
let idx = self.frozen_layers.len() - 1 - rev_idx;
|
||||
InMemoryLayerHandle::Frozen {
|
||||
idx,
|
||||
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
|
||||
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the layer pointed to by the provided handle.
|
||||
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
|
||||
match handle {
|
||||
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
|
||||
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
|
||||
}
|
||||
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tracing::warn;
|
||||
use utils::history_buffer::HistoryBufferWithDropCounter;
|
||||
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
|
||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use super::layer_map::InMemoryLayerHandle;
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::PageReconstructError;
|
||||
|
||||
@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Description of layer to be read - the layer map can turn
|
||||
/// this description into the actual layer.
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub(crate) enum ReadableLayerDesc {
|
||||
Persistent {
|
||||
desc: PersistentLayerDesc,
|
||||
lsn_range: Range<Lsn>,
|
||||
},
|
||||
InMemory {
|
||||
handle: InMemoryLayerHandle,
|
||||
lsn_ceil: Lsn,
|
||||
},
|
||||
/// A key that uniquely identifies a layer in a timeline
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub(crate) enum LayerId {
|
||||
PersitentLayerId(PersistentLayerKey),
|
||||
InMemoryLayerId(InMemoryLayerFileId),
|
||||
}
|
||||
|
||||
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
|
||||
/// Layer wrapper for the read path. Note that it is valid
|
||||
/// to use these layers even after external operations have
|
||||
/// been performed on them (compaction, freeze, etc.).
|
||||
#[derive(Debug)]
|
||||
struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
||||
pub(crate) enum ReadableLayer {
|
||||
PersistentLayer(Layer),
|
||||
InMemoryLayer(Arc<InMemoryLayer>),
|
||||
}
|
||||
|
||||
/// A partial description of a read to be done.
|
||||
#[derive(Debug, Clone)]
|
||||
struct ReadDesc {
|
||||
/// An id used to resolve the readable layer within the fringe
|
||||
layer_id: LayerId,
|
||||
/// Lsn range for the read, used for selecting the next read
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
/// Data structure which maintains a fringe of layers for the
|
||||
/// read path. The fringe is the set of layers which intersects
|
||||
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
||||
/// a two layer indexing scheme.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LayerFringe {
|
||||
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
|
||||
layers: HashMap<ReadableLayerDesc, KeySpace>,
|
||||
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
|
||||
layers: HashMap<LayerId, LayerKeyspace>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
layer: ReadableLayer,
|
||||
target_keyspace: KeySpace,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
pub(crate) fn new() -> Self {
|
||||
LayerFringe {
|
||||
layers_by_lsn: BinaryHeap::new(),
|
||||
planned_reads_by_lsn: BinaryHeap::new(),
|
||||
layers: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
|
||||
let handle = match self.layers_by_lsn.pop() {
|
||||
Some(h) => h,
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
|
||||
let read_desc = match self.planned_reads_by_lsn.pop() {
|
||||
Some(desc) => desc,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&handle.0);
|
||||
let removed = self.layers.remove_entry(&read_desc.layer_id);
|
||||
match removed {
|
||||
Some((layer, keyspace)) => Some((layer, keyspace)),
|
||||
Some((
|
||||
_,
|
||||
LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace,
|
||||
},
|
||||
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
|
||||
let entry = self.layers.entry(layer.clone());
|
||||
pub(crate) fn update(
|
||||
&mut self,
|
||||
layer: ReadableLayer,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
) {
|
||||
let layer_id = layer.id();
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().merge(&keyspace);
|
||||
entry.get_mut().target_keyspace.merge(&keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.layers_by_lsn
|
||||
.push(ReadableLayerDescOrdered(entry.key().clone()));
|
||||
entry.insert(keyspace);
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range,
|
||||
layer_id: layer_id.clone(),
|
||||
});
|
||||
entry.insert(LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace: keyspace,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ReadableLayerDescOrdered {
|
||||
impl Ord for ReadDesc {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
|
||||
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
|
||||
if ord == std::cmp::Ordering::Equal {
|
||||
self.0
|
||||
.get_lsn_floor()
|
||||
.cmp(&other.0.get_lsn_floor())
|
||||
.reverse()
|
||||
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
|
||||
} else {
|
||||
ord
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ReadableLayerDescOrdered {
|
||||
impl PartialOrd for ReadDesc {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ReadableLayerDescOrdered {
|
||||
impl PartialEq for ReadDesc {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.get_lsn_floor() == other.0.get_lsn_floor()
|
||||
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
|
||||
self.lsn_range == other.lsn_range
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ReadableLayerDescOrdered {}
|
||||
impl Eq for ReadDesc {}
|
||||
|
||||
impl ReadableLayerDesc {
|
||||
pub(crate) fn get_lsn_floor(&self) -> Lsn {
|
||||
impl ReadableLayer {
|
||||
pub(crate) fn id(&self) -> LayerId {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
|
||||
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
|
||||
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
|
||||
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
|
||||
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
layer_manager: &LayerManager,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { desc, lsn_range } => {
|
||||
let layer = layer_manager.get_from_desc(desc);
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
lsn_range.clone(),
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
|
||||
let layer = layer_manager
|
||||
.layer_map()
|
||||
.get_in_memory_layer(handle)
|
||||
.unwrap();
|
||||
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
||||
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::walrecord;
|
||||
use crate::{page_cache, walrecord};
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
@@ -36,10 +36,14 @@ use super::{
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
file_id: InMemoryLayerFileId,
|
||||
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
/// start is inclusive.
|
||||
@@ -200,6 +204,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
|
||||
};
|
||||
|
||||
impl InMemoryLayer {
|
||||
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
|
||||
self.file_id
|
||||
}
|
||||
|
||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
@@ -443,8 +451,10 @@ impl InMemoryLayer {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
||||
let key = InMemoryLayerFileId(file.id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
|
||||
@@ -118,11 +118,11 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::config::TenantConf;
|
||||
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
||||
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub(super) enum FlushLoopState {
|
||||
@@ -2905,16 +2905,6 @@ impl Timeline {
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
|
||||
// Hold the layer map whilst visiting the timeline to prevent
|
||||
// compaction, eviction and flushes from rendering the layers unreadable.
|
||||
//
|
||||
// TODO: Do we actually need to do this? In theory holding on
|
||||
// to [`tenant::storage_layer::Layer`] should be enough. However,
|
||||
// [`Timeline::get`] also holds the lock during IO, so more investigation
|
||||
// is needed.
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
loop {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
@@ -2924,6 +2914,9 @@ impl Timeline {
|
||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||
completed_keyspace.merge(&keys_done_last_step);
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
let in_memory_layer = layers.find_in_memory_layer(|l| {
|
||||
let start_lsn = l.get_lsn_range().start;
|
||||
cont_lsn > start_lsn
|
||||
@@ -2931,12 +2924,11 @@ impl Timeline {
|
||||
|
||||
match in_memory_layer {
|
||||
Some(l) => {
|
||||
let lsn_range = l.get_lsn_range().start..cont_lsn;
|
||||
fringe.update(
|
||||
ReadableLayerDesc::InMemory {
|
||||
handle: l,
|
||||
lsn_ceil: cont_lsn,
|
||||
},
|
||||
ReadableLayer::InMemoryLayer(l),
|
||||
unmapped_keyspace.clone(),
|
||||
lsn_range,
|
||||
);
|
||||
}
|
||||
None => {
|
||||
@@ -2948,30 +2940,43 @@ impl Timeline {
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
ReadableLayerDesc::Persistent {
|
||||
desc: (*layer).clone(),
|
||||
lsn_range: lsn_floor..cont_lsn,
|
||||
},
|
||||
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
// The fringe keeps readable handles for the layers which are safe to read even
|
||||
// if layers were compacted or flushed.
|
||||
//
|
||||
// The more interesting consideration is: "Why is the read algorithm still correct
|
||||
// if the layer map changes while it is operating?". Doing a vectored read on a
|
||||
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
|
||||
// covered by the read. The layer map tells us how to move the lsn downwards for a
|
||||
// range at *a particular point in time*. It is fine for the answer to be different
|
||||
// at two different time points.
|
||||
drop(guard);
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
&guard,
|
||||
keyspace_to_read.clone(),
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = layer_to_read.get_lsn_floor();
|
||||
cont_lsn = next_cont_lsn;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user