pageserver: drop the layer map lock after planning reads (#7215)

## Problem
The vectored read path holds the layer map lock while visiting a
timeline.

## Summary of changes
* Rework the fringe order to hold `Layer` on `Arc<InMemoryLayer>`
handles instead of descriptions that are resolved by the layer map at
the time of read. Note that previously `get_values_reconstruct_data` was
implemented for the layer description which already knew the lsn range
for the read. Now it is implemented on the new `ReadableLayer` handle
and needs to get the lsn range as an argument.
* Drop the layer map lock after updating the fringe.

Related https://github.com/neondatabase/neon/issues/6833
This commit is contained in:
Vlad Lazar
2024-04-02 17:16:15 +01:00
committed by GitHub
parent a5777bab09
commit 9957c6a9a0
5 changed files with 125 additions and 150 deletions

View File

@@ -72,6 +72,10 @@ impl EphemeralFile {
self.len
}
pub(crate) fn id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn read_blk(
&self,
blknum: u32,

View File

@@ -346,35 +346,6 @@ where
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub enum InMemoryLayerHandle {
Open {
lsn_floor: Lsn,
end_lsn: Lsn,
},
Frozen {
idx: usize,
lsn_floor: Lsn,
end_lsn: Lsn,
},
}
impl InMemoryLayerHandle {
pub fn get_lsn_floor(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
}
}
pub fn get_end_lsn(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
}
}
}
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -576,41 +547,18 @@ impl LayerMap {
self.historic.iter()
}
/// Get a handle for the first in memory layer that matches the provided predicate.
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
///
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
/// the same exclusive region established by holding the layer manager lock.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
where
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
{
if let Some(open) = &self.open_layer {
if pred(open) {
return Some(InMemoryLayerHandle::Open {
lsn_floor: open.get_lsn_range().start,
end_lsn: open.get_lsn_range().end,
});
return Some(open.clone());
}
}
let pos = self.frozen_layers.iter().rev().position(pred);
pos.map(|rev_idx| {
let idx = self.frozen_layers.len() - 1 - rev_idx;
InMemoryLayerHandle::Frozen {
idx,
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
}
})
}
/// Get the layer pointed to by the provided handle.
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
match handle {
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
}
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
}
///

View File

@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
use super::layer_map::InMemoryLayerHandle;
use super::timeline::layer_manager::LayerManager;
use self::inmemory_layer::InMemoryLayerFileId;
use super::timeline::GetVectoredError;
use super::PageReconstructError;
@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
}
}
/// Description of layer to be read - the layer map can turn
/// this description into the actual layer.
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub(crate) enum ReadableLayerDesc {
Persistent {
desc: PersistentLayerDesc,
lsn_range: Range<Lsn>,
},
InMemory {
handle: InMemoryLayerHandle,
lsn_ceil: Lsn,
},
/// A key that uniquely identifies a layer in a timeline
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub(crate) enum LayerId {
PersitentLayerId(PersistentLayerKey),
InMemoryLayerId(InMemoryLayerFileId),
}
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
#[derive(Debug)]
struct ReadableLayerDescOrdered(ReadableLayerDesc);
pub(crate) enum ReadableLayer {
PersistentLayer(Layer),
InMemoryLayer(Arc<InMemoryLayer>),
}
/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct ReadDesc {
/// An id used to resolve the readable layer within the fringe
layer_id: LayerId,
/// Lsn range for the read, used for selecting the next read
lsn_range: Range<Lsn>,
}
/// Data structure which maintains a fringe of layers for the
/// read path. The fringe is the set of layers which intersects
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
layers: HashMap<ReadableLayerDesc, KeySpace>,
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
layers: HashMap<LayerId, LayerKeyspace>,
}
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpace,
}
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
layers_by_lsn: BinaryHeap::new(),
planned_reads_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
let handle = match self.layers_by_lsn.pop() {
Some(h) => h,
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
let read_desc = match self.planned_reads_by_lsn.pop() {
Some(desc) => desc,
None => return None,
};
let removed = self.layers.remove_entry(&handle.0);
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((layer, keyspace)) => Some((layer, keyspace)),
Some((
_,
LayerKeyspace {
layer,
target_keyspace,
},
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
None => unreachable!("fringe internals are always consistent"),
}
}
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
let entry = self.layers.entry(layer.clone());
pub(crate) fn update(
&mut self,
layer: ReadableLayer,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
) {
let layer_id = layer.id();
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().merge(&keyspace);
entry.get_mut().target_keyspace.merge(&keyspace);
}
Entry::Vacant(entry) => {
self.layers_by_lsn
.push(ReadableLayerDescOrdered(entry.key().clone()));
entry.insert(keyspace);
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range,
layer_id: layer_id.clone(),
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: keyspace,
});
}
}
}
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
}
}
impl Ord for ReadableLayerDescOrdered {
impl Ord for ReadDesc {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
if ord == std::cmp::Ordering::Equal {
self.0
.get_lsn_floor()
.cmp(&other.0.get_lsn_floor())
.reverse()
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
} else {
ord
}
}
}
impl PartialOrd for ReadableLayerDescOrdered {
impl PartialOrd for ReadDesc {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ReadableLayerDescOrdered {
impl PartialEq for ReadDesc {
fn eq(&self, other: &Self) -> bool {
self.0.get_lsn_floor() == other.0.get_lsn_floor()
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
self.lsn_range == other.lsn_range
}
}
impl Eq for ReadableLayerDescOrdered {}
impl Eq for ReadDesc {}
impl ReadableLayerDesc {
pub(crate) fn get_lsn_floor(&self) -> Lsn {
impl ReadableLayer {
pub(crate) fn id(&self) -> LayerId {
match self {
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
}
}
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
match self {
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
}
}
pub(crate) async fn get_values_reconstruct_data(
&self,
layer_manager: &LayerManager,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
match self {
ReadableLayerDesc::Persistent { desc, lsn_range } => {
let layer = layer_manager.get_from_desc(desc);
ReadableLayer::PersistentLayer(layer) => {
layer
.get_values_reconstruct_data(
keyspace,
lsn_range.clone(),
reconstruct_state,
ctx,
)
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
.await
}
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
let layer = layer_manager
.layer_map()
.get_in_memory_layer(handle)
.unwrap();
ReadableLayer::InMemoryLayer(layer) => {
layer
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
.await
}
}

View File

@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::walrecord;
use crate::{page_cache, walrecord};
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
@@ -36,10 +36,14 @@ use super::{
ValuesReconstructState,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
file_id: InMemoryLayerFileId,
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive.
@@ -200,6 +204,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
};
impl InMemoryLayer {
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
self.file_id
}
pub(crate) fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
@@ -443,8 +451,10 @@ impl InMemoryLayer {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.id());
Ok(InMemoryLayer {
file_id: key,
conf,
timeline_id,
tenant_shard_id,

View File

@@ -118,11 +118,11 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::remote_timeline_client::RemoteTimelineClient;
use super::config::TenantConf;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -2905,16 +2905,6 @@ impl Timeline {
let mut completed_keyspace = KeySpace::default();
// Hold the layer map whilst visiting the timeline to prevent
// compaction, eviction and flushes from rendering the layers unreadable.
//
// TODO: Do we actually need to do this? In theory holding on
// to [`tenant::storage_layer::Layer`] should be enough. However,
// [`Timeline::get`] also holds the lock during IO, so more investigation
// is needed.
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
loop {
if cancel.is_cancelled() {
return Err(GetVectoredError::Cancelled);
@@ -2924,6 +2914,9 @@ impl Timeline {
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
completed_keyspace.merge(&keys_done_last_step);
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
@@ -2931,12 +2924,11 @@ impl Timeline {
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayerDesc::InMemory {
handle: l,
lsn_ceil: cont_lsn,
},
ReadableLayer::InMemoryLayer(l),
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
@@ -2948,30 +2940,43 @@ impl Timeline {
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayerDesc::Persistent {
desc: (*layer).clone(),
lsn_range: lsn_floor..cont_lsn,
},
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
}
}
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
// It's safe to drop the layer map lock after planning the next round of reads.
// The fringe keeps readable handles for the layers which are safe to read even
// if layers were compacted or flushed.
//
// The more interesting consideration is: "Why is the read algorithm still correct
// if the layer map changes while it is operating?". Doing a vectored read on a
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
// covered by the read. The layer map tells us how to move the lsn downwards for a
// range at *a particular point in time*. It is fine for the answer to be different
// at two different time points.
drop(guard);
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
let next_cont_lsn = lsn_range.start;
layer_to_read
.get_values_reconstruct_data(
&guard,
keyspace_to_read.clone(),
lsn_range,
reconstruct_state,
ctx,
)
.await?;
unmapped_keyspace = keyspace_to_read;
cont_lsn = layer_to_read.get_lsn_floor();
cont_lsn = next_cont_lsn;
} else {
break;
}