//! An in-memory layer stores recently received key-value pairs. //! //! The "in-memory" part of the name is a bit misleading: the actual page versions are //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::{Key, Value}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::BlockReader; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; use crate::walrecord; use anyhow::{ensure, Result}; use pageserver_api::models::InMemoryLayerInfo; use std::cell::RefCell; use std::collections::HashMap; use tracing::*; use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, lsn::Lsn, vec_map::VecMap, }; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; use std::sync::RwLock; use super::{DeltaLayer, DeltaLayerWriter, Layer}; thread_local! { /// A buffer for serializing object during [`InMemoryLayer::put_value`]. /// This buffer is reused for each serialization to avoid additional malloc calls. static SER_BUFFER: RefCell> = RefCell::new(Vec::new()); } pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_id: TenantId, timeline_id: TimelineId, /// /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. /// start_lsn: Lsn, /// The above fields never change. The parts that do change are in 'inner', /// and protected by mutex. inner: RwLock, } impl std::fmt::Debug for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayer") .field("start_lsn", &self.start_lsn) .field("inner", &self.inner) .finish() } } pub struct InMemoryLayerInner { /// Frozen layers have an exclusive end LSN. /// Writes are only allowed when this is None end_lsn: Option, /// /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. /// index: HashMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, } impl std::fmt::Debug for InMemoryLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayerInner") .field("end_lsn", &self.end_lsn) .finish() } } impl InMemoryLayerInner { fn assert_writeable(&self) { assert!(self.end_lsn.is_none()); } } impl InMemoryLayer { pub fn get_timeline_id(&self) -> TimelineId { self.timeline_id } pub fn info(&self) -> InMemoryLayerInfo { let lsn_start = self.start_lsn; let lsn_end = self.inner.read().unwrap().end_lsn; match lsn_end { Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end }, None => InMemoryLayerInfo::Open { lsn_start }, } } } #[async_trait::async_trait] impl Layer for InMemoryLayer { fn get_key_range(&self) -> Range { Key::MIN..Key::MAX } fn get_lsn_range(&self) -> Range { let inner = self.inner.read().unwrap(); let end_lsn = if let Some(end_lsn) = inner.end_lsn { end_lsn } else { Lsn(u64::MAX) }; self.start_lsn..end_lsn } fn is_incremental(&self) -> bool { // in-memory layer is always considered incremental. true } /// debugging function to print out the contents of the layer async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> { let inner = self.inner.read().unwrap(); let end_str = inner .end_lsn .as_ref() .map(Lsn::to_string) .unwrap_or_default(); println!( "----- in-memory layer for tli {} LSNs {}-{} ----", self.timeline_id, self.start_lsn, end_str, ); if !verbose { return Ok(()); } let cursor = inner.file.block_cursor(); let mut buf = Vec::new(); for (key, vec_map) in inner.index.iter() { for (lsn, pos) in vec_map.as_slice() { let mut desc = String::new(); cursor.read_blob_into_buf(*pos, &mut buf)?; let val = Value::des(&buf); match val { Ok(Value::Image(img)) => { write!(&mut desc, " img {} bytes", img.len())?; } Ok(Value::WalRecord(rec)) => { let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); write!( &mut desc, " rec {} bytes will_init: {} {}", buf.len(), rec.will_init(), wal_desc )?; } Err(err) => { write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; } } println!(" key {} at {}: {}", key, lsn, desc); } } Ok(()) } /// Look up given value in the layer. async fn get_value_reconstruct_data( &self, key: Key, lsn_range: Range, reconstruct_state: &mut ValueReconstructState, _ctx: &RequestContext, ) -> anyhow::Result { ensure!(lsn_range.start >= self.start_lsn); let mut need_image = true; let inner = self.inner.read().unwrap(); let reader = inner.file.block_cursor(); // Scan the page versions backwards, starting from `lsn`. if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { let buf = reader.read_blob(*pos)?; let value = Value::des(&buf)?; match value { Value::Image(img) => { reconstruct_state.img = Some((*entry_lsn, img)); return Ok(ValueReconstructResult::Complete); } Value::WalRecord(rec) => { let will_init = rec.will_init(); reconstruct_state.records.push((*entry_lsn, rec)); if will_init { // This WAL record initializes the page, so no need to go further back need_image = false; break; } } } } } // release lock on 'inner' // If an older page image is needed to reconstruct the page, let the // caller know. if need_image { Ok(ValueReconstructResult::Continue) } else { Ok(ValueReconstructResult::Complete) } } } impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let inner = self.inner.read().unwrap(); let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) } } impl InMemoryLayer { /// /// Get layer size on the disk /// pub fn size(&self) -> Result { let inner = self.inner.read().unwrap(); Ok(inner.file.size) } /// /// Create a new, empty, in-memory layer /// pub fn create( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, start_lsn: Lsn, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_id, timeline_id)?; Ok(InMemoryLayer { conf, timeline_id, tenant_id, start_lsn, inner: RwLock::new(InMemoryLayerInner { end_lsn: None, index: HashMap::new(), file, }), }) } // Write operations /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); let off = { SER_BUFFER.with(|x| -> Result<_> { let mut buf = x.borrow_mut(); buf.clear(); val.ser_into(&mut (*buf))?; let off = inner.file.write_blob(&buf)?; Ok(off) })? }; let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. warn!("Key {} at {} already exists", key, lsn); } Ok(()) } pub async fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys Ok(()) } /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive pub fn freeze(&self, end_lsn: Lsn) { let mut inner = self.inner.write().unwrap(); assert!(self.start_lsn < end_lsn); inner.end_lsn = Some(end_lsn); for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { assert!(*lsn < end_lsn); } } } /// Write this frozen in-memory layer to disk. /// /// Returns a new delta layer with all the same data as this in-memory layer pub fn write_to_disk(&self) -> Result { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception // though: another thread might have grabbed a reference to this layer // in `get_layer_for_write' just before the checkpointer called // `freeze`, and then `write_to_disk` on it. When the thread gets the // lock, it will see that it's not writeable anymore and retry, but it // would have to wait until we release it. That race condition is very // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().unwrap(); let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_id, Key::MIN, self.start_lsn..inner.end_lsn.unwrap(), )?; let mut buf = Vec::new(); let cursor = inner.file.block_cursor(); let mut keys: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); keys.sort_by_key(|k| k.0); for (key, vec_map) in keys.iter() { let key = **key; // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf)?; let will_init = Value::des(&buf)?.will_init(); delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; } } let delta_layer = delta_layer_writer.finish(Key::MAX)?; Ok(delta_layer) } }