mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
Refactor the way Image- and DeltaLayers are created
Introduce builder objects, DeltaLayerWriter and ImageLayerWriter. This gives more flexibility, as the DeltaLayer::create and ImageLayer::create functions don't need to know about the details of the format of where the page versions are coming from. This allows us to change the format used in InMemoryLayer more easily, without having to modify Delta- and ImageLayer code. Also refactor the code in InMemoryLayer::write_to_disk for clarity.
This commit is contained in:
@@ -55,7 +55,6 @@ use zenith_utils::crashsafe_dir;
|
||||
use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
|
||||
use zenith_utils::seqwait::SeqWait;
|
||||
|
||||
mod blob;
|
||||
mod delta_layer;
|
||||
mod ephemeral_file;
|
||||
mod filename;
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
use std::io::{Read, Write};
|
||||
use std::os::unix::prelude::FileExt;
|
||||
|
||||
use anyhow::Result;
|
||||
use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct BlobRange {
|
||||
offset: u64,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
let mut buf = vec![0u8; range.size];
|
||||
reader.read_exact_at(&mut buf, range.offset)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub struct BlobWriter<W> {
|
||||
writer: ChapterWriter<W>,
|
||||
offset: u64,
|
||||
}
|
||||
|
||||
impl<W: Write> BlobWriter<W> {
|
||||
// This function takes a BookWriter and creates a new chapter to ensure offset is 0.
|
||||
pub fn new(book_writer: BookWriter<W>, chapter_id: impl Into<ChapterId>) -> Self {
|
||||
let writer = book_writer.new_chapter(chapter_id);
|
||||
Self { writer, offset: 0 }
|
||||
}
|
||||
|
||||
pub fn write_blob_from_reader(&mut self, r: &mut impl Read) -> Result<BlobRange> {
|
||||
let len = std::io::copy(r, &mut self.writer)?;
|
||||
|
||||
let range = BlobRange {
|
||||
offset: self.offset,
|
||||
size: len as usize,
|
||||
};
|
||||
self.offset += len as u64;
|
||||
Ok(range)
|
||||
}
|
||||
|
||||
pub fn close(self) -> bookfile::Result<BookWriter<W>> {
|
||||
self.writer.close()
|
||||
}
|
||||
}
|
||||
@@ -37,9 +37,7 @@
|
||||
//! A detlta file is constructed using the 'bookfile' crate. Each file consists of two
|
||||
//! parts: the page versions and the segment sizes. They are stored as separate chapters.
|
||||
//!
|
||||
use crate::layered_repository::blob::BlobWriter;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::page_versions::PageVersions;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
|
||||
RELISH_SEG_SIZE,
|
||||
@@ -58,16 +56,15 @@ use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
use bookfile::{Book, BookWriter, BoundedReader, ChapterWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::blob::{read_blob, BlobRange};
|
||||
|
||||
// Magic constant to identify a Zenith delta file
|
||||
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
|
||||
|
||||
@@ -109,6 +106,18 @@ impl From<&DeltaLayer> for Summary {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct BlobRange {
|
||||
offset: u64,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
let mut buf = vec![0u8; range.size];
|
||||
reader.read_exact_at(&mut buf, range.offset)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
///
|
||||
/// DeltaLayer is the in-memory data structure associated with an
|
||||
/// on-disk delta file. We keep a DeltaLayer in memory for each
|
||||
@@ -147,12 +156,13 @@ pub struct DeltaLayerInner {
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
|
||||
|
||||
/// `seg_sizes` tracks the size of the relation at different points in time.
|
||||
/// `seg_sizes` tracks the size of the segment at different points in time.
|
||||
seg_sizes: VecMap<Lsn, SegmentBlk>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
|
||||
// Scan the VecMap backwards, starting from the given entry.
|
||||
let slice = self
|
||||
.seg_sizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
@@ -221,7 +231,7 @@ impl Layer for DeltaLayer {
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
// Scan the metadata VecMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let iter = inner
|
||||
@@ -287,7 +297,6 @@ impl Layer for DeltaLayer {
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
inner.get_seg_size(lsn)
|
||||
}
|
||||
@@ -397,112 +406,6 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new delta file, using the given page versions and seg_sizes.
|
||||
/// The page versions are passed in a PageVersions struct. If 'cutoff' is
|
||||
/// given, only page versions with LSN < cutoff are included.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The page_versions and
|
||||
/// seg_sizes are thus passed in the same format as they are in the in-memory
|
||||
/// layer, as that's expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
page_versions: &PageVersions,
|
||||
cutoff: Option<Lsn>,
|
||||
seg_sizes: VecMap<Lsn, SegmentBlk>,
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!seg_sizes.is_empty());
|
||||
}
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes,
|
||||
}),
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
|
||||
// Write the data into a file
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = delta_layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
let page_versions_iter = page_versions.ordered_page_version_iter(cutoff);
|
||||
for (blknum, lsn, pos) in page_versions_iter {
|
||||
let blob_range =
|
||||
page_version_writer.write_blob_from_reader(&mut page_versions.reader(pos)?)?;
|
||||
|
||||
inner
|
||||
.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let book = page_version_writer.close()?;
|
||||
|
||||
// Write out page versions
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = VecMap::ser(&inner.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// and seg_sizes to separate chapter
|
||||
let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER);
|
||||
let buf = VecMap::ser(&inner.seg_sizes)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid,
|
||||
timelineid,
|
||||
seg,
|
||||
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
|
||||
dropped,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
Ok(delta_layer)
|
||||
}
|
||||
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
///
|
||||
@@ -633,3 +536,171 @@ impl DeltaLayer {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_page_version` for every page
|
||||
/// version to store in the layer.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct DeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
|
||||
page_version_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
pv_offset: u64,
|
||||
|
||||
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
) -> Result<DeltaLayerWriter> {
|
||||
// Create the file
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&DeltaFileName {
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
},
|
||||
);
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
// Open the page-versions chapter for writing. The calls to
|
||||
// `put_page_version` will use this to write the contents.
|
||||
let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
Ok(DeltaLayerWriter {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
page_version_writer,
|
||||
page_version_metas: VecMap::default(),
|
||||
pv_offset: 0,
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Append a page version to the file.
|
||||
///
|
||||
/// 'buf' is a serialized PageVersion.
|
||||
/// The page versions must be appended in blknum, lsn order.
|
||||
///
|
||||
pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> {
|
||||
// Remember the offset and size metadata. The metadata is written
|
||||
// to a separate chapter, in `finish`.
|
||||
let blob_range = BlobRange {
|
||||
offset: self.pv_offset,
|
||||
size: buf.len(),
|
||||
};
|
||||
self.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
|
||||
// write the page version
|
||||
self.page_version_writer.write_all(buf)?;
|
||||
self.pv_offset += buf.len() as u64;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
/// 'seg_sizes' is a list of size changes to store with the actual data.
|
||||
///
|
||||
pub fn finish(self, seg_sizes: VecMap<Lsn, SegmentBlk>) -> Result<DeltaLayer> {
|
||||
// Close the page-versions chapter
|
||||
let book = self.page_version_writer.close()?;
|
||||
|
||||
// Write out page versions metadata
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = VecMap::ser(&self.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
if self.seg.rel.is_blocky() {
|
||||
assert!(!seg_sizes.is_empty());
|
||||
}
|
||||
|
||||
// and seg_sizes to separate chapter
|
||||
let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER);
|
||||
let buf = VecMap::ser(&seg_sizes)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
|
||||
dropped: self.dropped,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
// Note: Because we opened the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
let layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes: VecMap::default(),
|
||||
}),
|
||||
};
|
||||
|
||||
trace!("created delta layer {}", &layer.path().display());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,6 @@ use crate::layered_repository::filename::{ImageFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag,
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::PageServerConf;
|
||||
@@ -40,7 +39,7 @@ use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
use bookfile::{Book, BookWriter, ChapterWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -265,121 +264,6 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new image file, using the given array of pages.
|
||||
fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
lsn: Lsn,
|
||||
base_images: Vec<Bytes>,
|
||||
) -> Result<ImageLayer> {
|
||||
let image_type = if seg.rel.is_blocky() {
|
||||
let num_blocks: SegmentBlk = base_images.len().try_into()?;
|
||||
ImageType::Blocky { num_blocks }
|
||||
} else {
|
||||
assert_eq!(base_images.len(), 1);
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
|
||||
let layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: image_type.clone(),
|
||||
}),
|
||||
};
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
let book = match &image_type {
|
||||
ImageType::Blocky { .. } => {
|
||||
let mut chapter = book.new_chapter(BLOCKY_IMAGES_CHAPTER);
|
||||
for block_bytes in base_images {
|
||||
assert_eq!(block_bytes.len(), BLOCK_SIZE);
|
||||
chapter.write_all(&block_bytes)?;
|
||||
}
|
||||
chapter.close()?
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
let mut chapter = book.new_chapter(NONBLOCKY_IMAGE_CHAPTER);
|
||||
chapter.write_all(&base_images[0])?;
|
||||
chapter.close()?
|
||||
}
|
||||
};
|
||||
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid,
|
||||
timelineid,
|
||||
seg,
|
||||
|
||||
lsn,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
// Create a new image file by materializing every page in a source layer
|
||||
// at given LSN.
|
||||
pub fn create_from_src(
|
||||
conf: &'static PageServerConf,
|
||||
timeline: &LayeredTimeline,
|
||||
src: &dyn Layer,
|
||||
lsn: Lsn,
|
||||
) -> Result<ImageLayer> {
|
||||
let seg = src.get_seg_tag();
|
||||
let timelineid = timeline.timelineid;
|
||||
|
||||
let size = if seg.rel.is_blocky() {
|
||||
src.get_seg_size(lsn)?
|
||||
} else {
|
||||
1
|
||||
};
|
||||
|
||||
trace!(
|
||||
"creating new ImageLayer for {} on timeline {} at {}",
|
||||
seg,
|
||||
timelineid,
|
||||
lsn,
|
||||
);
|
||||
|
||||
let mut base_images: Vec<Bytes> = Vec::new();
|
||||
for blknum in 0..size {
|
||||
let img = timeline.materialize_page(seg, blknum, lsn, &*src)?;
|
||||
|
||||
base_images.push(img);
|
||||
}
|
||||
|
||||
Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
|
||||
}
|
||||
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
///
|
||||
@@ -507,3 +391,137 @@ impl ImageLayer {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_page_image` for every page
|
||||
/// in the segment.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
lsn: Lsn,
|
||||
|
||||
num_blocks: SegmentBlk,
|
||||
|
||||
page_image_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
num_blocks_written: SegmentBlk,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
lsn: Lsn,
|
||||
num_blocks: SegmentBlk,
|
||||
) -> Result<ImageLayerWriter> {
|
||||
// Create the file
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = ImageLayer::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&ImageFileName { seg, lsn },
|
||||
);
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
// Open the page-images chapter for writing. The calls to
|
||||
// `put_page_image` will use this to write the contents.
|
||||
let chapter = if seg.rel.is_blocky() {
|
||||
book.new_chapter(BLOCKY_IMAGES_CHAPTER)
|
||||
} else {
|
||||
assert_eq!(num_blocks, 1);
|
||||
book.new_chapter(NONBLOCKY_IMAGE_CHAPTER)
|
||||
};
|
||||
|
||||
let writer = ImageLayerWriter {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
lsn,
|
||||
num_blocks,
|
||||
page_image_writer: chapter,
|
||||
num_blocks_written: 0,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
}
|
||||
|
||||
///
|
||||
/// Write next page image to the file.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_page_image(&mut self, block_bytes: &[u8]) -> Result<()> {
|
||||
assert!(self.num_blocks_written < self.num_blocks);
|
||||
if self.seg.rel.is_blocky() {
|
||||
assert_eq!(block_bytes.len(), BLOCK_SIZE);
|
||||
}
|
||||
self.page_image_writer.write_all(block_bytes)?;
|
||||
self.num_blocks_written += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<ImageLayer> {
|
||||
// Check that the `put_page_image' was called for every block.
|
||||
assert!(self.num_blocks_written == self.num_blocks);
|
||||
|
||||
// Close the page-images chapter
|
||||
let book = self.page_image_writer.close()?;
|
||||
|
||||
// Write out the summary chapter
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
ImageType::Blocky {
|
||||
num_blocks: self.num_blocks,
|
||||
}
|
||||
} else {
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
let layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
timelineid: self.timelineid,
|
||||
tenantid: self.tenantid,
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type,
|
||||
}),
|
||||
};
|
||||
trace!("created image layer {}", layer.path().display());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,15 +4,16 @@
|
||||
//!
|
||||
//! And there's another BTreeMap to track the size of the relation.
|
||||
//!
|
||||
use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
use crate::layered_repository::filename::DeltaFileName;
|
||||
use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
|
||||
RELISH_SEG_SIZE,
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::ZERO_PAGE;
|
||||
use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
@@ -177,7 +178,7 @@ impl Layer for InMemoryLayer {
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = inner.page_versions.get_page_version(*pos)?;
|
||||
let pv = inner.page_versions.read_pv(*pos)?;
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
reconstruct_data.page_img = Some(img);
|
||||
@@ -297,7 +298,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
let pv = inner.page_versions.get_page_version(pos)?;
|
||||
let pv = inner.page_versions.read_pv(pos)?;
|
||||
let pv_description = match pv {
|
||||
PageVersion::Page(_img) => "page",
|
||||
PageVersion::Wal(_rec) => "wal",
|
||||
@@ -596,84 +597,102 @@ impl InMemoryLayer {
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN
|
||||
// that is included.
|
||||
let end_lsn_exclusive = inner.end_lsn.unwrap();
|
||||
|
||||
if inner.dropped {
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive,
|
||||
true,
|
||||
&inner.page_versions,
|
||||
None,
|
||||
inner.seg_sizes.clone(),
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive
|
||||
);
|
||||
return Ok(LayersOnDisk {
|
||||
delta_layers: vec![delta_layer],
|
||||
image_layers: Vec::new(),
|
||||
});
|
||||
}
|
||||
|
||||
// Since `end_lsn` is inclusive, subtract 1.
|
||||
// We want to make an ImageLayer for the last included LSN,
|
||||
// so the DeltaLayer should exclude that LSN.
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
|
||||
let mut delta_layers = Vec::new();
|
||||
// Figure out if we should create a delta layer, image layer, or both.
|
||||
let image_lsn: Option<Lsn>;
|
||||
let delta_end_lsn: Option<Lsn>;
|
||||
if self.is_dropped() {
|
||||
// The segment was dropped. Create just a delta layer containing all the
|
||||
// changes up to and including the drop.
|
||||
delta_end_lsn = Some(end_lsn_exclusive);
|
||||
image_lsn = None;
|
||||
} else if self.start_lsn == end_lsn_inclusive {
|
||||
// The layer contains exactly one LSN. It's enough to write an image
|
||||
// layer at that LSN.
|
||||
delta_end_lsn = None;
|
||||
image_lsn = Some(end_lsn_inclusive);
|
||||
} else {
|
||||
// Create a delta layer with all the changes up to the end LSN,
|
||||
// and an image layer at the end LSN.
|
||||
//
|
||||
// Note that we the delta layer does *not* include the page versions
|
||||
// at the end LSN. They are included in the image layer, and there's
|
||||
// no need to store them twice.
|
||||
delta_end_lsn = Some(end_lsn_inclusive);
|
||||
image_lsn = Some(end_lsn_inclusive);
|
||||
}
|
||||
|
||||
if self.start_lsn != end_lsn_inclusive {
|
||||
let (seg_sizes, _) = inner.seg_sizes.split_at(&end_lsn_exclusive);
|
||||
// Write the page versions before the cutoff to disk.
|
||||
let delta_layer = DeltaLayer::create(
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
|
||||
if let Some(delta_end_lsn) = delta_end_lsn {
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_inclusive,
|
||||
false,
|
||||
&inner.page_versions,
|
||||
Some(end_lsn_inclusive),
|
||||
seg_sizes,
|
||||
delta_end_lsn,
|
||||
self.is_dropped(),
|
||||
)?;
|
||||
delta_layers.push(delta_layer);
|
||||
trace!(
|
||||
"freeze: created delta layer {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_inclusive
|
||||
);
|
||||
} else {
|
||||
assert!(inner
|
||||
|
||||
// Write all page versions
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
|
||||
let page_versions_iter = inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(None)
|
||||
.next()
|
||||
.is_none());
|
||||
.ordered_page_version_iter(Some(delta_end_lsn));
|
||||
for (blknum, lsn, pos) in page_versions_iter {
|
||||
let len = inner.page_versions.read_pv_bytes(pos, &mut buf)?;
|
||||
delta_layer_writer.put_page_version(blknum, lsn, &buf[..len])?;
|
||||
}
|
||||
|
||||
// Create seg_sizes
|
||||
let seg_sizes = if delta_end_lsn == end_lsn_exclusive {
|
||||
inner.seg_sizes.clone()
|
||||
} else {
|
||||
inner.seg_sizes.split_at(&end_lsn_exclusive).0
|
||||
};
|
||||
|
||||
let delta_layer = delta_layer_writer.finish(seg_sizes)?;
|
||||
delta_layers.push(delta_layer);
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
|
||||
// Write a new base image layer at the cutoff point
|
||||
let image_layer =
|
||||
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
|
||||
trace!(
|
||||
"freeze: created image layer {} at {}",
|
||||
self.seg,
|
||||
end_lsn_inclusive
|
||||
);
|
||||
if let Some(image_lsn) = image_lsn {
|
||||
let size = if self.seg.rel.is_blocky() {
|
||||
self.get_seg_size(image_lsn)?
|
||||
} else {
|
||||
1
|
||||
};
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
image_lsn,
|
||||
size,
|
||||
)?;
|
||||
|
||||
for blknum in 0..size {
|
||||
let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?;
|
||||
|
||||
image_layer_writer.put_page_image(&img)?;
|
||||
}
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
image_layers.push(image_layer);
|
||||
}
|
||||
|
||||
Ok(LayersOnDisk {
|
||||
delta_layers,
|
||||
image_layers: vec![image_layer],
|
||||
image_layers,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,23 +98,39 @@ impl PageVersions {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a 'Read' that reads the page version at given offset.
|
||||
pub fn reader(&self, pos: u64) -> Result<PageVersionReader, std::io::Error> {
|
||||
// read length
|
||||
let mut lenbuf = [0u8; 4];
|
||||
self.file.read_exact_at(&mut lenbuf, pos)?;
|
||||
let len = u32::from_ne_bytes(lenbuf);
|
||||
|
||||
Ok(PageVersionReader {
|
||||
file: &self.file,
|
||||
pos: pos + 4,
|
||||
end_pos: pos + 4 + len as u64,
|
||||
})
|
||||
///
|
||||
/// Read a page version.
|
||||
///
|
||||
pub fn read_pv(&self, off: u64) -> Result<PageVersion> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_pv_bytes(off, &mut buf)?;
|
||||
Ok(PageVersion::des(&buf)?)
|
||||
}
|
||||
|
||||
pub fn get_page_version(&self, pos: u64) -> Result<PageVersion> {
|
||||
let mut reader = self.reader(pos)?;
|
||||
Ok(PageVersion::des_from(&mut reader)?)
|
||||
///
|
||||
/// Read a page version, as raw bytes, at the given offset. The bytes
|
||||
/// are read into 'buf', which is expanded if necessary. Returns the
|
||||
/// size of the page version.
|
||||
///
|
||||
pub fn read_pv_bytes(&self, off: u64, buf: &mut Vec<u8>) -> Result<usize> {
|
||||
// read length
|
||||
let mut lenbuf = [0u8; 4];
|
||||
self.file.read_exact_at(&mut lenbuf, off)?;
|
||||
let len = u32::from_ne_bytes(lenbuf) as usize;
|
||||
|
||||
// Resize the buffer to fit the data, if needed.
|
||||
//
|
||||
// We don't shrink the buffer if it's larger than necessary. That avoids
|
||||
// repeatedly shrinking and expanding when you reuse the same buffer to
|
||||
// read multiple page versions. Expanding a Vec requires initializing the
|
||||
// new bytes, which is a waste of time because we're immediately overwriting
|
||||
// it, but there's no way to avoid it without resorting to unsafe code.
|
||||
if buf.len() < len {
|
||||
buf.resize(len, 0);
|
||||
}
|
||||
self.file.read_exact_at(&mut buf[0..len], off + 4)?;
|
||||
|
||||
Ok(len)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user