Compare commits

...

9 Commits

Author SHA1 Message Date
Christian Schwarz
6efd9fb0a6 WIP: refactor for unit testing 2024-08-19 16:56:24 +00:00
Christian Schwarz
0d22d4d310 https://github.com/neondatabase/neon/pull/8537#discussion_r1719877858 2024-08-19 15:26:56 +00:00
Christian Schwarz
10c7419fee https://github.com/neondatabase/neon/pull/8537#discussion_r1719830114 2024-08-19 15:22:33 +00:00
Christian Schwarz
311282762f manually benchmark get_values_reconstruct_data performance
on my M2 MacBook Pro, running a Linux VM

  ./target/release/neon_local stop
  rm -rf .neon
  ./target/release/neon_local init
  ./target/release/neon_local start
  ./target/release/neon_local tenant create --set-default
  ./target/release/neon_local endpoint create foo
  ./target/release/neon_local endpoint start foo
  psql 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
psql (13.16 (Debian 13.16-0+deb11u1), server 15.7)

CREATE TABLE wal_test (
    id SERIAL PRIMARY KEY,
    data TEXT
);

DO $$
DECLARE
    i INTEGER := 1;
BEGIN
    WHILE i <= 500000 LOOP
        INSERT INTO wal_test (data) VALUES ('data');
        i := i + 1;
    END LOOP;
END $$;

-- => result is one L0 from initdb and one 137M-sized ephemeral-2

DO $$
DECLARE
    i INTEGER := 1;
    random_id INTEGER;
    random_record wal_test%ROWTYPE;
    start_time TIMESTAMP := clock_timestamp();
    selects_completed INTEGER := 0;
    min_id INTEGER := 1;  -- Minimum ID value
    max_id INTEGER := 100000;  -- Maximum ID value, based on your insert range
    iters INTEGER := 100000000;  -- Number of iterations to run
BEGIN
    WHILE i <= iters LOOP
        -- Generate a random ID within the known range
        random_id := min_id + floor(random() * (max_id - min_id + 1))::int;

        -- Select the row with the generated random ID
        SELECT * INTO random_record
        FROM wal_test
        WHERE id = random_id;

        -- Increment the select counter
        selects_completed := selects_completed + 1;

        -- Check if a second has passed
        IF EXTRACT(EPOCH FROM clock_timestamp() - start_time) >= 1 THEN
            -- Print the number of selects completed in the last second
            RAISE NOTICE 'Selects completed in last second: %', selects_completed;

            -- Reset counters for the next second
            selects_completed := 0;
            start_time := clock_timestamp();
        END IF;

        -- Increment the loop counter
        i := i + 1;
    END LOOP;
END $$;

./target/release/neon_local stop

baseline: commit d9a57aeed9 origin/main,

NOTICE:  Selects completed in last second: 1286
NOTICE:  Selects completed in last second: 1352
NOTICE:  Selects completed in last second: 1365
NOTICE:  Selects completed in last second: 1399
NOTICE:  Selects completed in last second: 1410
NOTICE:  Selects completed in last second: 1393
NOTICE:  Selects completed in last second: 1316

ours

NOTICE:  Selects completed in last second: 1541
NOTICE:  Selects completed in last second: 1536
NOTICE:  Selects completed in last second: 1493
NOTICE:  Selects completed in last second: 1379
NOTICE:  Selects completed in last second: 1519
NOTICE:  Selects completed in last second: 1546
NOTICE:  Selects completed in last second: 1489
NOTICE:  Selects completed in last second: 1578
NOTICE:  Selects completed in last second: 1508
2024-08-15 12:51:04 +00:00
Christian Schwarz
72966e06a2 bench_ingest results (on my MBP linux VM)
cargo bench --bench bench_ingest -- 'ingest 128MB/100b seq, no delta'

baseline: commit d9a57aeed9 (origin/main)

ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [527.44 ms 543.79 ms 562.97 ms]
                        thrpt:  [227.36 MiB/s 235.39 MiB/s 242.68 MiB/s]

HEAD~1

ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [491.37 ms 494.69 ms 498.30 ms]
                        thrpt:  [256.87 MiB/s 258.75 MiB/s 260.49 MiB/s]
2024-08-15 11:46:24 +00:00
Christian Schwarz
fb78185074 merging of adjacent chunk reads, up to max batch size 2024-08-15 11:31:24 +00:00
Christian Schwarz
6f65b4d2d3 don't think in pages, but DIO chunks; remove read_page & page_caching remnants 2024-08-15 11:28:34 +00:00
Christian Schwarz
37bfa04996 implement coalescing of multiple reads onto same page 2024-08-14 19:58:26 +00:00
Christian Schwarz
332ca2bf09 WIP 2024-08-14 19:58:26 +00:00
11 changed files with 547 additions and 827 deletions

View File

@@ -67,7 +67,8 @@ async fn ingest(
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let value = Value::Image(Bytes::from(vec![0u8; put_size]));
let data = value.ser()?;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
@@ -95,7 +96,9 @@ async fn ingest(
}
}
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
layer
.put_value(key.to_compact(), lsn, &data, value.will_init(), &ctx)
.await?;
}
layer.freeze(lsn + 1).await;

View File

@@ -1,15 +1,10 @@
use std::{num::NonZeroUsize, sync::Arc};
use crate::tenant::ephemeral_file;
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
PageCached,
#[serde(rename_all = "snake_case")]
Direct {
max_concurrency: NonZeroUsize,
},
Direct { max_concurrency: NonZeroUsize },
}
impl Default for L0FlushConfig {
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
pub struct L0FlushGlobalState(Arc<Inner>);
pub enum Inner {
PageCached,
Direct { semaphore: tokio::sync::Semaphore },
}
impl L0FlushGlobalState {
pub fn new(config: L0FlushConfig) -> Self {
match config {
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
L0FlushConfig::Direct { max_concurrency } => {
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
Self(Arc::new(Inner::Direct { semaphore }))
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
&self.0
}
}
impl L0FlushConfig {
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
use L0FlushConfig::*;
match self {
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
}
}
}

View File

@@ -2,7 +2,6 @@
//! Low-level Block-oriented I/O functions
//!
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
@@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> {
/// Unlike traits, we also support the read function to be async though.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReader(&'a FileBlockReader<'a>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
Slice(&'a [u8]),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
#[cfg(test)]
@@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> {
use BlockReaderRef::*;
match self {
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
Adapter(r) => r.read_blk(blknum, ctx).await,
Slice(s) => Self::read_blk_slice(s, blknum),
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
@@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> {
}
}
impl<'a> BlockReaderRef<'a> {
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
let end = start.checked_add(PAGE_SZ).unwrap();
if end > slice.len() {
return Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
format!("slice too short, len={} end={}", slice.len(), end),
));
}
let slice = &slice[start..end];
let page_sized: &[u8; PAGE_SZ] = slice
.try_into()
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
Ok(BlockLease::Slice(page_sized))
}
}
///
/// A "cursor" for efficiently reading multiple pages from a BlockReader
///

View File

@@ -4,10 +4,16 @@
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::owned_buffers_io::write::Buffer;
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
use anyhow::Context;
use bytes::BytesMut;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
use tracing::error;
use std::io;
use std::sync::atomic::AtomicU64;
@@ -16,13 +22,19 @@ use utils::id::TimelineId;
pub struct EphemeralFile {
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
rw: page_caching::RW,
page_cache_file_id: page_cache::FileId,
bytes_written: u32,
buffered_writer: owned_buffers_io::write::BufferedWriter<
BytesMut,
size_tracking_writer::Writer<VirtualFile>,
>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop)
_gate_guard: utils::sync::gate::GateGuard,
}
mod page_caching;
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
mod zero_padded_read_write;
use super::storage_layer::inmemory_layer::InMemoryLayerIndexValue;
const TAIL_SZ: usize = 64 * 1024;
impl EphemeralFile {
pub async fn create(
@@ -52,59 +64,199 @@ impl EphemeralFile {
)
.await?;
let prewarm = conf.l0_flush.prewarm_on_write();
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file, prewarm, gate_guard),
page_cache_file_id,
bytes_written: 0,
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
size_tracking_writer::Writer::new(file),
BytesMut::with_capacity(TAIL_SZ),
),
_gate_guard: gate_guard,
})
}
}
pub(crate) fn len(&self) -> u64 {
self.rw.bytes_written()
impl Drop for EphemeralFile {
fn drop(&mut self) {
// unlink the file
// we are clear to do this, because we have entered a gate
let path = &self.buffered_writer.as_inner().as_inner().path;
let res = std::fs::remove_file(path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!("could not remove ephemeral file '{path}': {e}");
}
}
}
}
impl EphemeralFile {
pub(crate) fn len(&self) -> u32 {
self.bytes_written
}
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
self.rw.page_cache_file_id()
self.page_cache_file_id
}
/// See [`self::page_caching::RW::load_to_vec`].
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
self.rw.load_to_vec(ctx).await
let size = usize::try_from(self.len()).unwrap();
let vec = Vec::with_capacity(size);
// read from disk what we've already flushed
let file_size_tracker = self.buffered_writer.as_inner();
let flushed_offset = usize::try_from(file_size_tracker.bytes_written()).unwrap();
let flushed_range = 0..flushed_offset;
let file: &VirtualFile = file_size_tracker.as_inner();
let mut vec = file
.read_exact_at(
vec.slice(0..(flushed_range.end - flushed_range.start)),
u64::try_from(flushed_range.start).unwrap(),
ctx,
)
.await?
.into_inner();
// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
let buffer = self.buffered_writer.inspect_buffer();
let buffered = &buffer[0..buffer.pending()];
vec.extend_from_slice(buffered);
assert_eq!(vec.len(), size);
Ok(vec)
}
pub(crate) async fn read_blk(
/// Fill dst will dst.bytes_total() bytes from the bytes written to the buffered writer from offset `start` and later.
/// If `dst` is larger than the available bytes, the read will be short.
/// The read will never be short for other reasons.
/// The number of bytes read into `dst` is returned as part of the result tuple.
/// No guarantees are made about the remaining bytes in `dst`, i.e., assume their contents are random.
pub(crate) async fn read_at_to_end<B: IoBufMut + Send>(
&self,
blknum: u32,
start: u32,
dst: Slice<B>,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
self.rw.read_blk(blknum, ctx).await
) -> std::io::Result<(Slice<B>, usize)> {
let file_size_tracking_writer = self.buffered_writer.as_inner();
let flushed_offset = u32::try_from(file_size_tracking_writer.bytes_written())
.expect("we don't allow writing more than u32::MAX bytes");
let buffer = self.buffered_writer.inspect_buffer();
let buffered = &buffer[0..buffer.pending()];
let dst_cap = u32::try_from(dst.bytes_total())
.with_context(|| {
format!(
"read_aligned: dst.bytes_total() is too large: {}",
dst.len()
)
})
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
let end = {
let mut end = start
.checked_add(dst_cap)
.with_context(|| {
format!("read_aligned: offset + dst.bytes_total() is too large: {start} + {dst_cap}",)
})
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
if end > self.bytes_written {
end = self.bytes_written;
}
end
};
// inclusive, exclusive
#[derive(Debug)]
struct Range(u32, u32);
impl Range {
fn len(&self) -> u32 {
if self.0 > self.1 {
0
} else {
self.1 - self.0
}
}
}
let written_range = Range(start, std::cmp::min(end, flushed_offset));
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
let dst = if written_range.len() > 0 {
let file: &VirtualFile = file_size_tracking_writer.as_inner();
let bounds = dst.bounds();
let slice = file
.read_exact_at(
dst.slice(0..written_range.len() as usize),
start as u64,
ctx,
)
.await?;
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
} else {
dst
};
let dst = if buffered_range.len() > 0 {
let offset_in_buffer =
usize::try_from(buffered_range.0.checked_sub(flushed_offset).unwrap()).unwrap();
let to_copy =
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len() as usize)];
let bounds = dst.bounds();
let mut view = dst.slice(
written_range.len() as usize
..written_range.len() as usize + buffered_range.len() as usize,
);
view.as_mut_rust_slice_full_zeroed()
.copy_from_slice(to_copy);
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
} else {
dst
};
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
Ok((dst, (end - start) as usize))
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
buf: &[u8],
will_init: bool,
ctx: &RequestContext,
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
) -> Result<InMemoryLayerIndexValue, io::Error> {
let pos = self.bytes_written;
let len = u32::try_from(buf.len()).map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
anyhow::anyhow!(
"EphemeralFile::write_blob value too large: {}: {e}",
buf.len()
),
)
})?;
pos.checked_add(len).ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidInput,
"EphemeralFile::write_blob: overflow",
)
})?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
self.buffered_writer
.write_buffered_borrowed(buf, ctx)
.await?;
self.bytes_written += len;
self.rw.write_all_borrowed(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf, ctx).await?;
}
// Write the payload
self.rw.write_all_borrowed(srcbuf, ctx).await?;
Ok(pos)
Ok(InMemoryLayerIndexValue {
pos,
len,
will_init,
})
}
}
@@ -117,19 +269,11 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
impl BlockReader for EphemeralFile {
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::BlockReaderRef;
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;
@@ -160,69 +304,6 @@ mod tests {
Ok((conf, tenant_shard_id, timeline_id, ctx))
}
#[tokio::test]
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
);
let pos_bar = file.write_blob(b"bar", &ctx).await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
);
assert_eq!(
b"bar",
file.block_cursor()
.read_blob(pos_bar, &ctx)
.await?
.as_slice()
);
let mut blobs = Vec::new();
for i in 0..10000 {
let data = Vec::from(format!("blob{}", i).as_bytes());
let pos = file.write_blob(&data, &ctx).await?;
blobs.push((pos, data));
}
// also test with a large blobs
for i in 0..100 {
let data = format!("blob{}", i).as_bytes().repeat(100);
let pos = file.write_blob(&data, &ctx).await?;
blobs.push((pos, data));
}
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos, &ctx).await?;
assert_eq!(actual, expected);
}
// Test a large blob that spans multiple pages
let mut large_data = vec![0; 20000];
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data, &ctx).await?;
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
assert_eq!(result, large_data);
Ok(())
}
#[tokio::test]
async fn ephemeral_file_holds_gate_open() {
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);

View File

@@ -1,290 +0,0 @@
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::BlockLease;
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use std::io::{self, ErrorKind};
use std::ops::{Deref, Range};
use tokio_epoll_uring::BoundedBuf;
use tracing::*;
use super::zero_padded_read_write;
/// See module-level comment.
pub struct RW {
page_cache_file_id: page_cache::FileId,
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
_gate_guard: utils::sync::gate::GateGuard,
}
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
/// should we pre-warm the [`crate::page_cache`] with the contents?
#[derive(Clone, Copy)]
pub enum PrewarmOnWrite {
Yes,
No,
}
impl RW {
pub fn new(
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
_gate_guard: utils::sync::gate::GateGuard,
) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
page_cache_file_id,
file,
prewarm_on_write,
)),
_gate_guard,
}
}
pub fn page_cache_file_id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn write_all_borrowed(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<usize, io::Error> {
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
// because Compute is unlikely to access recently written data.
self.rw.write_all_borrowed(srcbuf, ctx).await
}
pub(crate) fn bytes_written(&self) -> u64 {
self.rw.bytes_written()
}
/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
///
/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
// round up to the next PAGE_SZ multiple, required by blob_io
let size = {
let s = usize::try_from(self.bytes_written()).unwrap();
if s % PAGE_SZ == 0 {
s
} else {
s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
}
};
let vec = Vec::with_capacity(size);
// read from disk what we've already flushed
let writer = self.rw.as_writer();
let flushed_range = writer.written_range();
let mut vec = writer
.file
.read_exact_at(
vec.slice(0..(flushed_range.end - flushed_range.start)),
u64::try_from(flushed_range.start).unwrap(),
ctx,
)
.await?
.into_inner();
// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
let buffered = self.rw.get_tail_zero_padded();
vec.extend_from_slice(buffered);
assert_eq!(vec.len(), size);
assert_eq!(vec.len() % PAGE_SZ, 0);
Ok(vec)
}
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
match self.rw.read_blk(blknum).await? {
zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.rw.as_writer().file.path,
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = writer
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
}
}
zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
Ok(BlockLease::EphemeralFileMutableTail(buffer))
}
}
}
}
impl Drop for RW {
fn drop(&mut self) {
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
// we are clear to do this, because we have entered a gate
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.rw.as_writer().file.path,
e
);
}
}
}
}
struct PreWarmingWriter {
prewarm_on_write: PrewarmOnWrite,
nwritten_blocks: u32,
page_cache_file_id: page_cache::FileId,
file: VirtualFile,
}
impl PreWarmingWriter {
fn new(
page_cache_file_id: page_cache::FileId,
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
) -> Self {
Self {
prewarm_on_write,
nwritten_blocks: 0,
page_cache_file_id,
file,
}
}
/// Return the byte range within `file` that has been written though `write_all`.
///
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
struct Wrapper(Range<usize>);
impl Deref for Wrapper {
type Target = Range<usize>;
fn deref(&self) -> &Range<usize> {
&self.0
}
}
Wrapper(0..nwritten_blocks * PAGE_SZ)
}
}
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
&mut self,
buf: FullSlice<Buf>,
ctx: &RequestContext,
) -> std::io::Result<(usize, FullSlice<Buf>)> {
let buflen = buf.len();
assert_eq!(
buflen % PAGE_SZ,
0,
"{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
);
// Do the IO.
let buf = match self.file.write_all(buf, ctx).await {
(buf, Ok(nwritten)) => {
assert_eq!(nwritten, buflen);
buf
}
(_, Err(e)) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
self.nwritten_blocks, buflen, e, self.file.path,
),
));
}
};
let nblocks = buflen / PAGE_SZ;
let nblocks32 = u32::try_from(nblocks).unwrap();
if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
// Pre-warm page cache with the contents.
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
// benefits the code that writes InMemoryLayer=>L0 layers.
let cache = page_cache::get();
static CTX: Lazy<RequestContext> = Lazy::new(|| {
RequestContext::new(
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
crate::context::DownloadBehavior::Error,
)
});
for blknum_in_buffer in 0..nblocks {
let blk_in_buffer =
&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
let blknum = self
.nwritten_blocks
.checked_add(blknum_in_buffer as u32)
.unwrap();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
.await
{
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
Ok(v) => match v {
page_cache::ReadBufResult::Found(_guard) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
and this function takes &mut self, so, no concurrent read_blk is possible");
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(blk_in_buffer);
let _ = write_guard.mark_valid();
}
},
}
}
}
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
Ok((buflen, buf))
}
}

View File

@@ -1,145 +0,0 @@
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
//!
//! # Writes
//!
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
//!
//! # Reads
//!
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
//!
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
//! if the read is for the prefix that has already been flushed.
//!
//! # Current Usage
//!
//! The current user of this module is [`super::page_caching::RW`].
mod zero_padded;
use crate::{
context::RequestContext,
page_cache::PAGE_SZ,
virtual_file::owned_buffers_io::{
self,
write::{Buffer, OwnedAsyncWriter},
},
};
const TAIL_SZ: usize = 64 * 1024;
/// See module-level comment.
pub struct RW<W: OwnedAsyncWriter> {
buffered_writer: owned_buffers_io::write::BufferedWriter<
zero_padded::Buffer<TAIL_SZ>,
owned_buffers_io::util::size_tracking_writer::Writer<W>,
>,
}
pub enum ReadResult<'a, W> {
NeedsReadFromWriter { writer: &'a W },
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
}
impl<W> RW<W>
where
W: OwnedAsyncWriter,
{
pub fn new(writer: W) -> Self {
let bytes_flushed_tracker =
owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
bytes_flushed_tracker,
zero_padded::Buffer::default(),
);
Self { buffered_writer }
}
pub(crate) fn as_writer(&self) -> &W {
self.buffered_writer.as_inner().as_inner()
}
pub async fn write_all_borrowed(
&mut self,
buf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize> {
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
}
pub fn bytes_written(&self) -> u64 {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
flushed_offset + u64::try_from(buffer.pending()).unwrap()
}
/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
pub fn get_tail_zero_padded(&self) -> &[u8] {
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
let buffer_written_up_to = buffer.pending();
// pad to next page boundary
let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
buffer_written_up_to
} else {
buffer_written_up_to
.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
.unwrap()
};
&buffer.as_zero_padded_slice()[0..read_up_to]
}
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
// The trailing page ("block") might only be partially filled,
// yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
// Moreover, it has to be zero-padded, because when we still had
// a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
// => check here that the read doesn't go beyond this potentially trailing
// => the zero-padding is done in the `else` branch below
let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
buffered_offset / (PAGE_SZ as u64)
} else {
(buffered_offset / (PAGE_SZ as u64)) + 1
};
if (blknum as u64) >= blocks_written {
return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
}
// assertions for the `if-else` below
assert_eq!(
flushed_offset % (TAIL_SZ as u64), 0,
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
);
assert_eq!(
flushed_offset % (PAGE_SZ as u64),
0,
"the logic below can't handle if the page is spread across the flushed part and the buffer"
);
if read_offset < flushed_offset {
assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
Ok(ReadResult::NeedsReadFromWriter {
writer: self.as_writer(),
})
} else {
let read_offset_in_buffer = read_offset
.checked_sub(flushed_offset)
.expect("would have taken `if` branch instead of this one");
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
let zero_padded_slice = buffer.as_zero_padded_slice();
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
buffer: page
.try_into()
.expect("the slice above got it as page-size slice"),
})
}
}
}

View File

@@ -1,110 +0,0 @@
//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
//! unwritten range is guaranteed to be zero-initialized.
//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
use std::mem::MaybeUninit;
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
/// See module-level comment.
pub struct Buffer<const N: usize> {
allocation: Box<[u8; N]>,
written: usize,
}
impl<const N: usize> Default for Buffer<N> {
fn default() -> Self {
Self {
allocation: Box::new(
// SAFETY: zeroed memory is a valid [u8; N]
unsafe { MaybeUninit::zeroed().assume_init() },
),
written: 0,
}
}
}
impl<const N: usize> Buffer<N> {
#[inline(always)]
fn invariants(&self) {
// don't check by default, unoptimized is too expensive even for debug mode
if false {
debug_assert!(self.written <= N, "{}", self.written);
debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
}
}
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
&self.allocation
}
}
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
type IoBuf = Self;
fn cap(&self) -> usize {
self.allocation.len()
}
fn extend_from_slice(&mut self, other: &[u8]) {
self.invariants();
let remaining = self.allocation.len() - self.written;
if other.len() > remaining {
panic!("calling extend_from_slice() with insufficient remaining capacity");
}
self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
self.written += other.len();
self.invariants();
}
fn pending(&self) -> usize {
self.written
}
fn flush(self) -> FullSlice<Self> {
self.invariants();
let written = self.written;
FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
}
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
let Self {
mut allocation,
written,
} = iobuf;
allocation[0..written].fill(0);
let new = Self {
allocation,
written: 0,
};
new.invariants();
new
}
}
/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
///
/// Remember that bytes_init is generally _not_ a tracker of the amount
/// of valid data in the io buffer; we use `Slice` for that.
/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
///
/// SAFETY:
///
/// The [`Self::allocation`] is stable becauses boxes are stable.
/// The memory is zero-initialized, so, bytes_init is always N.
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
fn stable_ptr(&self) -> *const u8 {
self.allocation.as_ptr()
}
fn bytes_init(&self) -> usize {
// Yes, N, not self.written; Read the full comment of this impl block!
N
}
fn bytes_total(&self) -> usize {
N
}
}

View File

@@ -64,7 +64,7 @@ use std::os::unix::fs::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tokio_epoll_uring::IoBufMut;
use tokio_epoll_uring::IoBuf;
use tracing::*;
use utils::{
@@ -458,7 +458,7 @@ impl DeltaLayerWriterInner {
ctx: &RequestContext,
) -> (FullSlice<Buf>, anyhow::Result<()>)
where
Buf: IoBufMut + Send,
Buf: IoBuf + Send,
{
assert!(
self.lsn_range.start <= lsn,
@@ -666,7 +666,7 @@ impl DeltaLayerWriter {
ctx: &RequestContext,
) -> (FullSlice<Buf>, anyhow::Result<()>)
where
Buf: IoBufMut + Send,
Buf: IoBuf + Send,
{
self.inner
.as_mut()

View File

@@ -6,23 +6,24 @@
//!
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value};
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache, walrecord};
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Result};
use bytes::Bytes;
use camino::Utf8PathBuf;
use itertools::Itertools;
use pageserver_api::key::CompactKey;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tokio_epoll_uring::BoundedBuf;
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// avoid binding to Write (conflicts with std::io::Write)
@@ -80,7 +81,7 @@ pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
index: BTreeMap<CompactKey, VecMap<Lsn, InMemoryLayerIndexValue>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
@@ -89,6 +90,11 @@ pub struct InMemoryLayerInner {
resource_units: GlobalResourceUnits,
}
pub(crate) struct InMemoryLayerIndexValue {
pub(crate) pos: u32,
pub(crate) len: u32,
pub(crate) will_init: bool, // XXX this blows up the size, can we shrink down `len`?
}
impl std::fmt::Debug for InMemoryLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -230,7 +236,7 @@ impl InMemoryLayer {
}
}
pub(crate) fn try_len(&self) -> Option<u64> {
pub(crate) fn try_len(&self) -> Option<u32> {
self.inner.try_read().map(|i| i.file.len()).ok()
}
@@ -249,9 +255,7 @@ impl InMemoryLayer {
/// debugging function to print out the contents of the layer
///
/// this is likely completly unused
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().await;
pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
let end_str = self.end_lsn_or_max();
println!(
@@ -259,39 +263,6 @@ impl InMemoryLayer {
self.timeline_id, self.start_lsn, end_str,
);
if !verbose {
return Ok(());
}
let cursor = inner.file.block_cursor();
let mut buf = Vec::new();
for (key, vec_map) in inner.index.iter() {
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
}
}
println!(" key {} at {}: {}", key, lsn, desc);
}
}
Ok(())
}
@@ -311,7 +282,12 @@ impl InMemoryLayer {
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
struct ValueRead {
entry_lsn: Lsn,
read: vectored_dio_read::ValueRead<Vec<u8>>,
}
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner
@@ -326,24 +302,53 @@ impl InMemoryLayer {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
let buf = reader.read_blob(*pos, &ctx).await;
if let Err(e) = buf {
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
for (entry_lsn, index_value) in slice.iter().rev() {
reads.entry(key).or_default().push(ValueRead {
entry_lsn: *entry_lsn,
read: vectored_dio_read::ValueRead::new(
index_value.pos,
Vec::with_capacity(index_value.len as usize),
),
});
if index_value.will_init {
break;
}
}
}
}
let value = Value::des(&buf.unwrap());
if let Err(e) = value {
// Execute the read.
vectored_dio_read::execute(
&inner.file,
reads
.iter()
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
&ctx,
)
.await;
// Process results into the reconstruct state
'next_key: for (key, value_reads) in reads {
for ValueRead { entry_lsn, read } in value_reads {
match read.into_result() {
Err(e) => {
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
break;
continue 'next_key;
}
Ok(value_buf) => {
let value = Value::des(&value_buf);
if let Err(e) = value {
reconstruct_state
.on_key_error(key, PageReconstructError::from(anyhow!(e)));
continue 'next_key;
}
let key_situation =
reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
break;
let key_situation =
reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
// TODO: metric to see if we fetched more values than necessary
continue 'next_key;
}
}
}
}
@@ -355,6 +360,17 @@ impl InMemoryLayer {
}
}
impl vectored_dio_read::File for EphemeralFile {
async fn read_at_to_end<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
&'b self,
start: u32,
dst: tokio_epoll_uring::Slice<B>,
ctx: &'a RequestContext,
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
EphemeralFile::read_at_to_end(self, start, dst, ctx).await
}
}
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}
@@ -380,7 +396,7 @@ impl InMemoryLayer {
/// Get layer size.
pub async fn size(&self) -> Result<u64> {
let inner = self.inner.read().await;
Ok(inner.file.len())
Ok(inner.file.len() as u64)
}
/// Create a new, empty, in-memory layer
@@ -424,11 +440,13 @@ impl InMemoryLayer {
key: CompactKey,
lsn: Lsn,
buf: &[u8],
will_init: bool,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
self.put_value_locked(&mut inner, key, lsn, buf, will_init, ctx)
.await
}
async fn put_value_locked(
@@ -437,31 +455,31 @@ impl InMemoryLayer {
key: CompactKey,
lsn: Lsn,
buf: &[u8],
will_init: bool,
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let off = {
locked_inner
.file
.write_blob(
buf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
)
.await?
};
let entry = locked_inner
.file
.write_blob(
buf,
will_init,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
)
.await?;
let vec_map = locked_inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
let old = vec_map.append_or_update_last(lsn, entry).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Key {} at {} already exists", key, lsn);
}
let size = locked_inner.file.len();
locked_inner.resource_units.maybe_publish_size(size);
locked_inner.resource_units.maybe_publish_size(size as u64);
Ok(())
}
@@ -473,7 +491,7 @@ impl InMemoryLayer {
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();
inner.resource_units.publish_size(size)
inner.resource_units.publish_size(size as u64)
}
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
@@ -536,7 +554,6 @@ impl InMemoryLayer {
use l0_flush::Inner;
let _concurrency_permit = match l0_flush_global_state {
Inner::PageCached => None,
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
};
@@ -568,76 +585,31 @@ impl InMemoryLayer {
.await?;
match l0_flush_global_state {
l0_flush::Inner::PageCached => {
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let mut buf = Vec::new();
let cursor = inner.file.block_cursor();
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let (tmp, res) = delta_layer_writer
.put_value_bytes(
Key::from_compact(*key),
*lsn,
buf.slice_len(),
will_init,
&ctx,
)
.await;
res?;
buf = tmp.into_raw_slice().into_inner();
}
}
}
l0_flush::Inner::Direct { .. } => {
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
assert_eq!(
file_contents.len() % PAGE_SZ,
0,
"needed by BlockReaderRef::Slice"
);
assert_eq!(file_contents.len(), {
let written = usize::try_from(inner.file.len()).unwrap();
if written % PAGE_SZ == 0 {
written
} else {
written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
}
});
let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
let mut buf = Vec::new();
let file_contents = Bytes::from(file_contents);
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
// TODO: once we have blob lengths in the in-memory index, we can
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
// 2. load the file contents into a Bytes and
// 3. the use `Bytes::slice` to get the `buf` that is our blob
// 4. pass that `buf` into `put_value_bytes`
// => https://github.com/neondatabase/neon/issues/8183
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let (tmp, res) = delta_layer_writer
for (lsn, entry) in vec_map.as_slice() {
let InMemoryLayerIndexValue {
pos,
len,
will_init,
} = entry;
let buf =
Bytes::slice(&file_contents, *pos as usize..(*pos + *len) as usize);
let (_buf, res) = delta_layer_writer
.put_value_bytes(
Key::from_compact(*key),
*lsn,
buf.slice_len(),
will_init,
*will_init,
ctx,
)
.await;
res?;
buf = tmp.into_raw_slice().into_inner();
}
}
}
@@ -659,3 +631,5 @@ impl InMemoryLayer {
Ok(Some((desc, path)))
}
}
mod vectored_dio_read;

View File

@@ -0,0 +1,244 @@
use std::{
collections::BTreeMap,
sync::{Arc, Mutex},
};
use itertools::Itertools;
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
use crate::context::RequestContext;
pub trait File {
async fn read_at_to_end<'a, 'b, B: IoBufMut + Send>(
&'b self,
start: u32,
dst: Slice<B>,
ctx: &'a RequestContext,
) -> std::io::Result<(Slice<B>, usize)>;
}
trait Sealed {}
pub trait Buffer: Sealed + std::ops::Deref<Target = [u8]> {
fn cap(&self) -> usize;
fn len(&self) -> usize;
fn remaining(&self) -> usize {
self.cap().checked_sub(self.len()).unwrap()
}
/// Panics if the total length would exceed the initialized capacity.
fn extend_from_slice(&mut self, src: &[u8]);
}
pub struct ValueRead<B: Buffer> {
pos: u32,
state: MutexRefCell<Result<B, Arc<std::io::Error>>>,
}
struct MutexRefCell<T>(Mutex<T>);
impl<T> MutexRefCell<T> {
fn new(value: T) -> Self {
Self(Mutex::new(value))
}
fn borrow(&self) -> impl std::ops::Deref<Target = T> + '_ {
self.0.lock().unwrap()
}
fn borrow_mut(&self) -> impl std::ops::DerefMut<Target = T> + '_ {
self.0.lock().unwrap()
}
fn into_inner(self) -> T {
self.0.into_inner().unwrap()
}
}
impl<B: Buffer> ValueRead<B> {
pub fn new(pos: u32, buf: B) -> Self {
Self {
pos,
state: MutexRefCell::new(Ok(buf)),
}
}
pub fn into_result(self) -> Result<B, Arc<std::io::Error>> {
self.state.into_inner()
}
}
impl Sealed for Vec<u8> {}
impl Buffer for Vec<u8> {
fn cap(&self) -> usize {
self.capacity()
}
fn len(&self) -> usize {
self.len()
}
fn extend_from_slice(&mut self, src: &[u8]) {
if self.len() + src.len() > self.cap() {
panic!("Buffer capacity exceeded");
}
Vec::extend_from_slice(self, src);
}
}
pub async fn execute<'a, 'b, 'c, I, F, B>(file: &'c F, reads: I, ctx: &'b RequestContext)
where
I: IntoIterator<Item = &'a ValueRead<B>> + Send,
F: File + Send,
B: Buffer + IoBufMut + Send,
{
const DIO_CHUNK_SIZE: usize = 512;
// Plan which parts of which chunks need to be appended to which buffer
struct ChunkReadDestination<'a, B: Buffer> {
value_read: &'a ValueRead<B>,
offset_in_chunk: u32,
len: u32,
}
// use of BTreeMap's sorted iterator is critical to ensure buffer is filled in order
let mut chunk_reads: BTreeMap<u32, Vec<ChunkReadDestination<B>>> = BTreeMap::new();
for value_read in reads {
let ValueRead { pos, state } = value_read;
let len = state
.borrow()
.as_ref()
.expect("we haven't started reading, no chance it's in Err() state")
.len();
let mut remaining = usize::try_from(len).unwrap();
let mut chunk_no = *pos / (DIO_CHUNK_SIZE as u32);
let mut offset_in_chunk = usize::try_from(*pos % (DIO_CHUNK_SIZE as u32)).unwrap();
while remaining > 0 {
let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk);
chunk_reads
.entry(chunk_no)
.or_default()
.push(ChunkReadDestination {
value_read,
offset_in_chunk: offset_in_chunk as u32,
len: remaining_in_chunk as u32,
});
offset_in_chunk = 0;
chunk_no += 1;
remaining -= remaining_in_chunk;
}
}
// Merge adjacent chunk reads (merging pass on the BTreeMap iterator)
const MAX_CHUNK_BATCH_SIZE: usize = {
let desired = 128 * 1024; // 128k
if desired % DIO_CHUNK_SIZE != 0 {
panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE")
// compile-time error
}
desired / DIO_CHUNK_SIZE
};
struct MergedRead<'a, B: Buffer> {
start_chunk_no: u32,
nchunks: u32,
dsts: Vec<MergedChunkReadDestination<'a, B>>,
}
struct MergedChunkReadDestination<'a, B: Buffer> {
value_read: &'a ValueRead<B>,
offset_in_merged_read: u32,
len: u32,
}
let mut merged_reads: Vec<MergedRead<B>> = Vec::new();
let mut chunk_reads = chunk_reads.into_iter().peekable();
loop {
let mut last_chunk_no = None;
let to_merge: Vec<(u32, Vec<ChunkReadDestination<B>>)> = chunk_reads
.peeking_take_while(|(chunk_no, _)| {
if let Some(last_chunk_no) = last_chunk_no {
if *chunk_no != last_chunk_no + 1 {
return false;
}
}
last_chunk_no = Some(*chunk_no);
true
})
.take(MAX_CHUNK_BATCH_SIZE)
.collect(); // TODO: avoid this .collect()
let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else {
break;
};
let nchunks = to_merge.len() as u32;
let dsts = to_merge
.into_iter()
.enumerate()
.flat_map(|(i, (_, dsts))| {
dsts.into_iter().map(
move |ChunkReadDestination {
value_read,
offset_in_chunk,
len,
}| {
MergedChunkReadDestination {
value_read,
offset_in_merged_read: i as u32 * DIO_CHUNK_SIZE as u32
+ offset_in_chunk,
len,
}
},
)
})
.collect();
merged_reads.push(MergedRead {
start_chunk_no,
nchunks,
dsts,
});
}
drop(chunk_reads);
// Execute reads and fill the destination
// TODO: prefetch
let get_chunk_buf = |nchunks| Vec::with_capacity(nchunks as usize * (DIO_CHUNK_SIZE));
for MergedRead {
start_chunk_no,
nchunks,
dsts,
} in merged_reads
{
let all_done = dsts
.iter()
.all(|MergedChunkReadDestination { value_read, .. }| {
value_read.state.borrow().is_err()
});
if all_done {
continue;
}
let (merged_read_buf_slice, nread) = match file
.read_at_to_end(
start_chunk_no * DIO_CHUNK_SIZE as u32,
get_chunk_buf(nchunks).slice_full(),
ctx,
)
.await
{
Ok(t) => t,
Err(e) => {
let e = Arc::new(e);
for MergedChunkReadDestination { value_read, .. } in dsts {
*value_read.state.borrow_mut() = Err(Arc::clone(&e));
// this will make later reads for the given ValueRead short-circuit, see top of loop body
}
continue;
}
};
let merged_read_buf = merged_read_buf_slice.into_inner();
assert_eq!(nread, merged_read_buf.len());
let merged_read_buf = &merged_read_buf[..nread];
for MergedChunkReadDestination {
value_read,
offset_in_merged_read,
len,
} in dsts
{
if let Ok(buf) = &mut *value_read.state.borrow_mut() {
let data = &merged_read_buf
[offset_in_merged_read as usize..(offset_in_merged_read + len) as usize];
assert!(buf.remaining() >= data.len());
buf.extend_from_slice(data);
}
}
}
}

View File

@@ -1463,6 +1463,7 @@ impl Timeline {
tracing::warn!("Lock conflict while reading size of open layer");
return;
};
let current_size = current_size as u64;
let current_lsn = self.get_last_record_lsn();
@@ -5576,7 +5577,9 @@ impl<'a> TimelineWriter<'a> {
let action = self.get_open_layer_action(lsn, buf_size);
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
let res = layer
.put_value(key.to_compact(), lsn, &buf, value.will_init(), ctx)
.await;
if res.is_ok() {
// Update the current size only when the entire write was ok.