Compare commits

...

49 Commits

Author SHA1 Message Date
Christian Schwarz
63cbf75b02 clippy & doc check 2024-04-23 12:42:57 +00:00
Christian Schwarz
a57fee334f comment out the invariants check for zero_padded_buffer, it's too expensive 2024-04-23 12:01:09 +00:00
Christian Schwarz
842474f2d1 done 2024-04-23 11:45:27 +00:00
Christian Schwarz
6138941e6c WIP 2024-04-23 10:53:07 +00:00
Christian Schwarz
ec0e7b5021 WIP 2024-04-23 10:27:51 +00:00
Christian Schwarz
06cdbdff3d WIP: avoid impls of trait Buffer 2024-04-23 09:36:47 +00:00
Christian Schwarz
585a522ff4 remove stray comment 2024-04-22 17:24:52 +00:00
Christian Schwarz
ea5f307e81 refactor sandwiching code
Goals:
1. don't junk up the owned_buffers_io interface with
   write_all_borrowed, eliminating one `panic!("should not happen")`
2. make the sandwiching clearer
2024-04-22 16:38:44 +00:00
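For orientation, a minimal sketch of the write-path layering ("sandwich") this series converges on; the type names come from the diffs below, while the nesting shown is inferred from them:

// EphemeralFile
//   -> PageCachingSandwich               // page-cache read-through for flushed pages
//        -> Sandwich                     // routes reads: flushed file vs. mutable tail
//             -> BufferedWriter<zero_padded_buffer::Buf<TAIL_SZ>,
//                               size_tracking_writer::Writer<VirtualFile>>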
Christian Schwarz
9281f54527 fix doc comments 2024-04-17 13:35:52 +00:00
Christian Schwarz
717b7cb73e safety comment 2024-04-17 13:35:45 +00:00
Christian Schwarz
99056dde13 clippy nitpickery 2024-04-17 13:26:59 +00:00
Christian Schwarz
78233dc969 Merge remote-tracking branch 'origin/main' into problame/write-path-larger-buffers 2024-04-17 13:19:46 +00:00
Christian Schwarz
5e14362cf7 Revert the page_cache_priming_writer changes
It would require plumbing through the RequestContext, and its utility is
somewhat dubious anyway.
2024-03-28 14:18:40 +00:00
Christian Schwarz
fcd1ccfea7 drop the parametrization changes, not part of the feature PR 2024-03-28 12:44:56 +00:00
Christian Schwarz
27564dde78 Merge remote-tracking branch 'origin/main' into problame/write-path-larger-buffers
Conflicts:
	control_plane/src/local_env.rs
	control_plane/src/pageserver.rs
the above had been merged to `main` as preliminaries in the meantime
	pageserver/src/virtual_file.rs
	pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
	pageserver/src/virtual_file/owned_buffers_io/write.rs
strategy: pick ours (likely breaks the build, will fix subsequently)
2024-03-28 12:43:02 +00:00
Christian Schwarz
261f116a2d local benchmark run
test_bulk_insert[neon-release-pg14-std-fs].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 8.381 s

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 11.760 s
2024-03-14 17:29:04 +00:00
Christian Schwarz
79a6cda45d page_cache_priming_writer: done 2024-03-14 17:19:36 +00:00
Christian Schwarz
f0a67c4071 WIP: page_cache_priming_writer: clippy 2024-03-14 17:19:36 +00:00
Christian Schwarz
a1f4eb2815 WIP: page_cache_priming_writer: bugfixes 2024-03-14 17:15:30 +00:00
Christian Schwarz
eb1ccd7988 WIP: page_cache_priming_writer: plumb through RequestContext for previous commit, yet more churn -,- 2024-03-14 17:15:21 +00:00
Christian Schwarz
f1f0452722 WIP: page_cache_priming_writer (is it really worth it?) 2024-03-14 15:46:41 +00:00
Christian Schwarz
40200b7521 figure out why & when exactly zeroes past write offset are required & assert it 2024-03-14 12:04:17 +00:00
Christian Schwarz
91bd729be2 junk up owned_buffers_io from previous commit to deal with EphemeralFile speciality of reading zeroes past end-of-file
This makes it diverge semantically from what's in the tokio-epoll-uring
download PR :(
2024-03-13 18:23:12 +00:00
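As a hedged illustration of that speciality (assumed PAGE_SZ = 8192; `file` and `ctx` are hypothetical, and the returned block lease is treated as a plain byte slice for brevity):

// After one 100-byte blob (1-byte length header + 100 bytes payload, so len() == 101),
// reading block 0 must still yield a full PAGE_SZ page whose unwritten tail reads as
// zeroes, because blob_io always reads whole pages and relies on that padding.
let pos = file.write_blob(&[0xAB; 100], &ctx).await?;
let page = file.read_blk(0, &ctx).await?;
assert!(page[101..].iter().all(|b| *b == 0)); // zero padding past end-of-file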
Christian Schwarz
4b84f23cea larger buffers for the write path
The OwnedAsyncWrite stuff is based on the code in
tokio-epoll-uring on-demand download PR (#6992), which hasn't merged
yet.
2024-03-13 18:23:08 +00:00
Christian Schwarz
644c5e243d Revert "experiment(repeat, without preceding reverts) demonstrate that std-fs performs better because it hits the page cache"
This reverts commit d66ccbae5e.
2024-03-13 15:28:24 +00:00
Christian Schwarz
d66ccbae5e experiment(repeat, without preceding reverts) demonstrate that std-fs performs better because it hits the page cache
... by forcing each write system call to go to disk

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 93.417 s

test_bulk_insert[neon-release-pg14-std-fs].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 86.009 s

=> ~8% instead of 2x difference
2024-03-13 15:27:41 +00:00
Christian Schwarz
578a2d5d5f Revert "experiment: for create_delta_layer, use global io_engine, but inside a spawn_blocking single-threaded runtime"
This reverts commit 72a8e090dd.
2024-03-13 15:09:11 +00:00
Christian Schwarz
c9d1f51a93 Revert "experiment: for create_delta_layer _write path_, use StdFs io engine in a spawn_blocking thread single-threaded runtime"
This reverts commit 4a8e7f8716.
2024-03-13 15:09:06 +00:00
Christian Schwarz
1339834297 Revert "experiment: for EphemeralFile write path, use StdFs io engine"
This reverts commit c8c04c0db8.
2024-03-13 15:09:00 +00:00
Christian Schwarz
746fc530c5 "experiment: demonstrate that std-fs performs better because it hits the page cache"
This reverts commit 2edbc07733.
2024-03-13 15:08:43 +00:00
Christian Schwarz
94311052cd previous commit's numbers were with all the preceding experiments 2024-03-13 15:08:11 +00:00
Christian Schwarz
2edbc07733 experiment: demonstrate that std-fs performs better because it hits the page cache
... by forcing each write system call to go to disk

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 92.559 s

test_bulk_insert[neon-release-pg14-std-fs].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 81.998 s

=> 10%ish worse instead of 2x
2024-03-13 15:07:46 +00:00
Christian Schwarz
c8c04c0db8 experiment: for EphemeralFile write path, use StdFs io engine
together with previous commits, this brings us back down to
pre-regression

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 9.991 s
2024-03-13 14:09:21 +00:00
Christian Schwarz
4a8e7f8716 experiment: for create_delta_layer _write path_, use StdFs io engine in a spawn_blocking thread single-threaded runtime
builds on top of the previous commit

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 13.153 s
2024-03-13 14:09:16 +00:00
Christian Schwarz
72a8e090dd experiment: for create_delta_layer, use global io_engine, but inside a spawn_blocking single-threaded runtime
This makes things worse with tokio-epoll-uring

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 19.574 s

This is a partial revert of 3da410c8fe
2024-03-13 14:09:12 +00:00
Christian Schwarz
e0ea465aed Revert "experiment: Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)""
This reverts commit d3c157eeee.
2024-03-13 12:45:53 +00:00
Christian Schwarz
d3c157eeee experiment: Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)"
Unchanged

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 9.194 s

This reverts commit 3da410c8fe.
2024-03-13 12:45:40 +00:00
Christian Schwarz
c600355802 Revert "experiment: StdFs for EphemeralFile writes isn't the bottleneck"
This reverts commit 57241c1c5a.
2024-03-13 12:36:21 +00:00
Christian Schwarz
57241c1c5a experiment: StdFs for EphemeralFile writes isn't the bottleneck
With this

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 16.053 s

down from

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 17.669 s

but the regression is from baseline

test_bulk_insert[neon-release-pg14-std-fs].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 9.335 s
2024-03-13 11:41:17 +00:00
Christian Schwarz
b9c30dbd6b fix the parametrization 2024-03-12 20:24:45 +00:00
Christian Schwarz
35f8735a27 wip 2024-03-12 20:13:44 +00:00
Christian Schwarz
095130c1b3 DO NOT MERGE: always parametrize 2024-03-12 20:09:14 +00:00
Christian Schwarz
5a0277476d Revert "make changes preparing next commit"
This reverts commit e85a631ddb.
2024-03-12 20:09:14 +00:00
Christian Schwarz
6348833bdc expose that virtual_file_io_engine and get_vectored_impl were never set 2024-03-12 20:09:14 +00:00
Christian Schwarz
dbabd4e4ea Revert "expose that pageserver_virtual_file_io_engine test param was never used (same for get_vectored_impl)"
This reverts commit 5b8888ce6b.
2024-03-12 20:09:14 +00:00
Christian Schwarz
5b8888ce6b expose that pageserver_virtual_file_io_engine test param was never used (same for get_vectored_impl) 2024-03-12 19:56:02 +00:00
Christian Schwarz
e85a631ddb make changes preparing next commit 2024-03-12 19:56:02 +00:00
Christian Schwarz
95deea4f39 Revert "Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)""
This reverts commit 9876045444.
2024-03-12 18:53:16 +00:00
Christian Schwarz
9876045444 Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)"
This reverts commit 3da410c8fe.
2024-03-12 18:53:11 +00:00
10 changed files with 490 additions and 249 deletions

View File

@@ -121,7 +121,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
self.offset
}
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
/// Writes the given buffer directly to the underlying `VirtualFile`.
/// You need to make sure that the internal buffer is empty, otherwise

View File

@@ -3,36 +3,27 @@
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::page_cache;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use bytes::BytesMut;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::io;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::TimelineId;
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
file: VirtualFile,
len: u64,
/// An ephemeral file is append-only.
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
/// The other pages, which can no longer be modified, are accessed through the page cache.
///
/// None <=> IO is ongoing.
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
mutable_tail: Option<BytesMut>,
sandwich: page_caching_sandwich::PageCachingSandwich,
}
mod page_caching_sandwich;
mod sandwich;
mod zero_padded_buffer;
impl EphemeralFile {
pub async fn create(
conf: &PageServerConf,
@@ -59,21 +50,18 @@ impl EphemeralFile {
.await?;
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
file,
len: 0,
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
sandwich: page_caching_sandwich::PageCachingSandwich::new(file),
})
}
pub(crate) fn len(&self) -> u64 {
self.len
self.sandwich.bytes_written()
}
pub(crate) fn id(&self) -> page_cache::FileId {
self.page_cache_file_id
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
self.sandwich.page_cache_file_id()
}
pub(crate) async fn read_blk(
@@ -81,182 +69,30 @@ impl EphemeralFile {
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum, self.file.path, e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = self
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
};
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(
self.mutable_tail
.as_deref()
.expect("we're not doing IO, it must be Some()")
.try_into()
.expect("we ensure that it's always PAGE_SZ"),
))
}
self.sandwich.read_blk(blknum, ctx).await
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
_ctx: &RequestContext,
) -> Result<u64, io::Error> {
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
blknum: u32,
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
off: usize,
}
impl<'a> Writer<'a> {
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
Ok(Writer {
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
ephemeral_file,
})
}
#[inline(always)]
async fn push_bytes(
&mut self,
src: &[u8],
ctx: &RequestContext,
) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
let dst_remaining = &mut self
.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not yet ongoing")[self.off..];
let n = min(dst_remaining.len(), src_remaining.len());
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
.expect("IO is not yet ongoing");
let (mutable_tail, res) = self
.ephemeral_file
.file
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
.await;
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
// I.e., the IO isn't retryable if we panic.
self.ephemeral_file.mutable_tail = Some(mutable_tail);
match res {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
let cache = page_cache::get();
match cache
.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
ctx,
)
.await
{
Ok(page_cache::ReadBufResult::Found(_guard)) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
}
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(
self.ephemeral_file
.mutable_tail
.as_deref()
.expect("IO is not ongoing"),
);
let _ = write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
}
// Zero the buffer for re-use.
// Zeroing is critical for correctness because the write_blob code below
// and similarly read_blk expect zeroed pages.
self.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not ongoing")
.fill(0);
// This block is done, move to next one.
self.blknum += 1;
self.off = 0;
}
Err(e) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
self.blknum,
e,
self.ephemeral_file.file.path,
),
));
}
}
}
}
Ok(())
}
}
let pos = self.len;
let mut writer = Writer::new(self)?;
let pos = self.sandwich.bytes_written();
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
writer.push_bytes(&len_buf, ctx).await?;
self.sandwich.write_all_borrowed(&len_buf).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
writer.push_bytes(&len_buf, ctx).await?;
self.sandwich.write_all_borrowed(&len_buf).await?;
}
// Write the payload
writer.push_bytes(srcbuf, ctx).await?;
if srcbuf.len() < 0x80 {
self.len += 1;
} else {
self.len += 4;
}
self.len += srcbuf.len() as u64;
self.sandwich.write_all_borrowed(srcbuf).await?;
Ok(pos)
}
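For reference, the length-header convention implemented by write_blob above, as a self-contained sketch (an illustrative helper, not code from the PR):

/// Encode a blob length header: one byte for lengths < 0x80, otherwise a
/// big-endian u32 with the high bit set, mirroring write_blob above.
fn encode_len_header(len: usize) -> (usize, [u8; 4]) {
    if len < 0x80 {
        (1, [len as u8, 0, 0, 0]) // only the first byte is used
    } else {
        let mut buf = u32::to_be_bytes(len as u32);
        buf[0] |= 0x80;
        (4, buf)
    }
}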
@@ -271,28 +107,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
impl Drop for EphemeralFile {
fn drop(&mut self) {
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
let res = std::fs::remove_file(&self.file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.file.path, e
);
}
}
}
}
impl BlockReader for EphemeralFile {
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))

View File

@@ -0,0 +1,101 @@
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::BlockLease;
use crate::virtual_file::VirtualFile;
use std::io;
use tracing::*;
use super::sandwich;
pub struct PageCachingSandwich {
page_cache_file_id: page_cache::FileId,
sandwich: super::sandwich::Sandwich,
}
impl PageCachingSandwich {
pub fn new(file: VirtualFile) -> Self {
Self {
page_cache_file_id: page_cache::next_file_id(),
sandwich: super::sandwich::Sandwich::new(file),
}
}
pub fn page_cache_file_id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result<usize, io::Error> {
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
// because Compute is unlikely to access recently written data.
self.sandwich.write_all_borrowed(srcbuf).await
}
pub(crate) fn bytes_written(&self) -> u64 {
self.sandwich.bytes_written()
}
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
match self.sandwich.read_blk(blknum).await? {
sandwich::ReadResult::NeedsReadFromVirtualFile { virtual_file } => {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.sandwich.as_inner_virtual_file().path,
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = virtual_file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
}
}
sandwich::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
Ok(BlockLease::EphemeralFileMutableTail(buffer))
}
}
}
}
impl Drop for PageCachingSandwich {
fn drop(&mut self) {
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
let res = std::fs::remove_file(&self.sandwich.as_inner_virtual_file().path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.sandwich.as_inner_virtual_file().path,
e
);
}
}
}
}

View File

@@ -0,0 +1,99 @@
use crate::{
page_cache::PAGE_SZ,
tenant::ephemeral_file::zero_padded_buffer,
virtual_file::{
owned_buffers_io::{self, write::Buffer},
VirtualFile,
},
};
pub struct Sandwich {
buffered_writer: owned_buffers_io::write::BufferedWriter<
zero_padded_buffer::Buf<{ Self::TAIL_SZ }>,
owned_buffers_io::util::size_tracking_writer::Writer<VirtualFile>,
>,
}
pub enum ReadResult<'a> {
NeedsReadFromVirtualFile { virtual_file: &'a VirtualFile },
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
}
impl Sandwich {
const TAIL_SZ: usize = 64 * 1024;
pub fn new(file: VirtualFile) -> Self {
let bytes_flushed_tracker = owned_buffers_io::util::size_tracking_writer::Writer::new(file);
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
bytes_flushed_tracker,
zero_padded_buffer::Buf::default(),
);
Self { buffered_writer }
}
pub(crate) fn as_inner_virtual_file(&self) -> &VirtualFile {
self.buffered_writer.as_inner().as_inner()
}
pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.buffered_writer.write_buffered_borrowed(buf).await
}
pub fn bytes_written(&self) -> u64 {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded_buffer::Buf<{ Self::TAIL_SZ }> =
self.buffered_writer.inspect_buffer();
flushed_offset + u64::try_from(buffer.pending()).unwrap()
}
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult, std::io::Error> {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded_buffer::Buf<{ Self::TAIL_SZ }> =
self.buffered_writer.inspect_buffer();
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
assert_eq!(
flushed_offset % (PAGE_SZ as u64),
0,
"we need this in the logic below, because it assumes the page isn't spread across flushed part and in-memory buffer"
);
if read_offset < flushed_offset {
assert!(
read_offset + (PAGE_SZ as u64) <= flushed_offset,
"this impl can't deal with pages spread across flushed & buffered part"
);
Ok(ReadResult::NeedsReadFromVirtualFile {
virtual_file: self.as_inner_virtual_file(),
})
} else {
let read_until_offset = read_offset + (PAGE_SZ as u64);
if !(0..buffered_offset).contains(&read_until_offset) {
// The blob_io code relies on the reader allowing reads past
// the end of what was written, up to the end of the current PAGE_SZ chunk.
// This is a relic of the past, when we would get a pre-zeroed page from the page cache.
//
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
let nbytes_past_end = read_until_offset.checked_sub(buffered_offset).unwrap();
if nbytes_past_end >= (PAGE_SZ as u64) {
// TODO: treat this as error. Pre-existing issue before this patch.
panic!(
"return IO error: read past end of file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"
)
}
}
let read_offset_in_buffer = read_offset
.checked_sub(flushed_offset)
.expect("would have taken `if` branch instead of this one");
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
let zero_padded_slice = buffer.as_zero_padded_slice();
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
buffer: page
.try_into()
.expect("the slice above got it as page-size slice"),
})
}
}
}
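To make the offset arithmetic in read_blk concrete, a worked example under this file's constants (PAGE_SZ = 8192, TAIL_SZ = 64 * 1024; flushed_offset is a multiple of TAIL_SZ, and hence of PAGE_SZ, because the buffered writer only flushes full buffers):

// flushed_offset   = 65536   (one full 64 KiB buffer already flushed)
// buffer.pending() = 300     => buffered_offset = 65836
//
// read_blk(7): read_offset = 57344 <  65536 => ReadResult::NeedsReadFromVirtualFile
// read_blk(8): read_offset = 65536 >= 65536 => served from the zero-padded tail;
//              bytes 65836..73728 of the returned page are the zero padding
// read_blk(9): read_until_offset = 81920 exceeds buffered_offset by 16084 >= PAGE_SZ
//              => the "read past end of file" panic above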

View File

@@ -0,0 +1,98 @@
use std::mem::MaybeUninit;
pub struct Buf<const N: usize> {
allocation: Box<[u8; N]>,
written: usize,
}
impl<const N: usize> Default for Buf<N> {
fn default() -> Self {
Self {
allocation: Box::new(
// SAFETY: zeroed memory is a valid [u8; N]
unsafe { MaybeUninit::zeroed().assume_init() },
),
written: 0,
}
}
}
impl<const N: usize> Buf<N> {
#[inline(always)]
fn invariants(&self) {
// don't check by default, unoptimized is too expensive even for debug mode
if false {
debug_assert!(self.written <= N, "{}", self.written);
debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
}
}
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
&self.allocation
}
}
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buf<N> {
type IoBuf = Self;
fn cap(&self) -> usize {
self.allocation.len()
}
fn extend_from_slice(&mut self, other: &[u8]) {
self.invariants();
let remaining = self.cap() - self.written;
if other.len() > remaining {
panic!("calling extend_from_slice() with insufficient remaining capacity");
}
self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
self.written += other.len();
self.invariants();
}
fn pending(&self) -> usize {
self.written
}
fn flush(self) -> tokio_epoll_uring::Slice<Self> {
self.invariants();
let written = self.written;
tokio_epoll_uring::BoundedBuf::slice(self, 0..written)
}
fn reconstruct_after_flush(this: Self::IoBuf) -> Self {
let Self {
mut allocation,
written,
} = this;
allocation[0..written].fill(0);
let new = Self {
allocation,
written: 0,
};
new.invariants();
new
}
}
/// We have this implementation so that [`Buf<N>`] can be used with [`crate::virtual_file::owned_buffers_io::write::BufferedWriter`].
///
///
/// SAFETY:
///
/// The [`Self::allocation`] is stable because boxes are stable.
/// The memory is zero-initialized, so `bytes_init` is always N.
///
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buf<N> {
fn stable_ptr(&self) -> *const u8 {
self.allocation.as_ptr()
}
fn bytes_init(&self) -> usize {
N
}
fn bytes_total(&self) -> usize {
N
}
}
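A hedged usage sketch of the buffer lifecycle implemented above (assuming `tokio_epoll_uring::Slice` exposes `into_inner()`, as its tokio-uring counterpart does):

use crate::virtual_file::owned_buffers_io::write::Buffer;

let mut buf: Buf<8192> = Buf::default();  // boxed allocation, fully zeroed
buf.extend_from_slice(b"hello");
assert_eq!(buf.pending(), 5);
let slice = buf.flush();                  // Slice over bytes 0..5 for the owned write
let buf = Buf::<8192>::reconstruct_after_flush(slice.into_inner());
assert_eq!(buf.pending(), 0);             // written prefix re-zeroed, ready for reuse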

View File

@@ -7,6 +7,7 @@ use std::collections::HashSet;
use std::future::Future;
use anyhow::{anyhow, Context};
use bytes::BytesMut;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::shard::TenantShardId;
use tokio::fs::{self, File, OpenOptions};
@@ -194,10 +195,10 @@ async fn download_object<'a>(
// There's chunks_vectored() on the stream.
let (bytes_amount, destination_file) = async {
let size_tracking = size_tracking_writer::Writer::new(destination_file);
let mut buffered = owned_buffers_io::write::BufferedWriter::<
{ super::BUFFER_SIZE },
_,
>::new(size_tracking);
let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
size_tracking,
BytesMut::with_capacity(super::BUFFER_SIZE),
);
while let Some(res) =
futures::StreamExt::next(&mut download.download_stream).await
{

View File

@@ -454,7 +454,7 @@ impl InMemoryLayer {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.id());
let key = InMemoryLayerFileId(file.page_cache_file_id());
Ok(InMemoryLayer {
file_id: key,

View File

@@ -36,7 +36,8 @@ pub(crate) use io_engine::IoEngineKind;
pub(crate) use metadata::Metadata;
pub(crate) use open_options::*;
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
use self::owned_buffers_io::write::OwnedAsyncWriter;
pub(crate) mod owned_buffers_io {
//! Abstractions for IO with owned buffers.
//!
@@ -1083,6 +1084,17 @@ impl Drop for VirtualFile {
}
}
impl OwnedAsyncWriter for VirtualFile {
#[inline(always)]
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
let (buf, res) = VirtualFile::write_all(self, buf).await;
res.map(move |v| (v, buf))
}
}
impl OpenFiles {
fn new(num_slots: usize) -> OpenFiles {
let mut slots = Box::new(Vec::with_capacity(num_slots));

View File

@@ -1,33 +1,41 @@
use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
use tokio_epoll_uring::{BoundedBuf, IoBuf};
pub struct Writer {
dst: VirtualFile,
pub struct Writer<W> {
dst: W,
bytes_amount: u64,
}
impl Writer {
pub fn new(dst: VirtualFile) -> Self {
impl<W> Writer<W> {
pub fn new(dst: W) -> Self {
Self {
dst,
bytes_amount: 0,
}
}
pub fn bytes_written(&self) -> u64 {
self.bytes_amount
}
pub fn as_inner(&self) -> &W {
&self.dst
}
/// Returns the wrapped `VirtualFile` object as well as the number
/// of bytes that were written to it through this object.
pub fn into_inner(self) -> (u64, VirtualFile) {
pub fn into_inner(self) -> (u64, W) {
(self.bytes_amount, self.dst)
}
}
impl OwnedAsyncWriter for Writer {
impl<W> OwnedAsyncWriter for Writer<W>
where
W: OwnedAsyncWriter,
{
#[inline(always)]
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
let (buf, res) = self.dst.write_all(buf).await;
let nwritten = res?;
let (nwritten, buf) = self.dst.write_all(buf).await?;
self.bytes_amount += u64::try_from(nwritten).unwrap();
Ok((nwritten, buf))
}

View File

@@ -10,6 +10,47 @@ pub trait OwnedAsyncWriter {
) -> std::io::Result<(usize, B::Buf)>;
}
pub trait Buffer {
type IoBuf: IoBuf;
fn cap(&self) -> usize;
fn extend_from_slice(&mut self, other: &[u8]);
fn pending(&self) -> usize;
fn flush(self) -> Slice<Self::IoBuf>;
fn reconstruct_after_flush(iobuf: Self::IoBuf) -> Self;
}
impl Buffer for BytesMut {
type IoBuf = BytesMut;
#[inline(always)]
fn cap(&self) -> usize {
self.capacity()
}
fn extend_from_slice(&mut self, other: &[u8]) {
BytesMut::extend_from_slice(self, other)
}
#[inline(always)]
fn pending(&self) -> usize {
self.len()
}
fn flush(self) -> Slice<BytesMut> {
if self.is_empty() {
return self.slice_full();
}
let len = self.len();
self.slice(0..len)
}
fn reconstruct_after_flush(mut iobuf: BytesMut) -> Self {
iobuf.clear();
iobuf
}
}
/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes
/// into `BUFFER_SIZE`-sized writes.
///
@@ -25,27 +66,38 @@ pub trait OwnedAsyncWriter {
///
/// In such cases, a different implementation that always buffers in memory
/// may be preferable.
pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
pub struct BufferedWriter<B, W> {
writer: W,
// invariant: always remains Some(buf)
// with buf.capacity() == BUFFER_SIZE except
// - while IO is ongoing => goes back to Some() once the IO completed successfully
// - after an IO error => stays `None` forever
// In these exceptional cases, it's `None`.
buf: Option<BytesMut>,
buf: Option<B>,
}
impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
impl<B, Buf, W> BufferedWriter<B, W>
where
B: Buffer<IoBuf = Buf> + Send,
Buf: IoBuf + Send,
W: OwnedAsyncWriter,
{
pub fn new(writer: W) -> Self {
pub fn new(writer: W, buf: B) -> Self {
Self {
writer,
buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
buf: Some(buf),
}
}
pub fn as_inner(&self) -> &W {
&self.writer
}
/// Panics if used after any of the write paths returned an error
pub fn inspect_buffer(&self) -> &B {
self.buf()
}
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
self.flush().await?;
let Self { buf, writer } = self;
@@ -53,57 +105,84 @@ where
Ok(writer)
}
pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
#[inline(always)]
fn buf(&self) -> &B {
self.buf
.as_ref()
.expect("must not use after we returned an error")
}
pub async fn write_buffered<S: IoBuf>(&mut self, chunk: Slice<S>) -> std::io::Result<(usize, S)>
where
B: IoBuf + Send,
S: IoBuf + Send,
{
let chunk_len = chunk.len();
// avoid memcpy for the middle of the chunk
if chunk.len() >= BUFFER_SIZE {
if chunk.len() >= self.buf().cap() {
self.flush().await?;
// do a big write, bypassing `buf`
assert_eq!(
self.buf
.as_ref()
.expect("must not use after an error")
.len(),
.pending(),
0
);
let chunk_len = chunk.len();
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
assert_eq!(nwritten, chunk_len);
drop(chunk);
return Ok(());
return Ok((nwritten, chunk));
}
// in-memory copy the tail of the chunk that is smaller than the buffer capacity
assert!(chunk.len() < BUFFER_SIZE);
let mut chunk = &chunk[..];
assert!(chunk.len() < self.buf().cap());
let mut slice = &chunk[..];
while !slice.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = buf.cap() - buf.pending();
let have = slice.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&slice[..n]);
slice = &slice[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
self.flush().await?;
}
}
assert!(slice.is_empty(), "by now we should have drained the chunk");
Ok((chunk_len, chunk.into_inner()))
}
/// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
///
/// It is less performant because we always have to copy the borrowed data into the internal buffer
/// before we can do the IO. [`Self::write_buffered`] can avoid the copy, which is more performant
/// for large writes.
pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result<usize> {
let chunk_len = chunk.len();
while !chunk.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = BUFFER_SIZE - buf.len();
let need = buf.cap() - buf.pending();
let have = chunk.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&chunk[..n]);
chunk = &chunk[n..];
if buf.len() >= BUFFER_SIZE {
assert_eq!(buf.len(), BUFFER_SIZE);
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
self.flush().await?;
}
}
assert!(chunk.is_empty(), "by now we should have drained the chunk");
Ok(())
Ok(chunk_len)
}
async fn flush(&mut self) -> std::io::Result<()> {
let buf = self.buf.take().expect("must not use after an error");
if buf.is_empty() {
let buf_len = buf.pending();
if buf_len == 0 {
self.buf = Some(buf);
return std::io::Result::Ok(());
return Ok(());
}
let buf_len = buf.len();
let (nwritten, mut buf) = self.writer.write_all(buf).await?;
let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?;
assert_eq!(nwritten, buf_len);
buf.clear();
self.buf = Some(buf);
self.buf = Some(Buffer::reconstruct_after_flush(io_buf));
Ok(())
}
}
@@ -125,6 +204,8 @@ impl OwnedAsyncWriter for Vec<u8> {
#[cfg(test)]
mod tests {
use bytes::BytesMut;
use super::*;
#[derive(Default)]
@@ -158,7 +239,7 @@ mod tests {
#[tokio::test]
async fn test_buffered_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::<2, _>::new(recorder);
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
write!(writer, b"a");
write!(writer, b"b");
write!(writer, b"c");
@@ -175,7 +256,7 @@ mod tests {
#[tokio::test]
async fn test_passthrough_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::<2, _>::new(recorder);
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
write!(writer, b"abc");
write!(writer, b"de");
write!(writer, b"");
@@ -191,7 +272,7 @@ mod tests {
#[tokio::test]
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::<2, _>::new(recorder);
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
write!(writer, b"a");
write!(writer, b"bc");
write!(writer, b"d");
@@ -203,4 +284,31 @@ mod tests {
);
Ok(())
}
#[tokio::test]
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
writer.write_buffered_borrowed(b"abc").await?;
writer.write_buffered_borrowed(b"d").await?;
writer.write_buffered_borrowed(b"e").await?;
writer.write_buffered_borrowed(b"fg").await?;
writer.write_buffered_borrowed(b"hi").await?;
writer.write_buffered_borrowed(b"j").await?;
writer.write_buffered_borrowed(b"klmno").await?;
let recorder = writer.flush_and_into_inner().await?;
assert_eq!(
recorder.writes,
{
let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
expect
}
.iter()
.map(|v| v[..].to_vec())
.collect::<Vec<_>>()
);
Ok(())
}
}