Compare commits

...

34 Commits

Author SHA1 Message Date
Christian Schwarz
261f116a2d local benchmark run
test_bulk_insert[neon-release-pg14-std-fs].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 8.381 s

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 11.760 s
2024-03-14 17:29:04 +00:00
Christian Schwarz
79a6cda45d page_cache_priming_writer: done 2024-03-14 17:19:36 +00:00
Christian Schwarz
f0a67c4071 WIP: page_cache_priming_writer: clippy 2024-03-14 17:19:36 +00:00
Christian Schwarz
a1f4eb2815 WIP: page_cache_priming_writer: bugfixes 2024-03-14 17:15:30 +00:00
Christian Schwarz
eb1ccd7988 WIP: page_cache_priming_writer: plumb through RequestContext for previous commit, yet more churn -,- 2024-03-14 17:15:21 +00:00
Christian Schwarz
f1f0452722 WIP: page_cache_priming_writer (is it really worth it?) 2024-03-14 15:46:41 +00:00
Christian Schwarz
40200b7521 figure out why & when exactly zeroes past write offset are required & assert it 2024-03-14 12:04:17 +00:00
Christian Schwarz
91bd729be2 junk up owned_buffers_io from previous commit to deal with EphemeralFile speciality of reading zeroes past end-of-file
This makes it diverge semantically from what's in the tokio-epoll-uring
download PR :(
2024-03-13 18:23:12 +00:00
Christian Schwarz
4b84f23cea larger buffers for the write path
The OwnedAsyncWriter stuff is based on the code in
tokio-epoll-uring on-demand download PR (#6992), which hasn't merged
yet.
2024-03-13 18:23:08 +00:00
Christian Schwarz
644c5e243d Revert "experiment(repeat, without preceding reverts) demonstrate that std-fs performs better because it hits the page cache"
This reverts commit d66ccbae5e.
2024-03-13 15:28:24 +00:00
Christian Schwarz
d66ccbae5e experiment(repeat, without preceding reverts) demonstrate that std-fs performs better because it hits the page cache
... by forcing each write system call to go to disk

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 93.417 s

test_bulk_insert[neon-release-pg14-std-fs].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 86.009 s

=> ~8% instead of 2x difference
2024-03-13 15:27:41 +00:00
Christian Schwarz
578a2d5d5f Revert "experiment: for create_delta_layer, use global io_engine, but inside a spawn_blocking single-threaded runtime"
This reverts commit 72a8e090dd.
2024-03-13 15:09:11 +00:00
Christian Schwarz
c9d1f51a93 Revert "experiment: for create_delta_layer _write path_, use StdFs io engine in a spawn_blocking thread single-threaded runtime"
This reverts commit 4a8e7f8716.
2024-03-13 15:09:06 +00:00
Christian Schwarz
1339834297 Revert "experiment: for EphemeralFile write path, use StdFs io engine"
This reverts commit c8c04c0db8.
2024-03-13 15:09:00 +00:00
Christian Schwarz
746fc530c5 "experiment: demonstrate that std-fs performs better because it hits the page cache"
This reverts commit 2edbc07733.
2024-03-13 15:08:43 +00:00
Christian Schwarz
94311052cd previous commit's numbers were with all the preceding experiments 2024-03-13 15:08:11 +00:00
Christian Schwarz
2edbc07733 experiment: demonstrate that std-fs performs better because it hits the page cache
... by forcing each write system call to go to disk

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 92.559 s

test_bulk_insert[neon-release-pg14-std-fs].wal_written: 346 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 81.998 s

=> 10%ish worse instead of 2x
2024-03-13 15:07:46 +00:00
Christian Schwarz
c8c04c0db8 experiment: for EphemeralFile write path, use StdFs io engine
together with previous commits, this brings us back down to
pre-regression

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 9.991 s
2024-03-13 14:09:21 +00:00
Christian Schwarz
4a8e7f8716 experiment: for create_delta_layer _write path_, use StdFs io engine in a spawn_blocking thread single-threaded runtime
builds on top of the previous commit

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 13.153 s
2024-03-13 14:09:16 +00:00
Christian Schwarz
72a8e090dd experiment: for create_delta_layer, use global io_engine, but inside a spawn_blocking single-threaded runtime
This makes things worse with tokio-epoll-uring

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 19.574 s

This is a partial revert of 3da410c8fe
2024-03-13 14:09:12 +00:00
Christian Schwarz
e0ea465aed Revert "experiment: Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)""
This reverts commit d3c157eeee.
2024-03-13 12:45:53 +00:00
Christian Schwarz
d3c157eeee experiment: Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)"
Unchanged

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 9.194 s

This reverts commit 3da410c8fe.
2024-03-13 12:45:40 +00:00
Christian Schwarz
c600355802 Revert "experiment: StdFs for EphemeralFile writes isn't the bottleneck"
This reverts commit 57241c1c5a.
2024-03-13 12:36:21 +00:00
Christian Schwarz
57241c1c5a experiment: StdFs for EphemeralFile writes isn't the bottleneck
With this

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 16.053 s

down from

test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-tokio-epoll-uring].wal_recovery: 17.669 s

but the regression is from baseline

test_bulk_insert[neon-release-pg14-std-fs].wal_written: 345 MB
test_bulk_insert[neon-release-pg14-std-fs].wal_recovery: 9.335 s
2024-03-13 11:41:17 +00:00
Christian Schwarz
b9c30dbd6b fix the parametrization 2024-03-12 20:24:45 +00:00
Christian Schwarz
35f8735a27 wip 2024-03-12 20:13:44 +00:00
Christian Schwarz
095130c1b3 DO NOT MERGE: always parametrize 2024-03-12 20:09:14 +00:00
Christian Schwarz
5a0277476d Revert "make changes preparing next commit"
This reverts commit e85a631ddb.
2024-03-12 20:09:14 +00:00
Christian Schwarz
6348833bdc expose that virtual_file_io_engine and get_vectored_impl were never set 2024-03-12 20:09:14 +00:00
Christian Schwarz
dbabd4e4ea Revert "expose that pageserver_virtual_file_io_engine test param was never used (same for get_vectored_impl)"
This reverts commit 5b8888ce6b.
2024-03-12 20:09:14 +00:00
Christian Schwarz
5b8888ce6b expose that pageserver_virtual_file_io_engine test param was never used (same for get_vectored_impl) 2024-03-12 19:56:02 +00:00
Christian Schwarz
e85a631ddb make changes preparing next commit 2024-03-12 19:56:02 +00:00
Christian Schwarz
95deea4f39 Revert "Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)""
This reverts commit 9876045444.
2024-03-12 18:53:16 +00:00
Christian Schwarz
9876045444 Revert "tokio-epoll-uring: use it on the layer-creating code paths (#6378)"
This reverts commit 3da410c8fe.
2024-03-12 18:53:11 +00:00
10 changed files with 846 additions and 164 deletions

View File

@@ -114,7 +114,7 @@ impl NeonBroker {
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
#[serde(default, deny_unknown_fields)]
pub struct PageServerConf {
// node id
pub id: NodeId,
@@ -126,6 +126,9 @@ pub struct PageServerConf {
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub(crate) virtual_file_io_engine: String,
pub(crate) get_vectored_impl: String,
}
impl Default for PageServerConf {
@@ -136,6 +139,9 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
// FIXME: use the ones exposed by pageserver crate
virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
get_vectored_impl: "sequential".to_owned(),
}
}
}
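
The practical effect of adding `deny_unknown_fields` next to `#[serde(default)]` is that a mistyped override key now fails config parsing instead of being silently dropped while the field keeps its default. A minimal standalone sketch of that behavior (assuming the `serde` derive feature and the `toml` crate; the `PageServerConf` here is a cut-down stand-in, not the real struct):

use serde::Deserialize;

#[derive(Deserialize, Debug, Default)]
#[serde(default, deny_unknown_fields)]
struct PageServerConf {
    virtual_file_io_engine: String,
    get_vectored_impl: String,
}

fn main() {
    // Known keys parse; keys that are absent fall back to Default::default().
    let ok: PageServerConf =
        toml::from_str("virtual_file_io_engine = 'tokio-epoll-uring'").unwrap();
    println!("{ok:?}");

    // A misspelled key is now a hard error instead of being silently ignored.
    let err = toml::from_str::<PageServerConf>("virtual_file_io_enigne = 'std-fs'");
    assert!(err.is_err());
}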

View File

@@ -78,18 +78,31 @@ impl PageServerNode {
///
/// These all end up on the command line of the `pageserver` binary.
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
let id = format!("id={}", self.conf.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display()
);
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
let PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
} = &self.conf;
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
let id = format!("id={}", id);
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -101,6 +114,8 @@ impl PageServerNode {
listen_http_addr_param,
listen_pg_addr_param,
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -111,7 +126,7 @@ impl PageServerNode {
// Storage controller uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
if matches!(http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -129,8 +144,7 @@ impl PageServerNode {
));
}
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
{
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
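
Replacing the `self.conf.<field>` accesses with an exhaustive destructuring of `PageServerConf` means that any field added to the struct but not turned into a command-line override becomes a compile-time complaint rather than a silently ignored setting, which appears to be the point of the churn in this hunk. A small standalone sketch of the pattern (field names are illustrative, not the full config):

struct Conf {
    id: u64,
    listen_http_addr: String,
    virtual_file_io_engine: String,
}

fn overrides(conf: &Conf) -> Vec<String> {
    // Exhaustive destructuring: adding a field to `Conf` without handling it
    // here is a compile error ("pattern does not mention field ..."),
    // unlike `conf.field` access, which would just ignore the new field.
    let Conf {
        id,
        listen_http_addr,
        virtual_file_io_engine,
    } = conf;
    vec![
        format!("id={id}"),
        format!("listen_http_addr='{listen_http_addr}'"),
        format!("virtual_file_io_engine='{virtual_file_io_engine}'"),
    ]
}

fn main() {
    let conf = Conf {
        id: 1,
        listen_http_addr: "127.0.0.1:9898".to_owned(),
        virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
    };
    println!("{:?}", overrides(&conf));
}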

View File

@@ -121,7 +121,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
self.offset
}
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
/// Writes the given buffer directly to the underlying `VirtualFile`.
/// You need to make sure that the internal buffer is empty, otherwise

View File

@@ -5,14 +5,12 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use bytes::BytesMut;
use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::io;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::TimelineId;
@@ -22,18 +20,31 @@ pub struct EphemeralFile {
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
file: VirtualFile,
len: u64,
/// An ephemeral file is append-only.
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
/// The other pages, which can no longer be modified, are accessed through the page cache.
/// We sandwich the buffered writer between two size-tracking writers.
/// This allows us to "elegantly" track in-memory bytes vs flushed bytes,
/// enabling [`Self::read_blk`] to determine whether to read from the
/// buffered writer's buffer, versus going to the VirtualFile.
///
/// None <=> IO is ongoing.
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
mutable_tail: Option<BytesMut>,
/// TODO: longer-term, we probably want to get rid of this in favor
/// of a double-buffering scheme. See this commit's commit message
/// and git history for what we had before this sandwich, it might be useful.
file: owned_buffers_io::util::size_tracking_writer::Writer<
owned_buffers_io::write::BufferedWriter<
{ Self::TAIL_SZ },
owned_buffers_io::util::size_tracking_writer::Writer<
owned_buffers_io::util::page_cache_priming_writer::Writer<
PAGE_SZ,
owned_buffers_io::util::page_cache_priming_writer::VirtualFileAdaptor,
&'static crate::page_cache::PageCache,
>,
>,
>,
>,
}
impl EphemeralFile {
const TAIL_SZ: usize = 64 * 1024;
pub async fn create(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
@@ -58,18 +69,27 @@ impl EphemeralFile {
)
.await?;
let page_cache_file_id = page_cache::next_file_id();
let file = owned_buffers_io::util::page_cache_priming_writer::VirtualFileAdaptor::new(
file,
page_cache_file_id,
);
let file =
owned_buffers_io::util::page_cache_priming_writer::Writer::new(file, page_cache::get());
let file = owned_buffers_io::util::size_tracking_writer::Writer::new(file);
let file = owned_buffers_io::write::BufferedWriter::new(file);
let file = owned_buffers_io::util::size_tracking_writer::Writer::new(file);
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
page_cache_file_id,
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
file,
len: 0,
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
})
}
pub(crate) fn len(&self) -> u64 {
self.len
self.file.bytes_written()
}
pub(crate) async fn read_blk(
@@ -77,8 +97,22 @@ impl EphemeralFile {
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let buffered_offset = self.file.bytes_written();
let flushed_offset = self.file.as_inner().as_inner().bytes_written();
assert!(buffered_offset >= flushed_offset);
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
assert_eq!(
flushed_offset % (PAGE_SZ as u64),
0,
"we need this in the logic below, because it assumes the page isn't spread across flushed part and in-memory buffer"
);
if read_offset < flushed_offset {
assert!(
read_offset + (PAGE_SZ as u64) <= flushed_offset,
"this impl can't deal with pages spread across flushed & buffered part"
);
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
@@ -89,7 +123,15 @@ impl EphemeralFile {
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum, self.file.path, e,
blknum,
self.file
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.path,
e,
),
)
})? {
@@ -99,6 +141,11 @@ impl EphemeralFile {
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = self
.file
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
@@ -106,13 +153,31 @@ impl EphemeralFile {
}
};
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
let read_until_offset = read_offset + (PAGE_SZ as u64);
if !(0..buffered_offset).contains(&read_until_offset) {
// The blob_io code relies on the reader allowing reads past
// the end of what was written, up to end of the current PAGE_SZ chunk.
// This is a relic of the past where we would get a pre-zeroed page from the page cache.
//
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
let nbytes_past_end = read_until_offset.checked_sub(buffered_offset).unwrap();
if nbytes_past_end >= (PAGE_SZ as u64) {
// TODO: treat this as error. Pre-existing issue before this patch.
panic!(
"return IO error: read past end of file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"
)
}
}
let buffer: &[u8; Self::TAIL_SZ] = self.file.as_inner().inspect_buffer();
let read_offset_in_buffer = read_offset
.checked_sub(flushed_offset)
.expect("would have taken `if` branch instead of this one");
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
let page = &buffer[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
Ok(BlockLease::EphemeralFileMutableTail(
self.mutable_tail
.as_deref()
.expect("we're not doing IO, it must be Some()")
.try_into()
.expect("we ensure that it's always PAGE_SZ"),
page.try_into()
.expect("the slice above got it as page-size slice"),
))
}
}
@@ -122,137 +187,22 @@ impl EphemeralFile {
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<u64, io::Error> {
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
blknum: u32,
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
off: usize,
}
impl<'a> Writer<'a> {
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
Ok(Writer {
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
ephemeral_file,
})
}
#[inline(always)]
async fn push_bytes(
&mut self,
src: &[u8],
ctx: &RequestContext,
) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
let dst_remaining = &mut self
.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not yet ongoing")[self.off..];
let n = min(dst_remaining.len(), src_remaining.len());
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
.expect("IO is not yet ongoing");
let (mutable_tail, res) = self
.ephemeral_file
.file
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
.await;
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
// I.e., the IO isn't retryable if we panic.
self.ephemeral_file.mutable_tail = Some(mutable_tail);
match res {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
let cache = page_cache::get();
match cache
.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
ctx,
)
.await
{
Ok(page_cache::ReadBufResult::Found(_guard)) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
}
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(
self.ephemeral_file
.mutable_tail
.as_deref()
.expect("IO is not ongoing"),
);
let _ = write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
}
// Zero the buffer for re-use.
// Zeroing is critical for correctness because the write_blob code below
// and similarly read_blk expect zeroed pages.
self.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not ongoing")
.fill(0);
// This block is done, move to next one.
self.blknum += 1;
self.off = 0;
}
Err(e) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
self.blknum,
e,
self.ephemeral_file.file.path,
),
));
}
}
}
}
Ok(())
}
}
let pos = self.len;
let mut writer = Writer::new(self)?;
let pos = self.file.bytes_written();
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
writer.push_bytes(&len_buf, ctx).await?;
self.file.write_all_borrowed(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
writer.push_bytes(&len_buf, ctx).await?;
self.file.write_all_borrowed(&len_buf, ctx).await?;
}
// Write the payload
writer.push_bytes(srcbuf, ctx).await?;
if srcbuf.len() < 0x80 {
self.len += 1;
} else {
self.len += 4;
}
self.len += srcbuf.len() as u64;
self.file.write_all_borrowed(srcbuf, ctx).await?;
Ok(pos)
}
@@ -273,7 +223,16 @@ impl Drop for EphemeralFile {
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
let res = std::fs::remove_file(&self.file.path);
let res = std::fs::remove_file(
&self
.file
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.path,
);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
@@ -282,7 +241,14 @@ impl Drop for EphemeralFile {
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.file.path, e
self.file
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.as_inner()
.path,
e
);
}
}
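
To make the doc comment's "sandwich" more concrete: the outer size-tracking writer counts everything the caller has appended, the inner one counts what has actually reached the file, and the buffered writer in between holds the difference; read_blk then routes a read either to the file/page cache or to the in-memory buffer by comparing the two counters. A minimal synchronous sketch of that bookkeeping using std::io types (the real code is async and uses owned buffers via OwnedAsyncWriter; this is only an illustration):

use std::io::Write;

/// Counts every byte that passes through to the wrapped writer.
struct SizeTrackingWriter<W> {
    inner: W,
    bytes_written: u64,
}

impl<W: Write> SizeTrackingWriter<W> {
    fn new(inner: W) -> Self {
        Self { inner, bytes_written: 0 }
    }
    fn bytes_written(&self) -> u64 {
        self.bytes_written
    }
}

impl<W: Write> Write for SizeTrackingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        let n = self.inner.write(buf)?;
        self.bytes_written += n as u64;
        Ok(n)
    }
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}

fn main() -> std::io::Result<()> {
    // Vec<u8> stands in for the VirtualFile so the sketch is self-contained.
    let flushed = SizeTrackingWriter::new(Vec::new());
    let buffered = std::io::BufWriter::with_capacity(64 * 1024, flushed);
    let mut appended = SizeTrackingWriter::new(buffered);

    appended.write_all(&[0u8; 10_000])?;

    let appended_bytes = appended.bytes_written(); // 10_000: what the caller appended
    let flushed_bytes = appended.inner.get_ref().bytes_written(); // 0: still in the buffer
    assert!(appended_bytes >= flushed_bytes);
    // EphemeralFile::read_blk makes exactly this decision: offsets below
    // `flushed_bytes` are served from the file (via the page cache), offsets
    // in `flushed_bytes..appended_bytes` come out of the in-memory buffer.
    println!("appended={appended_bytes} flushed={flushed_bytes}");
    Ok(())
}

As with the BufferedWriter in this PR, std::io::BufWriter also passes writes at or above its capacity straight through to the inner writer after flushing.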

View File

@@ -10,6 +10,7 @@
//! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c
//!
use crate::context::RequestContext;
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
use crate::page_cache::PageWriteGuard;
@@ -34,6 +35,25 @@ pub(crate) use io_engine::IoEngineKind;
pub(crate) use metadata::Metadata;
pub(crate) use open_options::*;
use self::owned_buffers_io::write::OwnedAsyncWriter;
pub(crate) mod owned_buffers_io {
//! Abstractions for IO with owned buffers.
//!
//! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary
//! reason we need this abstraction.
//!
//! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`,
//! but for the time being we're proving out the primitives in the neon.git repo
//! for faster iteration.
pub(crate) mod write;
pub(crate) mod util {
pub(crate) mod page_cache_priming_writer;
pub(crate) mod size_tracking_writer;
}
}
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
/// the underlying file is closed if the system is low on file descriptors,
@@ -1064,6 +1084,32 @@ impl Drop for VirtualFile {
}
}
impl OwnedAsyncWriter for VirtualFile {
#[inline(always)]
async fn write_all<
B: BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
_: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let (buf, res) = VirtualFile::write_all(self, buf).await;
res.map(move |v| (v, buf))
}
#[inline(always)]
async fn write_all_borrowed(
&mut self,
_buf: &[u8],
_ctx: &RequestContext,
) -> std::io::Result<usize> {
// TODO: ensure this through the type system
panic!("this should not happen");
}
}
impl OpenFiles {
fn new(num_slots: usize) -> OpenFiles {
let mut slots = Box::new(Vec::with_capacity(num_slots));
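
The owned_buffers_io module comment above is about the calling convention that io_uring-style IO forces on us: the buffer is handed over by value for the duration of the operation and returned afterwards, instead of being borrowed as with std::io::Write. A toy synchronous sketch of that round-trip (not the actual OwnedAsyncWriter trait, just the shape of the API):

/// Simplified stand-in for a writer that takes buffers by value,
/// as an io_uring-backed file would, and hands them back for reuse.
struct OwnedWriteSink {
    written: Vec<u8>,
}

impl OwnedWriteSink {
    fn write_all_owned(&mut self, buf: Vec<u8>) -> std::io::Result<(usize, Vec<u8>)> {
        // The "kernel" may use `buf` while the operation is in flight,
        // which is why the caller cannot keep a borrow to it.
        self.written.extend_from_slice(&buf);
        Ok((buf.len(), buf)) // return the buffer so the caller can reuse it
    }
}

fn main() -> std::io::Result<()> {
    let mut sink = OwnedWriteSink { written: Vec::new() };
    // One buffer round-trips through every write instead of being reallocated.
    let mut buf = Vec::with_capacity(8192);
    for chunk in ["foo", "bar", "baz"] {
        buf.clear();
        buf.extend_from_slice(chunk.as_bytes());
        let (n, returned) = sink.write_all_owned(buf)?;
        assert_eq!(n, chunk.len());
        buf = returned;
    }
    assert_eq!(sink.written, b"foobarbaz");
    Ok(())
}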

View File

@@ -0,0 +1,173 @@
use std::ops::Deref;
use tokio_epoll_uring::BoundedBuf;
use tracing::error;
use crate::{
context::RequestContext,
page_cache::{self, PageReadGuard, PAGE_SZ},
virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile},
};
pub struct Writer<const N: usize, W: OwnedAsyncWriter, C: Cache<N, W>> {
under: W,
written: u64,
cache: C,
}
pub trait Cache<const PAGE_SZ: usize, W> {
async fn fill_cache(
&self,
under: &W,
page_no: u64,
contents: &[u8; PAGE_SZ],
ctx: &RequestContext,
);
}
impl<const N: usize, W, C> Writer<N, W, C>
where
C: Cache<N, W>,
W: OwnedAsyncWriter,
{
pub fn new(under: W, cache: C) -> Self {
Self {
under,
written: 0,
cache,
}
}
pub fn as_inner(&self) -> &W {
&self.under
}
fn invariants(&self) {
assert_eq!(
self.written % (N as u64),
0,
"writes must happen in multiples of N"
);
}
}
impl<const N: usize, W, C> OwnedAsyncWriter for Writer<N, W, C>
where
W: OwnedAsyncWriter,
C: Cache<N, W>,
{
async fn write_all<
B: tokio_epoll_uring::BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: tokio_epoll_uring::IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let buf = buf.slice_full();
assert_eq!(buf.bytes_init() % N, 0);
self.invariants();
let saved_bounds = buf.bounds();
let debug_assert_contents_eq = if cfg!(debug_assertions) {
Some(buf[..].to_vec())
} else {
None
};
let res = self.under.write_all(buf, ctx).await;
let res = if let Ok((nwritten, buf)) = res {
assert_eq!(nwritten % N, 0);
let buf = tokio_epoll_uring::Slice::from_buf_bounds(buf, saved_bounds);
if let Some(before) = debug_assert_contents_eq {
debug_assert_eq!(&before[..], &buf[..]);
}
assert_eq!(nwritten, buf.bytes_init());
for page_no_in_buf in 0..(nwritten / N) {
let page: &[u8; N] = (&buf[(page_no_in_buf * N)..((page_no_in_buf + 1) * N)])
.try_into()
.unwrap();
self.cache
.fill_cache(
&self.under,
(self.written / (N as u64)) + (page_no_in_buf as u64),
page,
ctx,
)
.await;
}
self.written += nwritten as u64;
Ok((nwritten, tokio_epoll_uring::Slice::into_inner(buf)))
} else {
res
};
self.invariants();
res
}
async fn write_all_borrowed(&mut self, _: &[u8], _: &RequestContext) -> std::io::Result<usize> {
// TODO: use type system to ensure this doesn't happen at runtime
panic!("don't put the types together this way")
}
}
pub struct VirtualFileAdaptor {
file: VirtualFile,
file_id: page_cache::FileId,
}
impl VirtualFileAdaptor {
pub fn new(file: VirtualFile, file_id: page_cache::FileId) -> Self {
Self { file, file_id }
}
pub fn as_inner(&self) -> &VirtualFile {
&self.file
}
}
impl<'c> Cache<{ PAGE_SZ }, VirtualFileAdaptor> for &'c crate::page_cache::PageCache {
async fn fill_cache(
&self,
under: &VirtualFileAdaptor,
page_no: u64,
contents: &[u8; PAGE_SZ],
ctx: &RequestContext,
) {
match self
.read_immutable_buf(
under.file_id,
u32::try_from(page_no).expect("files larger than u32::MAX * 8192 aren't supported"),
ctx,
)
.await
{
Ok(crate::page_cache::ReadBufResult::Found(guard)) => {
debug_assert_eq!(guard.deref(), contents);
}
Ok(crate::page_cache::ReadBufResult::NotFound(mut guard)) => {
guard.copy_from_slice(contents);
let guard: PageReadGuard<'_> = guard.mark_valid();
drop(guard);
}
Err(e) => {
error!("failed to get immutable buf to pre-warm page cache: {e:?}");
}
}
}
}
impl OwnedAsyncWriter for VirtualFileAdaptor {
async fn write_all<
B: tokio_epoll_uring::BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: tokio_epoll_uring::IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
OwnedAsyncWriter::write_all(&mut self.file, buf, ctx).await
}
async fn write_all_borrowed(&mut self, _: &[u8], _: &RequestContext) -> std::io::Result<usize> {
// TODO: use type system to ensure this doesn't happen at runtime
panic!("don't put the types together this way")
}
}
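
For orientation, the priming writer above does two things per aligned write: forward the buffer to the wrapped writer, then copy each page of it into the page cache so that later read_blk calls hit memory. A toy standalone sketch of that idea, with a HashMap standing in for the page cache and a deliberately tiny page size (illustrative types only, not the pageserver's):

use std::collections::HashMap;

const PAGE_SZ: usize = 8; // toy value; the real page size is 8192

struct PrimingWriter {
    file: Vec<u8>,                      // stands in for the VirtualFile
    cache: HashMap<u64, [u8; PAGE_SZ]>, // stands in for the page cache
    written: u64,
}

impl PrimingWriter {
    fn write_aligned(&mut self, buf: &[u8]) {
        assert_eq!(buf.len() % PAGE_SZ, 0, "writes must be page-aligned");
        self.file.extend_from_slice(buf); // 1. write to the "file"
        // 2. prime the cache with exactly what was just written
        for (i, page) in buf.chunks_exact(PAGE_SZ).enumerate() {
            let page_no = self.written / PAGE_SZ as u64 + i as u64;
            self.cache.insert(page_no, page.try_into().unwrap());
        }
        self.written += buf.len() as u64;
    }

    fn read_page(&self, page_no: u64) -> [u8; PAGE_SZ] {
        // A reader asks the cache first and only falls back to the file on a miss.
        self.cache.get(&page_no).copied().unwrap_or_else(|| {
            let start = page_no as usize * PAGE_SZ;
            self.file[start..start + PAGE_SZ].try_into().unwrap()
        })
    }
}

fn main() {
    let mut w = PrimingWriter { file: Vec::new(), cache: HashMap::new(), written: 0 };
    w.write_aligned(&[1u8; 2 * PAGE_SZ]);
    assert_eq!(w.read_page(1), [1u8; PAGE_SZ]); // served from the primed cache
}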

View File

@@ -0,0 +1,59 @@
use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter};
use tokio_epoll_uring::{BoundedBuf, IoBuf};
pub struct Writer<W> {
dst: W,
bytes_amount: u64,
}
impl<W> Writer<W> {
pub fn new(dst: W) -> Self {
Self {
dst,
bytes_amount: 0,
}
}
pub fn bytes_written(&self) -> u64 {
self.bytes_amount
}
pub fn as_inner(&self) -> &W {
&self.dst
}
/// Returns the wrapped `VirtualFile` object as well as the number
/// of bytes that were written to it through this object.
#[allow(dead_code)]
pub fn into_inner(self) -> (u64, W) {
(self.bytes_amount, self.dst)
}
}
impl<W> OwnedAsyncWriter for Writer<W>
where
W: OwnedAsyncWriter,
{
#[inline(always)]
async fn write_all<
B: BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
self.bytes_amount += u64::try_from(nwritten).unwrap();
Ok((nwritten, buf))
}
#[inline(always)]
async fn write_all_borrowed(
&mut self,
buf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize> {
let nwritten = self.dst.write_all_borrowed(buf, ctx).await?;
self.bytes_amount += u64::try_from(nwritten).unwrap();
Ok(nwritten)
}
}

View File

@@ -0,0 +1,351 @@
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
use crate::context::RequestContext;
/// A trait for doing owned-buffer write IO.
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
pub trait OwnedAsyncWriter {
async fn write_all<
B: BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)>;
async fn write_all_borrowed(
&mut self,
buf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize>;
}
/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes
/// into `BUFFER_SIZE`-sized writes.
///
/// # Passthrough Of Large Writes
///
/// Buffered writes larger than `BUFFER_SIZE` cause the internal
/// buffer to be flushed, even if it is not full yet. Then, the large
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
///
/// This pass-through is generally beneficial for throughput, but if
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
/// unlimited large writes may cause latency or fairness issues.
///
/// In such cases, a different implementation that always buffers in memory
/// may be preferable.
pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
writer: W,
// invariant: always remains Some(buf)
// with buf.capacity() == BUFFER_SIZE except
// - while IO is ongoing => goes back to Some() once the IO completed successfully
// - after an IO error => stays `None` forever
// In these exceptional cases, it's `None`.
buf: Option<zero_initialized_buffer::Buf<BUFFER_SIZE>>,
}
mod zero_initialized_buffer;
impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
where
W: OwnedAsyncWriter,
{
pub fn new(writer: W) -> Self {
Self {
writer,
buf: Some(zero_initialized_buffer::Buf::default()),
}
}
pub fn as_inner(&self) -> &W {
&self.writer
}
/// panics if used after an error
pub fn inspect_buffer(&self) -> &[u8; BUFFER_SIZE] {
self.buf
.as_ref()
// TODO: can this happen on the EphemeralFile read path?
.expect("must not use after an error")
.as_zero_padded_slice()
}
#[allow(dead_code)]
pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> {
self.flush(ctx).await?;
let Self { buf, writer } = self;
assert!(buf.is_some());
Ok(writer)
}
pub async fn write_buffered<B: IoBuf>(
&mut self,
chunk: Slice<B>,
ctx: &RequestContext,
) -> std::io::Result<(usize, B)>
where
B: IoBuf + Send,
{
let chunk_len = chunk.len();
// avoid memcpy for the middle of the chunk
if chunk.len() >= BUFFER_SIZE {
self.flush(ctx).await?;
// do a big write, bypassing `buf`
assert_eq!(
self.buf
.as_ref()
.expect("must not use after an error")
.len(),
0
);
let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?;
assert_eq!(nwritten, chunk_len);
return Ok((nwritten, chunk));
}
// in-memory copy the < BUFFER_SIZE-sized tail of the chunk
assert!(chunk.len() < BUFFER_SIZE);
let mut slice = &chunk[..];
while !slice.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = BUFFER_SIZE - buf.len();
let have = slice.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&slice[..n]);
slice = &slice[n..];
if buf.len() >= BUFFER_SIZE {
assert_eq!(buf.len(), BUFFER_SIZE);
self.flush(ctx).await?;
}
}
assert!(slice.is_empty(), "by now we should have drained the chunk");
Ok((chunk_len, chunk.into_inner()))
}
/// Always goes through the internal buffer.
/// Guaranteed to never invoke [`OwnedAsyncWriter::write_all_borrowed`] on the underlying writer.
pub async fn write_all_borrowed(
&mut self,
mut chunk: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize> {
let chunk_len = chunk.len();
while !chunk.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = BUFFER_SIZE - buf.len();
let have = chunk.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&chunk[..n]);
chunk = &chunk[n..];
if buf.len() >= BUFFER_SIZE {
assert_eq!(buf.len(), BUFFER_SIZE);
self.flush(ctx).await?;
}
}
Ok(chunk_len)
}
async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> {
let buf = self.buf.take().expect("must not use after an error");
if buf.is_empty() {
self.buf = Some(buf);
return std::io::Result::Ok(());
}
let buf_len = buf.len();
let (nwritten, mut buf) = self.writer.write_all(buf, ctx).await?;
assert_eq!(nwritten, buf_len);
buf.clear();
self.buf = Some(buf);
Ok(())
}
}
impl<const BUFFER_SIZE: usize, W: OwnedAsyncWriter> OwnedAsyncWriter
for BufferedWriter<BUFFER_SIZE, W>
{
#[inline(always)]
async fn write_all<
B: BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let nbytes = buf.bytes_init();
if nbytes == 0 {
return Ok((0, Slice::into_inner(buf.slice_full())));
}
let slice = buf.slice(0..nbytes);
BufferedWriter::write_buffered(self, slice, ctx).await
}
#[inline(always)]
async fn write_all_borrowed(
&mut self,
buf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize> {
BufferedWriter::write_all_borrowed(self, buf, ctx).await
}
}
impl OwnedAsyncWriter for Vec<u8> {
async fn write_all<
B: BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
_: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let nbytes = buf.bytes_init();
if nbytes == 0 {
return Ok((0, Slice::into_inner(buf.slice_full())));
}
let buf = buf.slice(0..nbytes);
self.extend_from_slice(&buf[..]);
Ok((buf.len(), Slice::into_inner(buf)))
}
async fn write_all_borrowed(
&mut self,
buf: &[u8],
_ctx: &RequestContext,
) -> std::io::Result<usize> {
self.extend_from_slice(buf);
Ok(buf.len())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;
#[derive(Default)]
struct RecorderWriter {
writes: Vec<Vec<u8>>,
}
impl OwnedAsyncWriter for RecorderWriter {
async fn write_all<
B: BoundedBuf<Buf = Buf, Bounds = Bounds>,
Buf: IoBuf + Send,
Bounds: std::ops::RangeBounds<usize>,
>(
&mut self,
buf: B,
_: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let nbytes = buf.bytes_init();
if nbytes == 0 {
self.writes.push(vec![]);
return Ok((0, Slice::into_inner(buf.slice_full())));
}
let buf = buf.slice(0..nbytes);
self.writes.push(Vec::from(&buf[..]));
Ok((buf.len(), Slice::into_inner(buf)))
}
async fn write_all_borrowed(
&mut self,
buf: &[u8],
_: &RequestContext,
) -> std::io::Result<usize> {
self.writes.push(Vec::from(buf));
Ok(buf.len())
}
}
fn test_ctx() -> RequestContext {
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
macro_rules! write {
($writer:ident, $data:literal) => {{
$writer
.write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx())
.await?;
}};
}
#[tokio::test]
async fn test_buffered_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"a");
write!(writer, b"b");
write!(writer, b"c");
write!(writer, b"d");
write!(writer, b"e");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
);
Ok(())
}
#[tokio::test]
async fn test_passthrough_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"abc");
write!(writer, b"de");
write!(writer, b"");
write!(writer, b"fghijk");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
);
Ok(())
}
#[tokio::test]
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"a");
write!(writer, b"bc");
write!(writer, b"d");
write!(writer, b"e");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
);
Ok(())
}
#[tokio::test]
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::<2, _>::new(recorder);
let ctx = test_ctx();
writer.write_all_borrowed(b"abc", &ctx).await?;
writer.write_all_borrowed(b"d", &ctx).await?;
writer.write_all_borrowed(b"e", &ctx).await?;
writer.write_all_borrowed(b"fg", &ctx).await?;
writer.write_all_borrowed(b"hi", &ctx).await?;
writer.write_all_borrowed(b"j", &ctx).await?;
writer.write_all_borrowed(b"klmno", &ctx).await?;
let recorder = writer.flush_and_into_inner(&ctx).await?;
assert_eq!(
recorder.writes,
{
let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
expect
}
.iter()
.map(|v| v[..].to_vec())
.collect::<Vec<_>>()
);
Ok(())
}
}

View File

@@ -0,0 +1,70 @@
use std::mem::MaybeUninit;
pub struct Buf<const N: usize> {
allocation: Box<[u8; N]>,
written: usize,
}
impl<const N: usize> Default for Buf<N> {
fn default() -> Self {
Self {
allocation: Box::new(
// SAFETY: zeroed memory is a valid [u8; N]
unsafe { MaybeUninit::zeroed().assume_init() },
),
written: 0,
}
}
}
impl<const N: usize> Buf<N> {
#[inline(always)]
fn invariants(&self) {
debug_assert!(self.written <= N, "{}", self.written);
}
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
&self.allocation
}
/// panics if there's not enough capacity left
pub fn extend_from_slice(&mut self, buf: &[u8]) {
self.invariants();
let can = N - self.written;
let want = buf.len();
assert!(want <= can, "{:x} {:x}", want, can);
self.allocation[self.written..(self.written + want)].copy_from_slice(buf);
self.written += want;
self.invariants();
}
pub fn len(&self) -> usize {
self.written
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn clear(&mut self) {
self.invariants();
self.written = 0;
self.allocation[..].fill(0);
self.invariants();
}
}
/// SAFETY: the Box<> has a stable location in memory.
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buf<N> {
fn stable_ptr(&self) -> *const u8 {
self.allocation.as_ptr()
}
fn bytes_init(&self) -> usize {
self.written
}
fn bytes_total(&self) -> usize {
self.written // ?
}
}

View File

@@ -28,7 +28,7 @@ def platform() -> Optional[str]:
@pytest.fixture(scope="function", autouse=True)
def pageserver_virtual_file_io_engine() -> Optional[str]:
return None
return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE")
def pytest_generate_tests(metafunc: Metafunc):
@@ -48,11 +48,8 @@ def pytest_generate_tests(metafunc: Metafunc):
# A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs`
# And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics
if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
"",
"tokio-epoll-uring",
):
metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
io_engine = os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "std-fs")
metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
# For performance tests, parametrize also by platform
if (