diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 3108b5351f..5c16548de8 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -11,6 +11,7 @@ use pageserver::task_mgr::TaskKind; use pageserver::tenant::storage_layer::InMemoryLayer; use pageserver::{page_cache, virtual_file}; use pageserver_api::key::Key; +use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; use tokio_util::sync::CancellationToken; @@ -28,6 +29,7 @@ fn murmurhash32(mut h: u32) -> u32 { h } +#[derive(serde::Serialize, Clone, Copy, Debug)] enum KeyLayout { /// Sequential unique keys Sequential, @@ -37,6 +39,7 @@ enum KeyLayout { RandomReuse(u32), } +#[derive(serde::Serialize, Clone, Copy, Debug)] enum WriteDelta { Yes, No, @@ -138,12 +141,15 @@ async fn ingest( /// Wrapper to instantiate a tokio runtime fn ingest_main( conf: &'static PageServerConf, + io_mode: IoMode, put_size: usize, put_count: usize, key_layout: KeyLayout, write_delta: WriteDelta, ) { - let runtime = tokio::runtime::Builder::new_current_thread() + pageserver::virtual_file::set_io_mode(io_mode); + + let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .unwrap(); @@ -174,93 +180,207 @@ fn criterion_benchmark(c: &mut Criterion) { virtual_file::init( 16384, virtual_file::io_engine_for_bench(), + // immaterial, each `ingest_main` invocation below overrides this conf.virtual_file_io_mode, + // without actually doing syncs, buffered writes have an unfair advantage over direct IO writes virtual_file::SyncMode::Sync, ); page_cache::init(conf.page_cache_size); - { - let mut group = c.benchmark_group("ingest-small-values"); - let put_size = 100usize; - let put_count = 128 * 1024 * 1024 / put_size; - group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); - group.sample_size(10); - group.bench_function("ingest 128MB/100b seq", |b| { - b.iter(|| { - ingest_main( - conf, - put_size, - put_count, - KeyLayout::Sequential, - WriteDelta::Yes, - ) - }) - }); - group.bench_function("ingest 128MB/100b rand", |b| { - b.iter(|| { - ingest_main( - conf, - put_size, - put_count, - KeyLayout::Random, - WriteDelta::Yes, - ) - }) - }); - group.bench_function("ingest 128MB/100b rand-1024keys", |b| { - b.iter(|| { - ingest_main( - conf, - put_size, - put_count, - KeyLayout::RandomReuse(0x3ff), - WriteDelta::Yes, - ) - }) - }); - group.bench_function("ingest 128MB/100b seq, no delta", |b| { - b.iter(|| { - ingest_main( - conf, - put_size, - put_count, - KeyLayout::Sequential, - WriteDelta::No, - ) - }) - }); + #[derive(serde::Serialize)] + struct ExplodedParameters { + io_mode: IoMode, + volume_mib: usize, + key_size: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, } - - { - let mut group = c.benchmark_group("ingest-big-values"); - let put_size = 8192usize; - let put_count = 128 * 1024 * 1024 / put_size; - group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + #[derive(Clone)] + struct HandPickedParameters { + volume_mib: usize, + key_size: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, + } + let expect = vec![ + // Small values (100b) tests + HandPickedParameters { + volume_mib: 128, + key_size: 100, + key_layout: KeyLayout::Sequential, + write_delta: WriteDelta::Yes, + }, + HandPickedParameters { + volume_mib: 128, + key_size: 100, + key_layout: KeyLayout::Random, + write_delta: WriteDelta::Yes, + }, + HandPickedParameters { + volume_mib: 128, + key_size: 100, + key_layout: KeyLayout::RandomReuse(0x3ff), + write_delta: WriteDelta::Yes, + }, + HandPickedParameters { + volume_mib: 128, + key_size: 100, + key_layout: KeyLayout::Sequential, + write_delta: WriteDelta::No, + }, + // Large values (8k) tests + HandPickedParameters { + volume_mib: 128, + key_size: 8192, + key_layout: KeyLayout::Sequential, + write_delta: WriteDelta::Yes, + }, + HandPickedParameters { + volume_mib: 128, + key_size: 8192, + key_layout: KeyLayout::Sequential, + write_delta: WriteDelta::No, + }, + ]; + let exploded_parameters = { + let mut out = Vec::new(); + for io_mode in [ + IoMode::Buffered, + #[cfg(target_os = "linux")] + IoMode::Direct, + ] { + for param in expect.clone() { + let HandPickedParameters { + volume_mib, + key_size, + key_layout, + write_delta, + } = param; + out.push(ExplodedParameters { + io_mode, + volume_mib, + key_size, + key_layout, + write_delta, + }); + } + } + out + }; + impl ExplodedParameters { + fn benchmark_id(&self) -> String { + let ExplodedParameters { + io_mode, + volume_mib, + key_size, + key_layout, + write_delta, + } = self; + format!( + "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}" + ) + } + } + let mut group = c.benchmark_group("ingest"); + for params in exploded_parameters { + let id = params.benchmark_id(); + let ExplodedParameters { + io_mode, + volume_mib, + key_size, + key_layout, + write_delta, + } = params; + let put_count = volume_mib * 1024 * 1024 / key_size; + group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64)); group.sample_size(10); - group.bench_function("ingest 128MB/8k seq", |b| { - b.iter(|| { - ingest_main( - conf, - put_size, - put_count, - KeyLayout::Sequential, - WriteDelta::Yes, - ) - }) - }); - group.bench_function("ingest 128MB/8k seq, no delta", |b| { - b.iter(|| { - ingest_main( - conf, - put_size, - put_count, - KeyLayout::Sequential, - WriteDelta::No, - ) - }) + group.bench_function(id, |b| { + b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta)) }); } } criterion_group!(benches, criterion_benchmark); criterion_main!(benches); + +/* +cargo bench --bench bench_ingest + +im4gn.2xlarge: + +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes + time: [1.8491 s 1.8540 s 1.8592 s] + thrpt: [68.847 MiB/s 69.039 MiB/s 69.222 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes + time: [2.6976 s 2.7123 s 2.7286 s] + thrpt: [46.911 MiB/s 47.193 MiB/s 47.450 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y... + time: [1.7433 s 1.7510 s 1.7600 s] + thrpt: [72.729 MiB/s 73.099 MiB/s 73.423 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No + time: [499.63 ms 500.07 ms 500.46 ms] + thrpt: [255.77 MiB/s 255.96 MiB/s 256.19 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes + time: [456.97 ms 459.61 ms 461.92 ms] + thrpt: [277.11 MiB/s 278.50 MiB/s 280.11 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No + time: [158.82 ms 159.16 ms 159.56 ms] + thrpt: [802.22 MiB/s 804.24 MiB/s 805.93 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes + time: [1.8856 s 1.8997 s 1.9179 s] + thrpt: [66.740 MiB/s 67.380 MiB/s 67.882 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes + time: [2.7468 s 2.7625 s 2.7785 s] + thrpt: [46.068 MiB/s 46.335 MiB/s 46.600 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes + time: [1.7689 s 1.7726 s 1.7767 s] + thrpt: [72.045 MiB/s 72.208 MiB/s 72.363 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No + time: [497.64 ms 498.60 ms 499.67 ms] + thrpt: [256.17 MiB/s 256.72 MiB/s 257.21 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes + time: [493.72 ms 505.07 ms 518.03 ms] + thrpt: [247.09 MiB/s 253.43 MiB/s 259.26 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No + time: [267.76 ms 267.85 ms 267.96 ms] + thrpt: [477.69 MiB/s 477.88 MiB/s 478.03 MiB/s] + +Hetzner AX102: + +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes + time: [1.0683 s 1.1006 s 1.1386 s] + thrpt: [112.42 MiB/s 116.30 MiB/s 119.82 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes + time: [1.5719 s 1.6012 s 1.6228 s] + thrpt: [78.877 MiB/s 79.938 MiB/s 81.430 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y... + time: [1.1095 s 1.1331 s 1.1580 s] + thrpt: [110.53 MiB/s 112.97 MiB/s 115.37 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No + time: [303.20 ms 307.83 ms 311.90 ms] + thrpt: [410.39 MiB/s 415.81 MiB/s 422.16 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes + time: [406.34 ms 429.37 ms 451.63 ms] + thrpt: [283.42 MiB/s 298.11 MiB/s 315.00 MiB/s] +ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No + time: [134.01 ms 135.78 ms 137.48 ms] + thrpt: [931.03 MiB/s 942.68 MiB/s 955.12 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes + time: [1.0406 s 1.0580 s 1.0772 s] + thrpt: [118.83 MiB/s 120.98 MiB/s 123.00 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes + time: [1.5059 s 1.5339 s 1.5625 s] + thrpt: [81.920 MiB/s 83.448 MiB/s 84.999 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes + time: [1.0714 s 1.0934 s 1.1161 s] + thrpt: [114.69 MiB/s 117.06 MiB/s 119.47 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No + time: [262.68 ms 265.14 ms 267.71 ms] + thrpt: [478.13 MiB/s 482.76 MiB/s 487.29 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes + time: [375.19 ms 393.80 ms 411.40 ms] + thrpt: [311.14 MiB/s 325.04 MiB/s 341.16 MiB/s] +ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No + time: [123.02 ms 123.85 ms 124.66 ms] + thrpt: [1.0027 GiB/s 1.0093 GiB/s 1.0161 GiB/s] +*/ diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 45cd0f469b..4eb968c340 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1369,7 +1369,7 @@ pub(crate) type IoPageSlice<'a> = static IO_MODE: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| AtomicU8::new(IoMode::preferred() as u8)); -pub(crate) fn set_io_mode(mode: IoMode) { +pub fn set_io_mode(mode: IoMode) { IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed); }