mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
feat: Add configurable Direct IO alignment support (#12821)
## Problem

Neon's storage system currently has a hard-coded 512-byte block size for Direct IO operations, which causes I/O errors on systems with disks that have 4096-byte block sizes. This results in errors like "vec read failed" and "Invalid argument (os error 22)" on certain hardware configurations. See issue #12623 for details.

## Summary of changes

Make Direct IO alignment configurable at build time to support both 512-byte and 4096-byte block sizes:

- Add `io-align-512` and `io-align-4k` cargo features (default: 512-byte for backward compatibility)
- Make `DEFAULT_IO_BUFFER_ALIGNMENT` configurable via cargo features in `pageserver_api`
- Update `DIO_CHUNK_SIZE` in vectored_dio_read to use the configured alignment value dynamically
- Add `IO_ALIGNMENT` build argument to Dockerfile to allow building images with different alignment settings
- Add startup logging to display the configured IO buffer alignment for operational visibility
- Fix validation logic in `virtual_file.rs` to use the configured alignment instead of hard-coded 512

This change allows Neon to run on systems with different disk block sizes by building with the appropriate feature flag, addressing the compatibility issues described in the RFC on Direct IO implementation.

## Performance Note

Benchmarks show 512-byte alignment performs significantly better than 4k:

- Write: 512-byte is 21-71% faster across percentiles (p99: 71% faster)
- Read: 512-byte is slightly faster (5-21% improvement)

This is why 512-byte remains the default. However, some storage systems require 4k alignment and will fail with EINVAL otherwise. This change adds build-time configuration to support both environments.
This commit is contained in:
@@ -78,6 +78,7 @@ WORKDIR /home/nonroot
|
||||
ARG GIT_VERSION=local
|
||||
ARG BUILD_TAG
|
||||
ARG ADDITIONAL_RUSTFLAGS=""
|
||||
ARG IO_ALIGNMENT=512
|
||||
ENV CARGO_FEATURES="default"
|
||||
|
||||
# 3. Build cargo dependencies. Note that this step doesn't depend on anything else than
|
||||
@@ -101,7 +102,12 @@ COPY --chown=nonroot --from=plan /home/nonroot/Cargo.lock Carg
|
||||
RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
|
||||
set -e \
|
||||
&& if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
|
||||
export CARGO_FEATURES="rest_broker"; \
|
||||
export CARGO_FEATURES="${CARGO_FEATURES},rest_broker"; \
|
||||
fi \
|
||||
&& if [ "$IO_ALIGNMENT" = "4k" ]; then \
|
||||
export CARGO_FEATURES="${CARGO_FEATURES},io-align-4k"; \
|
||||
elif [ "$IO_ALIGNMENT" = "512" ]; then \
|
||||
export CARGO_FEATURES="${CARGO_FEATURES},io-align-512"; \
|
||||
fi \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo auditable build \
|
||||
--features $CARGO_FEATURES \
|
||||
|
||||
@@ -5,8 +5,12 @@ edition = "2024"
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = ["io-align-512"]
|
||||
# See pageserver/Cargo.toml
|
||||
testing = ["dep:nix"]
|
||||
# Direct IO alignment options (intended to be mutually exclusive; if both are enabled, io-align-4k takes precedence)
|
||||
io-align-512 = []
|
||||
io-align-4k = []
|
||||
|
||||
[dependencies]
|
||||
serde.workspace = true
|
||||
|
||||
@@ -703,6 +703,11 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
#[cfg(feature = "io-align-4k")]
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 4096;
|
||||
#[cfg(all(feature = "io-align-512", not(feature = "io-align-4k")))]
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||
#[cfg(not(any(feature = "io-align-512", feature = "io-align-4k")))]
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||
|
||||
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
|
||||
|
||||
@@ -10,6 +10,10 @@ default = []
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
|
||||
|
||||
# Direct IO alignment options (propagated to pageserver_api)
|
||||
io-align-512 = ["pageserver_api/io-align-512"]
|
||||
io-align-4k = ["pageserver_api/io-align-4k"]
|
||||
|
||||
fuzz-read-path = ["testing"]
|
||||
|
||||
# Enables benchmarking only APIs
|
||||
|
||||
@@ -353,6 +353,10 @@ fn start_pageserver(
|
||||
launch_ts.to_string(),
|
||||
BUILD_TAG,
|
||||
);
|
||||
info!(
|
||||
"IO buffer alignment: {} bytes",
|
||||
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT
|
||||
);
|
||||
set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
set_launch_timestamp_metric(launch_ts);
|
||||
#[cfg(target_os = "linux")]
|
||||
|
||||
@@ -73,7 +73,7 @@ pub trait Buffer: std::ops::Deref<Target = [u8]> {
|
||||
}
|
||||
|
||||
/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO.
|
||||
const DIO_CHUNK_SIZE: usize = 512;
|
||||
const DIO_CHUNK_SIZE: usize = crate::virtual_file::get_io_buffer_alignment();
|
||||
|
||||
/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`.
|
||||
/// (The unit is the number of chunks.)
|
||||
|
||||
@@ -852,7 +852,7 @@ impl VirtualFileInner {
|
||||
// Because the allocator might return _more_ aligned addresses than requested,
|
||||
// there is a chance that testing would not catch violations of a runtime requirement stricter than 512.
|
||||
{
|
||||
let requirement = 512;
|
||||
let requirement = get_io_buffer_alignment();
|
||||
let remainder = addr % requirement;
|
||||
assert!(
|
||||
remainder == 0,
|
||||
@@ -866,7 +866,7 @@ impl VirtualFileInner {
|
||||
// So enforce just that and not anything more restrictive.
|
||||
// Even the shallowest testing will expose more restrictive requirements if those ever arise.
|
||||
{
|
||||
let requirement = 512;
|
||||
let requirement = get_io_buffer_alignment() as u64;
|
||||
let remainder = offset % requirement;
|
||||
assert!(
|
||||
remainder == 0,
|
||||
@@ -879,7 +879,7 @@ impl VirtualFileInner {
|
||||
// The requirement in Linux 6.1 is bdev_logical_block_size().
|
||||
// On our production systems, that is 512.
|
||||
{
|
||||
let requirement = 512;
|
||||
let requirement = get_io_buffer_alignment();
|
||||
let remainder = size % requirement;
|
||||
assert!(
|
||||
remainder == 0,
|
||||
|
||||
Reference in New Issue
Block a user