mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 02:12:56 +00:00
part of https://github.com/neondatabase/neon/issues/7418 # Motivation (reproducing #7418) When we do an `InMemoryLayer::write_to_disk`, there is a tremendous amount of random read I/O, as deltas from the ephemeral file (written in LSN order) are written out to the delta layer in key order. In benchmarks (https://github.com/neondatabase/neon/pull/7409) we can see that this delta layer writing phase is substantially more expensive than the initial ingest of data, and that within the delta layer write a significant amount of the CPU time is spent traversing the page cache. # High-Level Changes Add a new mode for L0 flush that works as follows: * Read the full ephemeral file into memory -- layers are much smaller than total memory, so this is afforable * Do all the random reads directly from this in memory buffer instead of using blob IO/page cache/disk reads. * Add a semaphore to limit how many timelines may concurrently do this (limit peak memory). * Make the semaphore configurable via PS config. # Implementation Details The new `BlobReaderRef::Slice` is a temporary hack until we can ditch `blob_io` for `InMemoryLayer` => Plan for this is laid out in https://github.com/neondatabase/neon/issues/8183 # Correctness The correctness of this change is quite obvious to me: we do what we did before (`blob_io`) but read from memory instead of going to disk. The highest bug potential is in doing owned-buffers IO. I refactored the API a bit in preliminary PR https://github.com/neondatabase/neon/pull/8186 to make it less error-prone, but still, careful review is requested. # Performance I manually measured single-client ingest performance from `pgbench -i ...`. Full report: https://neondatabase.notion.site/2024-06-28-benchmarking-l0-flush-performance-e98cff3807f94cb38f2054d8c818fe84?pvs=4 tl;dr: * no speed improvements during ingest, but * significantly lower pressure on PS PageCache (eviction rate drops to 1/3) * (that's why I'm working on this) * noticable but modestly lower CPU time This is good enough for merging this PR because the changes require opt-in. We'll do more testing in staging & pre-prod. # Stability / Monitoring **memory consumption**: there's no _hard_ limit on max `InMemoryLayer` size (aka "checkpoint distance") , hence there's no hard limit on the memory allocation we do for flushing. In practice, we a) [log a warning](23827c6b0d/pageserver/src/tenant/timeline.rs (L5741-L5743)) when we flush oversized layers, so we'd know which tenant is to blame and b) if we were to put a hard limit in place, we would have to decide what to do if there is an InMemoryLayer that exceeds the limit. It seems like a better option to guarantee a max size for frozen layer, dependent on `checkpoint_distance`. Then limit concurrency based on that. **metrics**: we do have the [flush_time_histo](23827c6b0d/pageserver/src/tenant/timeline.rs (L3725-L3726)), but that includes the wait time for the semaphore. We could add a separate metric for the time spent after acquiring the semaphore, so one can infer the wait time. Seems unnecessary at this point, though.
253 lines
7.7 KiB
Rust
253 lines
7.7 KiB
Rust
#![recursion_limit = "300"]
|
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
|
|
|
mod auth;
|
|
pub mod basebackup;
|
|
pub mod config;
|
|
pub mod consumption_metrics;
|
|
pub mod context;
|
|
pub mod control_plane_client;
|
|
pub mod deletion_queue;
|
|
pub mod disk_usage_eviction_task;
|
|
pub mod http;
|
|
pub mod import_datadir;
|
|
pub mod l0_flush;
|
|
pub use pageserver_api::keyspace;
|
|
pub mod aux_file;
|
|
pub mod metrics;
|
|
pub mod page_cache;
|
|
pub mod page_service;
|
|
pub mod pgdatadir_mapping;
|
|
pub mod repository;
|
|
pub mod span;
|
|
pub(crate) mod statvfs;
|
|
pub mod task_mgr;
|
|
pub mod tenant;
|
|
pub mod trace;
|
|
pub mod utilization;
|
|
pub mod virtual_file;
|
|
pub mod walingest;
|
|
pub mod walrecord;
|
|
pub mod walredo;
|
|
|
|
use crate::task_mgr::TaskKind;
|
|
use camino::Utf8Path;
|
|
use deletion_queue::DeletionQueue;
|
|
use tenant::mgr::TenantManager;
|
|
use tracing::info;
|
|
|
|
/// Current storage format version
|
|
///
|
|
/// This is embedded in the header of all the layer files.
|
|
/// If you make any backwards-incompatible changes to the storage
|
|
/// format, bump this!
|
|
/// Note that TimelineMetadata uses its own version number to track
|
|
/// backwards-compatible changes to the metadata format.
|
|
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
|
|
|
pub const DEFAULT_PG_VERSION: u32 = 15;
|
|
|
|
// Magic constants used to identify different kinds of files
|
|
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
|
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
|
|
|
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
|
|
|
pub use crate::metrics::preinitialize_metrics;
|
|
|
|
#[tracing::instrument(skip_all, fields(%exit_code))]
|
|
pub async fn shutdown_pageserver(
|
|
tenant_manager: &TenantManager,
|
|
mut deletion_queue: DeletionQueue,
|
|
exit_code: i32,
|
|
) {
|
|
use std::time::Duration;
|
|
// Shut down the libpq endpoint task. This prevents new connections from
|
|
// being accepted.
|
|
timed(
|
|
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
|
|
"shutdown LibpqEndpointListener",
|
|
Duration::from_secs(1),
|
|
)
|
|
.await;
|
|
|
|
// Shut down all the tenants. This flushes everything to disk and kills
|
|
// the checkpoint and GC tasks.
|
|
timed(
|
|
tenant_manager.shutdown(),
|
|
"shutdown all tenants",
|
|
Duration::from_secs(5),
|
|
)
|
|
.await;
|
|
|
|
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
|
// should already have been canclled via mgr::shutdown_all_tenants
|
|
timed(
|
|
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
|
"shutdown PageRequestHandlers",
|
|
Duration::from_secs(1),
|
|
)
|
|
.await;
|
|
|
|
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
|
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
|
|
|
// Shut down the HTTP endpoint last, so that you can still check the server's
|
|
// status while it's shutting down.
|
|
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
|
timed(
|
|
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
|
|
"shutdown http",
|
|
Duration::from_secs(1),
|
|
)
|
|
.await;
|
|
|
|
// There should be nothing left, but let's be sure
|
|
timed(
|
|
task_mgr::shutdown_tasks(None, None, None),
|
|
"shutdown leftovers",
|
|
Duration::from_secs(1),
|
|
)
|
|
.await;
|
|
info!("Shut down successfully completed");
|
|
std::process::exit(exit_code);
|
|
}
|
|
|
|
/// Per-tenant configuration file.
|
|
/// Full path: `tenants/<tenant_id>/config-v1`.
|
|
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
|
|
|
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
|
/// tenant path while in secondary mode.
|
|
pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
|
|
|
/// A suffix used for various temporary files. Any temporary files found in the
|
|
/// data directory at pageserver startup can be automatically removed.
|
|
pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
|
|
|
|
/// A marker file to mark that a timeline directory was not fully initialized.
|
|
/// If a timeline directory with this marker is encountered at pageserver startup,
|
|
/// the timeline directory and the marker file are both removed.
|
|
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
|
pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
|
|
|
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
|
|
|
pub fn is_temporary(path: &Utf8Path) -> bool {
|
|
match path.file_name() {
|
|
Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
|
|
None => false,
|
|
}
|
|
}
|
|
|
|
fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
|
|
match path.file_name() {
|
|
Some(name) => name.ends_with(suffix),
|
|
None => false,
|
|
}
|
|
}
|
|
|
|
// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
|
|
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
|
// from the name.
|
|
|
|
pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
|
|
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
|
}
|
|
|
|
pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
|
|
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
|
}
|
|
|
|
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
|
/// blocking.
|
|
///
|
|
/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no
|
|
/// delaying is needed.
|
|
#[derive(Clone)]
|
|
pub struct InitializationOrder {
|
|
/// Each initial tenant load task carries this until it is done loading timelines from remote storage
|
|
pub initial_tenant_load_remote: Option<utils::completion::Completion>,
|
|
|
|
/// Each initial tenant load task carries this until completion.
|
|
pub initial_tenant_load: Option<utils::completion::Completion>,
|
|
|
|
/// Barrier for when we can start any background jobs.
|
|
///
|
|
/// This can be broken up later on, but right now there is just one class of a background job.
|
|
pub background_jobs_can_start: utils::completion::Barrier,
|
|
}
|
|
|
|
/// Time the future with a warning when it exceeds a threshold.
|
|
async fn timed<Fut: std::future::Future>(
|
|
fut: Fut,
|
|
name: &str,
|
|
warn_at: std::time::Duration,
|
|
) -> <Fut as std::future::Future>::Output {
|
|
let started = std::time::Instant::now();
|
|
|
|
let mut fut = std::pin::pin!(fut);
|
|
|
|
match tokio::time::timeout(warn_at, &mut fut).await {
|
|
Ok(ret) => {
|
|
tracing::info!(
|
|
stage = name,
|
|
elapsed_ms = started.elapsed().as_millis(),
|
|
"completed"
|
|
);
|
|
ret
|
|
}
|
|
Err(_) => {
|
|
tracing::info!(
|
|
stage = name,
|
|
elapsed_ms = started.elapsed().as_millis(),
|
|
"still waiting, taking longer than expected..."
|
|
);
|
|
|
|
let ret = fut.await;
|
|
|
|
// this has a global allowed_errors
|
|
tracing::warn!(
|
|
stage = name,
|
|
elapsed_ms = started.elapsed().as_millis(),
|
|
"completed, took longer than expected"
|
|
);
|
|
|
|
ret
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod timed_tests {
|
|
use super::timed;
|
|
use std::time::Duration;
|
|
|
|
#[tokio::test]
|
|
async fn timed_completes_when_inner_future_completes() {
|
|
// A future that completes on time should have its result returned
|
|
let r1 = timed(
|
|
async move {
|
|
tokio::time::sleep(Duration::from_millis(10)).await;
|
|
123
|
|
},
|
|
"test 1",
|
|
Duration::from_millis(50),
|
|
)
|
|
.await;
|
|
assert_eq!(r1, 123);
|
|
|
|
// A future that completes too slowly should also have its result returned
|
|
let r1 = timed(
|
|
async move {
|
|
tokio::time::sleep(Duration::from_millis(50)).await;
|
|
456
|
|
},
|
|
"test 1",
|
|
Duration::from_millis(10),
|
|
)
|
|
.await;
|
|
assert_eq!(r1, 456);
|
|
}
|
|
}
|