mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 11:40:38 +00:00
pageserver: reorder upload queue when possible (#10218)
## Problem The upload queue currently sees significant head-of-line blocking. For example, index uploads act as upload barriers, and for every layer flush we schedule a layer and index upload, which effectively serializes layer uploads. Resolves #10096. ## Summary of changes Allow upload queue operations to bypass the queue if they don't conflict with preceding operations, increasing parallelism. NB: the upload queue currently schedules an explicit barrier after every layer flush as well (see #8550). This must be removed to enable parallelism. This will require a better mechanism for compaction backpressure, see e.g. #8390 or #5415.
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -4044,6 +4044,7 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"postgres_initdb",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
|
||||
@@ -44,6 +44,7 @@ postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
@@ -108,3 +109,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "upload_queue"
|
||||
harness = false
|
||||
|
||||
86
pageserver/benches/upload_queue.rs
Normal file
86
pageserver/benches/upload_queue.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
//! Upload queue benchmarks.
|
||||
|
||||
use std::str::FromStr as _;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
|
||||
use pageserver::tenant::metadata::TimelineMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use utils::generation::Generation;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets = bench_upload_queue_next_ready,
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
|
||||
/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
|
||||
/// queue as a whole is thus quadratic.
|
||||
///
|
||||
/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
|
||||
/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
|
||||
fn bench_upload_queue_next_ready(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("upload_queue_next_ready");
|
||||
for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
|
||||
g.bench_function(format!("inprogress={inprogress}"), |b| {
|
||||
run_bench(b, inprogress).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
|
||||
// Construct two layers. layer0 is in the indexes, layer1 will be deleted.
|
||||
let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
|
||||
let metadata = LayerFileMetadata {
|
||||
shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
|
||||
generation: Generation::Valid(1),
|
||||
file_size: 0,
|
||||
};
|
||||
|
||||
// Construct the (initial and uploaded) index with layer0.
|
||||
let mut index = IndexPart::empty(TimelineMetadata::example());
|
||||
index.layer_metadata.insert(layer0, metadata.clone());
|
||||
|
||||
// Construct the queue.
|
||||
let mut queue = UploadQueue::Uninitialized;
|
||||
let queue = queue.initialize_with_current_remote_index_part(&index)?;
|
||||
|
||||
// Populate inprogress_tasks with a bunch of layer1 deletions.
|
||||
let delete = UploadOp::Delete(Delete {
|
||||
layers: vec![(layer1, metadata)],
|
||||
});
|
||||
|
||||
for task_id in 0..(inprogress as u64) {
|
||||
queue.inprogress_tasks.insert(
|
||||
task_id,
|
||||
Arc::new(UploadTask {
|
||||
task_id,
|
||||
retries: AtomicU32::new(0),
|
||||
op: delete.clone(),
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Benchmark index upload scheduling.
|
||||
let index_upload = UploadOp::UploadMetadata {
|
||||
uploaded: Box::new(index),
|
||||
};
|
||||
|
||||
b.iter(|| {
|
||||
queue.queued_operations.push_front(index_upload.clone());
|
||||
assert!(queue.next_ready().is_some());
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -320,7 +320,6 @@ impl TimelineMetadata {
|
||||
|
||||
// Checksums make it awkward to build a valid instance by hand. This helper
|
||||
// provides a TimelineMetadata with a valid checksum in its header.
|
||||
#[cfg(test)]
|
||||
pub fn example() -> Self {
|
||||
let instance = Self::new(
|
||||
"0/16960E8".parse::<Lsn>().unwrap(),
|
||||
|
||||
@@ -63,22 +63,18 @@
|
||||
//! The contract between client and its user is that the user is responsible of
|
||||
//! scheduling operations in an order that keeps the remote consistent as
|
||||
//! described above.
|
||||
//!
|
||||
//! From the user's perspective, the operations are executed sequentially.
|
||||
//! Internally, the client knows which operations can be performed in parallel,
|
||||
//! and which operations act like a "barrier" that require preceding operations
|
||||
//! to finish. The calling code just needs to call the schedule-functions in the
|
||||
//! correct order, and the client will parallelize the operations in a way that
|
||||
//! is safe.
|
||||
//!
|
||||
//! The caller should be careful with deletion, though. They should not delete
|
||||
//! local files that have been scheduled for upload but not yet finished uploading.
|
||||
//! Otherwise the upload will fail. To wait for an upload to finish, use
|
||||
//! the 'wait_completion' function (more on that later.)
|
||||
//! is safe. For more details, see `UploadOp::can_bypass`.
|
||||
//!
|
||||
//! All of this relies on the following invariants:
|
||||
//!
|
||||
//! - We rely on read-after write consistency in the remote storage.
|
||||
//! - Layer files are immutable
|
||||
//! - Layer files are immutable.
|
||||
//!
|
||||
//! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
|
||||
//! storage. Different tenants can be attached to different pageservers, but if the
|
||||
@@ -1855,57 +1851,17 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Pick next tasks from the queue, and start as many of them as possible without violating
|
||||
/// the ordering constraints.
|
||||
///
|
||||
/// The caller needs to already hold the `upload_queue` lock.
|
||||
/// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
|
||||
/// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
|
||||
/// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
|
||||
fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
|
||||
while let Some(next_op) = upload_queue.queued_operations.front() {
|
||||
// Can we run this task now?
|
||||
let can_run_now = match next_op {
|
||||
UploadOp::UploadLayer(..) => {
|
||||
// Can always be scheduled.
|
||||
true
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
// These can only be performed after all the preceding operations
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(..) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
while let Some(mut next_op) = upload_queue.next_ready() {
|
||||
debug!("starting op: {next_op}");
|
||||
|
||||
UploadOp::Barrier(_) | UploadOp::Shutdown => {
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
};
|
||||
|
||||
// If we cannot launch this task, don't look any further.
|
||||
//
|
||||
// In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
|
||||
// them now, but we don't try to do that currently. For example, if the frontmost task
|
||||
// is an index-file upload that cannot proceed until preceding uploads have finished, we
|
||||
// could still start layer uploads that were scheduled later.
|
||||
if !can_run_now {
|
||||
break;
|
||||
}
|
||||
|
||||
if let UploadOp::Shutdown = next_op {
|
||||
// leave the op in the queue but do not start more tasks; it will be dropped when
|
||||
// the stop is called.
|
||||
upload_queue.shutdown_ready.close();
|
||||
break;
|
||||
}
|
||||
|
||||
// We can launch this task. Remove it from the queue first.
|
||||
let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
|
||||
debug!("starting op: {}", next_op);
|
||||
|
||||
// Update the counters and prepare
|
||||
// Prepare upload.
|
||||
match &mut next_op {
|
||||
UploadOp::UploadLayer(layer, meta, mode) => {
|
||||
if upload_queue
|
||||
@@ -1916,18 +1872,14 @@ impl RemoteTimelineClient {
|
||||
} else {
|
||||
*mode = Some(OpType::MayReorder)
|
||||
}
|
||||
upload_queue.num_inprogress_layer_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {}
|
||||
UploadOp::Delete(Delete { layers }) => {
|
||||
for (name, meta) in layers {
|
||||
upload_queue
|
||||
.recently_deleted
|
||||
.insert((name.clone(), meta.generation));
|
||||
}
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
sender.send_replace(());
|
||||
@@ -2027,6 +1979,8 @@ impl RemoteTimelineClient {
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
|
||||
// TODO: check if this mechanism can be removed now that can_bypass() performs
|
||||
// conflict checks during scheduling.
|
||||
if let Some(OpType::FlushDeletion) = mode {
|
||||
if self.config.read().unwrap().block_deletions {
|
||||
// Of course, this is not efficient... but usually the queue should be empty.
|
||||
@@ -2249,13 +2203,8 @@ impl RemoteTimelineClient {
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
|
||||
let lsn_update = match task.op {
|
||||
UploadOp::UploadLayer(_, _, _) => {
|
||||
upload_queue.num_inprogress_layer_uploads -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::UploadLayer(_, _, _) => None,
|
||||
UploadOp::UploadMetadata { ref uploaded } => {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
|
||||
// the task id is reused as a monotonicity check for storing the "clean"
|
||||
// IndexPart.
|
||||
let last_updater = upload_queue.clean.1;
|
||||
@@ -2289,10 +2238,7 @@ impl RemoteTimelineClient {
|
||||
None
|
||||
}
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::Delete(_) => None,
|
||||
UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
|
||||
};
|
||||
|
||||
@@ -2416,9 +2362,6 @@ impl RemoteTimelineClient {
|
||||
visible_remote_consistent_lsn: initialized
|
||||
.visible_remote_consistent_lsn
|
||||
.clone(),
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -2445,14 +2388,6 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
};
|
||||
|
||||
// consistency check
|
||||
assert_eq!(
|
||||
qi.num_inprogress_layer_uploads
|
||||
+ qi.num_inprogress_metadata_uploads
|
||||
+ qi.num_inprogress_deletions,
|
||||
qi.inprogress_tasks.len()
|
||||
);
|
||||
|
||||
// We don't need to do anything here for in-progress tasks. They will finish
|
||||
// on their own, decrement the unfinished-task counter themselves, and observe
|
||||
// that the queue is Stopped.
|
||||
@@ -2899,8 +2834,8 @@ mod tests {
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
assert!(upload_queue.queued_operations.is_empty());
|
||||
assert!(upload_queue.inprogress_tasks.len() == 2);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 2);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 2);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);
|
||||
|
||||
// also check that `latest_file_changes` was updated
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
|
||||
@@ -2970,8 +2905,8 @@ mod tests {
|
||||
// Deletion schedules upload of the index file, and the file deletion itself
|
||||
assert_eq!(upload_queue.queued_operations.len(), 2);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions, 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions(), 0);
|
||||
assert_eq!(
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
0
|
||||
|
||||
@@ -104,7 +104,7 @@ impl IndexPart {
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
|
||||
pub fn empty(metadata: TimelineMetadata) -> Self {
|
||||
IndexPart {
|
||||
version: Self::LATEST_VERSION,
|
||||
layer_metadata: Default::default(),
|
||||
|
||||
@@ -1812,7 +1812,7 @@ enum LayerKind {
|
||||
|
||||
/// Guard for forcing a layer be resident while it exists.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct ResidentLayer {
|
||||
pub struct ResidentLayer {
|
||||
owner: Layer,
|
||||
downloaded: Arc<DownloadedLayer>,
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user