mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-15 12:10:37 +00:00
pgserver: spawn_blocking in compaction (#4265)
Compaction is usually a compute-heavy process and might affect other futures running on the thread of the compaction. Therefore, we add `block_in_place` as a temporary solution to avoid blocking other futures on the same thread as compaction in the runtime. As we are migrating towards a fully-async-style pageserver, we can revert this change when everything is async and when we move compaction to a separate runtime. --------- Signed-off-by: Alex Chi <iskyzh@gmail.com>
This commit is contained in:
@@ -88,6 +88,7 @@
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
@@ -95,7 +96,7 @@ pub struct RequestContext {
|
||||
|
||||
/// Desired behavior if the operation requires an on-demand download
|
||||
/// to proceed.
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub enum DownloadBehavior {
|
||||
/// Download the layer file. It can take a while.
|
||||
Download,
|
||||
|
||||
@@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
|
||||
const PARALLEL_PATH_THRESHOLD: usize = 1;
|
||||
if paths.len() <= PARALLEL_PATH_THRESHOLD {
|
||||
for path in paths {
|
||||
fsync_path(path)?;
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
|
||||
// TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.
|
||||
|
||||
/// Use at most this number of threads.
|
||||
/// Increasing this limit will
|
||||
@@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
|
||||
let num_threads = paths.len().min(MAX_NUM_THREADS);
|
||||
let next_path_idx = AtomicUsize::new(0);
|
||||
|
||||
crossbeam_utils::thread::scope(|s| -> io::Result<()> {
|
||||
std::thread::scope(|s| -> io::Result<()> {
|
||||
let mut handles = vec![];
|
||||
// Spawn `num_threads - 1`, as the current thread is also a worker.
|
||||
for _ in 1..num_threads {
|
||||
handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
|
||||
handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
|
||||
}
|
||||
|
||||
parallel_worker(paths, &next_path_idx)?;
|
||||
@@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool.
|
||||
pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
|
||||
if paths.len() == 1 {
|
||||
fsync_path(&paths[0])?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
fsync_in_thread_pool(paths)
|
||||
}
|
||||
|
||||
/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
|
||||
/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
|
||||
pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
|
||||
const MAX_CONCURRENT_FSYNC: usize = 64;
|
||||
let mut next = paths.iter().peekable();
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
loop {
|
||||
while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
|
||||
let next = next.next().expect("just peeked");
|
||||
let next = next.to_owned();
|
||||
js.spawn_blocking(move || fsync_path(&next));
|
||||
}
|
||||
|
||||
// now the joinset has been filled up, wait for next to complete
|
||||
if let Some(res) = js.join_next().await {
|
||||
res??;
|
||||
} else {
|
||||
// last item had already completed
|
||||
assert!(
|
||||
next.peek().is_none(),
|
||||
"joinset emptied, we shouldn't have more work"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,8 +195,9 @@ pub struct Timeline {
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||
/// and [`Tenant::delete_timeline`].
|
||||
pub(super) layer_removal_cs: tokio::sync::Mutex<()>,
|
||||
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
|
||||
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
||||
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
|
||||
|
||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
|
||||
@@ -669,7 +670,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
const ROUNDS: usize = 2;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
@@ -758,7 +759,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Compaction which might need to be retried after downloading remote layers.
|
||||
async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
|
||||
async fn compact_inner(self: &Arc<Self>, ctx: &RequestContext) -> Result<(), CompactionError> {
|
||||
//
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
@@ -793,7 +794,7 @@ impl Timeline {
|
||||
// Below are functions compact_level0() and create_image_layers()
|
||||
// but they are a bit ad hoc and don't quite work like it's explained
|
||||
// above. Rewrite it.
|
||||
let layer_removal_cs = self.layer_removal_cs.lock().await;
|
||||
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Stopping {
|
||||
@@ -827,7 +828,7 @@ impl Timeline {
|
||||
|
||||
// 3. Compact
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
self.compact_level0(&layer_removal_cs, target_file_size, ctx)
|
||||
self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
}
|
||||
@@ -2168,7 +2169,7 @@ impl Timeline {
|
||||
fn delete_historic_layer(
|
||||
&self,
|
||||
// we cannot remove layers otherwise, since gc and compaction will race
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -2632,7 +2633,7 @@ impl Timeline {
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
async fn flush_loop(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
@@ -2723,7 +2724,7 @@ impl Timeline {
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
#[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
|
||||
async fn flush_frozen_layer(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
frozen_layer: Arc<InMemoryLayer>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -2743,7 +2744,11 @@ impl Timeline {
|
||||
.await?
|
||||
} else {
|
||||
// normal case, write out a L0 delta layer file.
|
||||
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
|
||||
let this = self.clone();
|
||||
let frozen_layer = frozen_layer.clone();
|
||||
let (delta_path, metadata) =
|
||||
tokio::task::spawn_blocking(move || this.create_delta_layer(&frozen_layer))
|
||||
.await??;
|
||||
HashMap::from([(delta_path, metadata)])
|
||||
};
|
||||
|
||||
@@ -2847,7 +2852,7 @@ impl Timeline {
|
||||
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file
|
||||
fn create_delta_layer(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
frozen_layer: &InMemoryLayer,
|
||||
) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
|
||||
// Write it out
|
||||
@@ -2863,10 +2868,13 @@ impl Timeline {
|
||||
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
|
||||
// files to flush, it might be better to first write them all, and then fsync
|
||||
// them all in parallel.
|
||||
par_fsync::par_fsync(&[
|
||||
new_delta_path.clone(),
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
])?;
|
||||
|
||||
// First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace
|
||||
// this with a single fsync in future refactors.
|
||||
par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?;
|
||||
// Then sync the parent directory.
|
||||
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
||||
.context("fsync of timeline dir")?;
|
||||
|
||||
// Add it to the layer map
|
||||
let l = Arc::new(new_delta);
|
||||
@@ -3090,11 +3098,15 @@ impl Timeline {
|
||||
let all_paths = image_layers
|
||||
.iter()
|
||||
.map(|layer| layer.path())
|
||||
.chain(std::iter::once(
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
))
|
||||
.collect::<Vec<_>>();
|
||||
par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?;
|
||||
|
||||
par_fsync::par_fsync_async(&all_paths)
|
||||
.await
|
||||
.context("fsync of newly created layer files")?;
|
||||
|
||||
par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
||||
.await
|
||||
.context("fsync of timeline dir")?;
|
||||
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
@@ -3159,9 +3171,9 @@ impl Timeline {
|
||||
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||
async fn compact_level0_phase1(
|
||||
fn compact_level0_phase1(
|
||||
&self,
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
@@ -3474,13 +3486,13 @@ impl Timeline {
|
||||
if !new_layers.is_empty() {
|
||||
let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
|
||||
|
||||
// also sync the directory
|
||||
layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
// minimize latency.
|
||||
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
||||
|
||||
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
||||
.context("fsync of timeline dir")?;
|
||||
|
||||
layer_paths.pop().unwrap();
|
||||
}
|
||||
|
||||
@@ -3497,17 +3509,22 @@ impl Timeline {
|
||||
/// as Level 1 files.
|
||||
///
|
||||
async fn compact_level0(
|
||||
&self,
|
||||
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
self: &Arc<Self>,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
let this = self.clone();
|
||||
let ctx_inner = ctx.clone();
|
||||
let layer_removal_cs_inner = layer_removal_cs.clone();
|
||||
let CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact,
|
||||
} = self
|
||||
.compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
|
||||
.await?;
|
||||
} = tokio::task::spawn_blocking(move || {
|
||||
this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner)
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
// nothing to do
|
||||
@@ -3565,7 +3582,7 @@ impl Timeline {
|
||||
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact {
|
||||
layer_names_to_delete.push(l.filename());
|
||||
self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
|
||||
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
|
||||
}
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
@@ -3685,7 +3702,7 @@ impl Timeline {
|
||||
|
||||
fail_point!("before-timeline-gc");
|
||||
|
||||
let layer_removal_cs = self.layer_removal_cs.lock().await;
|
||||
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Stopping {
|
||||
@@ -3705,7 +3722,7 @@ impl Timeline {
|
||||
|
||||
let res = self
|
||||
.gc_timeline(
|
||||
&layer_removal_cs,
|
||||
layer_removal_cs.clone(),
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
retain_lsns,
|
||||
@@ -3724,7 +3741,7 @@ impl Timeline {
|
||||
|
||||
async fn gc_timeline(
|
||||
&self,
|
||||
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
horizon_cutoff: Lsn,
|
||||
pitr_cutoff: Lsn,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
@@ -3897,7 +3914,11 @@ impl Timeline {
|
||||
{
|
||||
for doomed_layer in layers_to_remove {
|
||||
layer_names_to_delete.push(doomed_layer.filename());
|
||||
self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
|
||||
self.delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
doomed_layer,
|
||||
&mut updates,
|
||||
)?; // FIXME: schedule succeeded deletions before returning?
|
||||
result.layers_removed += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user