mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-16 18:02:56 +00:00
initial logical size calculation wasn't polled to completion; fix that, to make tests pass
(see prev commit for stack trace) CI test failures https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9353/12892883355/index.html#suites/a1c2be32556270764423c495fad75d47/92cacda354b63fd7/
This commit is contained in:
@@ -61,6 +61,7 @@ use utils::{
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::OnceLock;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{
|
||||
@@ -73,7 +74,6 @@ use std::{
|
||||
collections::btree_map::Entry,
|
||||
ops::{Deref, Range},
|
||||
};
|
||||
use std::{pin::pin, sync::OnceLock};
|
||||
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
@@ -2831,12 +2831,10 @@ impl Timeline {
|
||||
"initial size calculation",
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
self_clone
|
||||
.initial_logical_size_calculation_task(
|
||||
initial_part_end,
|
||||
cancel_wait_for_background_loop_concurrency_limit_semaphore,
|
||||
cancel,
|
||||
background_ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -2846,11 +2844,24 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
|
||||
/// # Cancellation
|
||||
///
|
||||
/// This method is sensitive to `Timeline::cancel`.
|
||||
///
|
||||
/// It is _not_ sensitive to task_mgr::shutdown_token().
|
||||
///
|
||||
/// The rationale is that we spawn initial logical size calculation
|
||||
/// during activation, hence
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// It does Timeline IO, hence this should be polled to completion because
|
||||
/// we could be leaving in-flight IOs behind, which is safe, but annoying
|
||||
/// to reason about.
|
||||
async fn initial_logical_size_calculation_task(
|
||||
self: Arc<Self>,
|
||||
initial_part_end: Lsn,
|
||||
skip_concurrency_limiter: CancellationToken,
|
||||
cancel: CancellationToken,
|
||||
background_ctx: RequestContext,
|
||||
) {
|
||||
scopeguard::defer! {
|
||||
@@ -2863,7 +2874,6 @@ impl Timeline {
|
||||
let self_ref = &self;
|
||||
let skip_concurrency_limiter = &skip_concurrency_limiter;
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
BackgroundLoopKind::InitialLogicalSizeCalculation,
|
||||
background_ctx,
|
||||
@@ -2877,9 +2887,6 @@ impl Timeline {
|
||||
_ = self_ref.cancel.cancelled() => {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
},
|
||||
() = skip_concurrency_limiter.cancelled() => {
|
||||
// Some action that is part of a end user interaction requested logical size
|
||||
// => break out of the rate limit
|
||||
@@ -2950,22 +2957,18 @@ impl Timeline {
|
||||
)
|
||||
.expect("10min < 1hour"),
|
||||
);
|
||||
tokio::time::sleep(sleep_duration).await;
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(sleep_duration) => {}
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let (calculated_size, metrics_guard) = tokio::select! {
|
||||
res = retrying => {
|
||||
match res {
|
||||
ControlFlow::Continue(calculated_size) => calculated_size,
|
||||
ControlFlow::Break(()) => return,
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
}
|
||||
let (calculated_size, metrics_guard) = match retrying.await {
|
||||
ControlFlow::Continue(calculated_size) => calculated_size,
|
||||
ControlFlow::Break(()) => return,
|
||||
};
|
||||
|
||||
// we cannot query current_logical_size.current_size() to know the current
|
||||
@@ -3021,9 +3024,6 @@ impl Timeline {
|
||||
receiver
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
#[instrument(skip_all)]
|
||||
async fn logical_size_calculation_task(
|
||||
self: &Arc<Self>,
|
||||
@@ -3041,32 +3041,13 @@ impl Timeline {
|
||||
.enter()
|
||||
.map_err(|_| CalculateLogicalSizeError::Cancelled)?;
|
||||
|
||||
let self_calculation = Arc::clone(self);
|
||||
|
||||
let mut calculation = pin!(async {
|
||||
let ctx = ctx.attached_child();
|
||||
self_calculation
|
||||
.calculate_logical_size(lsn, cause, &guard, &ctx)
|
||||
.await
|
||||
});
|
||||
|
||||
tokio::select! {
|
||||
res = &mut calculation => { res }
|
||||
_ = self.cancel.cancelled() => {
|
||||
debug!("cancelling logical size calculation for timeline shutdown");
|
||||
calculation.await
|
||||
}
|
||||
}
|
||||
self.calculate_logical_size(lsn, cause, &guard, ctx).await
|
||||
}
|
||||
|
||||
/// Calculate the logical size of the database at the latest LSN.
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
|
||||
/// especially if we need to download remote layers.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
async fn calculate_logical_size(
|
||||
&self,
|
||||
up_to_lsn: Lsn,
|
||||
|
||||
Reference in New Issue
Block a user