WIP: fix: logical size limit is broken during PS restart

fixes https://github.com/neondatabase/neon/issues/5963

On top of https://github.com/neondatabase/neon/pull/6000

Will ship this in a release after #6000
This commit is contained in:
Christian Schwarz
2023-12-01 11:52:35 +00:00
parent ef1848f002
commit 57a7b2e8e3
3 changed files with 17 additions and 51 deletions

View File

@@ -1738,34 +1738,18 @@ impl Timeline {
) -> logical_size::CurrentLogicalSize {
let current_size = self.current_logical_size.current_size();
debug!("Current size: {current_size:?}");
match (current_size.accuracy(), priority) {
(logical_size::Accuracy::Exact, _) => (), // nothing to do
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
// background task will eventually deliver an exact value, we're in no rush
}
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
// background task is not ready, but user is asking for it now;
// => make the background task skip the line
// (The alternative would be to calculate the size here, but,
// it can actually take a long time if the user has a lot of rels.
// And we'll inevitable need it again; So, let the background task do the work.)
match self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
.get()
{
Some(cancel) => cancel.cancel(),
None => {
warn!("unexpected: priority_tx not set, logical size calculation will not be prioritized");
}
};
}
}
current_size
}
// Returns the exact current logical size; if it's not already computed, it computes it _now_
// (unlike the priority-based path above, which only nudges the background task).
//
// NOTE(review): WIP draft — `TimelineCancelled | CalculationError` is not valid Rust
// (there is no anonymous union in type position); this needs a dedicated error enum
// (or a boxed error) before it compiles — TODO confirm intended error type.
pub(crate) async fn get_current_logical_size_wait_exact(
self: &Arc<Self>,
) -> Result<logical_size::Exact, TimelineCancelled | CalculationError> {
// NOTE(review): `tokio::sync::OnceCell::get_or_try_init` returns a future that is
// never `.await`ed here, and the async block body is still a stub — presumably the
// real size calculation will be inlined; verify before landing.
self.current_logical_size.initial_logical_size.get_or_try_init(async {
// do calculation here
})
}
fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
// nothing to do for freshly created timelines;
@@ -1832,31 +1816,9 @@ impl Timeline {
&cancel,
);
use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, circumstances) = tokio::select! {
res = wait_for_permit => {
match res {
Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
Err(RateLimitError::Cancelled) => {
return Err(BackgroundCalculationError::Cancelled);
}
}
}
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of a end user interaction requested logical size
// => break out of the rate limit
// TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
// but then again what happens if they cancel; also, we should just be using
// one runtime across the entire process, so, let's leave this for now.
(None, StartCircumstances::SkippedConcurrencyLimiter)
}
};
let metrics_guard = if attempt == 1 {
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
} else {
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
};
self.current_logical_size.initial_logical_size.get_or_init(async {
// do calcualtion here
});
match self_ref
.logical_size_calculation_task(

View File

@@ -22,7 +22,7 @@ pub(super) struct LogicalSize {
///
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
/// the initial size at a different LSN.
pub initial_logical_size: OnceCell<(
pub initial_logical_size: tokio::sync::OnceCell<(
u64,
crate::metrics::initial_logical_size::FinishedCalculationGuard,
)>,

View File

@@ -206,6 +206,10 @@ pub(super) async fn connection_manager_loop_step(
if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
info!("Switching to new connection candidate: {new_candidate:?}");
tokio::select! {
logical_size = connection_manager_state.timeline.get_current_logical_size_wait_exact().await,
_ = connection_manager.should_shutdown(),
}
connection_manager_state
.change_connection(new_candidate, ctx)
.await