pageserver: skip waiting for logical size on shard >0 (#7744)

## Problem

Shards with number >0 could hang waiting for
`await_initial_logical_size`, as we don't calculate logical size on
these shards. This causes them to hold onto semaphore units and starve
other tenants out from proceeding with warmup activation.

That doesn't hurt availability (we still have on-demand activation), but
it does mean that some background tasks like consumption metrics would
omit some tenants.

## Summary of changes

- Skip waiting for logical size calculation on shards >0
- Upgrade unexpected code paths to use debug_assert!(), which acts as an
implicit regression test for this issue, and make the info() one into a
warn()
This commit is contained in:
John Spray
2024-05-14 16:39:17 +01:00
committed by GitHub
parent 30d15ad403
commit 82960b2175
2 changed files with 23 additions and 1 deletions

View File

@@ -1127,6 +1127,11 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) fn is_deleting(&self) -> bool {
let mut locked = self.upload_queue.lock().unwrap();
locked.stopped_mut().is_ok()
}
pub(crate) async fn preserve_initdb_archive(
self: &Arc<Self>,
tenant_id: &TenantId,

View File

@@ -2631,6 +2631,7 @@ impl Timeline {
// Don't make noise.
} else {
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
debug_assert!(false);
}
}
};
@@ -4355,6 +4356,21 @@ impl Timeline {
/// this Timeline is shut down. Calling this function will cause the initial
/// logical size calculation to skip waiting for the background jobs barrier.
pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
if !self.shard_identity.is_shard_zero() {
// We don't populate logical size on shard >0: skip waiting for it.
return;
}
if self
.remote_client
.as_ref()
.map(|c| c.is_deleting())
.unwrap_or(false)
{
// The timeline was created in a deletion-resume state, we don't expect logical size to be populated
return;
}
if let Some(await_bg_cancel) = self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -4366,9 +4382,10 @@ impl Timeline {
// the logical size cancellation to skip the concurrency limit semaphore.
// TODO: this is an unexpected case. We should restructure so that it
// can't happen.
tracing::info!(
tracing::warn!(
"await_initial_logical_size: can't get semaphore cancel token, skipping"
);
debug_assert!(false);
}
tokio::select!(