mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 17:32:56 +00:00
## Problem - `shutdown_tasks` would log when a particular task was taking a long time to shut down, but not when it eventually completed. That left one uncertain as to whether the slow task was the source of a hang, or just a precursor. ## Summary of changes - Add a log line after a slow task shutdown - Add an equivalent in Gate's `warn_if_stuck`, in case we ever need it. This isn't related to the original issue but was noticed when checking through these logging paths.
173 lines
6.0 KiB
Rust
173 lines
6.0 KiB
Rust
use std::{sync::Arc, time::Duration};
|
|
|
|
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
|
|
///
|
|
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
|
|
/// the resource calls `close()` when they want to ensure that all holders of guards
|
|
/// have released them, and that no future guards will be issued.
|
|
pub struct Gate {
|
|
/// Each caller of enter() takes one unit from the semaphore. In close(), we
|
|
/// take all the units to ensure all GateGuards are destroyed.
|
|
sem: Arc<tokio::sync::Semaphore>,
|
|
|
|
/// For observability only: a name that will be used to log warnings if a particular
|
|
/// gate is holding up shutdown
|
|
name: String,
|
|
}
|
|
|
|
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
|
/// not complete.
|
|
#[derive(Debug)]
|
|
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
|
|
|
|
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
|
|
async fn warn_if_stuck<Fut: std::future::Future>(
|
|
fut: Fut,
|
|
name: &str,
|
|
warn_period: std::time::Duration,
|
|
) -> <Fut as std::future::Future>::Output {
|
|
let started = std::time::Instant::now();
|
|
|
|
let mut fut = std::pin::pin!(fut);
|
|
|
|
let mut warned = false;
|
|
let ret = loop {
|
|
match tokio::time::timeout(warn_period, &mut fut).await {
|
|
Ok(ret) => break ret,
|
|
Err(_) => {
|
|
tracing::warn!(
|
|
gate = name,
|
|
elapsed_ms = started.elapsed().as_millis(),
|
|
"still waiting, taking longer than expected..."
|
|
);
|
|
warned = true;
|
|
}
|
|
}
|
|
};
|
|
|
|
// If we emitted a warning for slowness, also emit a message when we complete, so that
|
|
// someone debugging a shutdown can know for sure whether we have moved past this operation.
|
|
if warned {
|
|
tracing::info!(
|
|
gate = name,
|
|
elapsed_ms = started.elapsed().as_millis(),
|
|
"completed, after taking longer than expected"
|
|
)
|
|
}
|
|
|
|
ret
|
|
}
|
|
|
|
#[derive(Debug)]
|
|
pub enum GateError {
|
|
GateClosed,
|
|
}
|
|
|
|
impl Gate {
|
|
const MAX_UNITS: u32 = u32::MAX;
|
|
|
|
pub fn new(name: String) -> Self {
|
|
Self {
|
|
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
|
|
name,
|
|
}
|
|
}
|
|
|
|
/// Acquire a guard that will prevent close() calls from completing. If close()
|
|
/// was already called, this will return an error which should be interpreted
|
|
/// as "shutting down".
|
|
///
|
|
/// This function would typically be used from e.g. request handlers. While holding
|
|
/// the guard returned from this function, it is important to respect a CancellationToken
|
|
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
|
|
/// also contain a CancellationToken.
|
|
pub fn enter(&self) -> Result<GateGuard, GateError> {
|
|
self.sem
|
|
.clone()
|
|
.try_acquire_owned()
|
|
.map(GateGuard)
|
|
.map_err(|_| GateError::GateClosed)
|
|
}
|
|
|
|
/// Types with a shutdown() method and a gate should call this method at the
|
|
/// end of shutdown, to ensure that all GateGuard holders are done.
|
|
///
|
|
/// This will wait for all guards to be destroyed. For this to complete promptly, it is
|
|
/// important that the holders of such guards are respecting a CancellationToken which has
|
|
/// been cancelled before entering this function.
|
|
pub async fn close(&self) {
|
|
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
|
|
}
|
|
|
|
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
|
|
/// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
|
|
/// the CancellationToken on such types is analogous to "Did shutdown start?"
|
|
pub fn close_complete(&self) -> bool {
|
|
self.sem.is_closed()
|
|
}
|
|
|
|
async fn do_close(&self) {
|
|
tracing::debug!(gate = self.name, "Closing Gate...");
|
|
match self.sem.acquire_many(Self::MAX_UNITS).await {
|
|
Ok(_units) => {
|
|
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
|
|
self.sem.close();
|
|
}
|
|
Err(_) => {
|
|
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
|
|
// This is legal. Timeline::shutdown for example is not protected from being called more than
|
|
// once.
|
|
tracing::debug!(gate = self.name, "Double close")
|
|
}
|
|
}
|
|
tracing::debug!(gate = self.name, "Closed Gate.")
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use futures::FutureExt;
|
|
|
|
use super::*;
|
|
|
|
#[tokio::test]
|
|
async fn test_idle_gate() {
|
|
// Having taken no gates, we should not be blocked in close
|
|
let gate = Gate::new("test".to_string());
|
|
gate.close().await;
|
|
|
|
// If a guard is dropped before entering, close should not be blocked
|
|
let gate = Gate::new("test".to_string());
|
|
let guard = gate.enter().unwrap();
|
|
drop(guard);
|
|
gate.close().await;
|
|
|
|
// Entering a closed guard fails
|
|
gate.enter().expect_err("enter should fail after close");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_busy_gate() {
|
|
let gate = Gate::new("test".to_string());
|
|
|
|
let guard = gate.enter().unwrap();
|
|
|
|
let mut close_fut = std::pin::pin!(gate.close());
|
|
|
|
// Close should be blocked
|
|
assert!(close_fut.as_mut().now_or_never().is_none());
|
|
|
|
// Attempting to enter() should fail, even though close isn't done yet.
|
|
gate.enter()
|
|
.expect_err("enter should fail after entering close");
|
|
|
|
drop(guard);
|
|
|
|
// Guard is gone, close should finish
|
|
assert!(close_fut.as_mut().now_or_never().is_some());
|
|
|
|
// Attempting to enter() is still forbidden
|
|
gate.enter().expect_err("enter should fail finishing close");
|
|
}
|
|
}
|