mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-25 23:29:59 +00:00
pageserver: add Gate as a partner to CancellationToken for safe shutdown of Tenant & Timeline (#5711)
## Problem When shutting down a Tenant, it isn't just important to cause any background tasks to stop. It's also important to wait until they have stopped before declaring shutdown complete, in cases where we may re-use the tenant's local storage for something else, such as running in secondary mode, or creating a new tenant with the same ID. ## Summary of changes A `Gate` class is added, inspired by [seastar::gate](https://docs.seastar.io/master/classseastar_1_1gate.html). For types that have an important lifetime that corresponds to some physical resource, use of a Gate as well as a CancellationToken provides a robust pattern for async requests & shutdown: - Requests must always acquire the gate as long as they are using the object - Shutdown must set the cancellation token, and then `close()` the gate to wait for requests in progress before returning. This is not for memory safety: it's for expressing the difference between "Arc<Tenant> exists", and "This tenant's files on disk are eligible to be read/written". - Both Tenant and Timeline get a Gate & CancellationToken. - The Timeline gate is held during eviction of layers, and during page_service requests. - Existing cancellation support in page_service is refined to use the timeline-scope cancellation token instead of a process-scope cancellation token. This replaces the use of `task_mgr::associate_with`: tasks no longer change their tenant/timelineidentity after being spawned. The Tenant's Gate is not yet used, but will be important for Tenant-scoped operations in secondary mode, where we must ensure that our secondary-mode downloads for a tenant are gated wrt the activity of an attached Tenant. This is part of a broader move away from using the global-state driven `task_mgr` shutdown tokens: - less global state where we rely on implicit knowledge of what task a given function is running in, and more explicit references to the cancellation token that a particular function/type will respect, making shutdown easier to reason about. - eventually avoid the big global TASKS mutex. --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>
This commit is contained in:
@@ -728,12 +728,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
|||||||
|
|
||||||
trace!("got query {query_string:?}");
|
trace!("got query {query_string:?}");
|
||||||
if let Err(e) = handler.process_query(self, query_string).await {
|
if let Err(e) = handler.process_query(self, query_string).await {
|
||||||
log_query_error(query_string, &e);
|
match e {
|
||||||
let short_error = short_error(&e);
|
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
|
||||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
e => {
|
||||||
&short_error,
|
log_query_error(query_string, &e);
|
||||||
Some(e.pg_error_code()),
|
let short_error = short_error(&e);
|
||||||
))?;
|
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||||
|
&short_error,
|
||||||
|
Some(e.pg_error_code()),
|
||||||
|
))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1 +1,3 @@
|
|||||||
pub mod heavier_once_cell;
|
pub mod heavier_once_cell;
|
||||||
|
|
||||||
|
pub mod gate;
|
||||||
|
|||||||
151
libs/utils/src/sync/gate.rs
Normal file
151
libs/utils/src/sync/gate.rs
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
use std::{sync::Arc, time::Duration};
|
||||||
|
|
||||||
|
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
|
||||||
|
///
|
||||||
|
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
|
||||||
|
/// the resource calls `close()` when they want to ensure that all holders of guards
|
||||||
|
/// have released them, and that no future guards will be issued.
|
||||||
|
pub struct Gate {
|
||||||
|
/// Each caller of enter() takes one unit from the semaphore. In close(), we
|
||||||
|
/// take all the units to ensure all GateGuards are destroyed.
|
||||||
|
sem: Arc<tokio::sync::Semaphore>,
|
||||||
|
|
||||||
|
/// For observability only: a name that will be used to log warnings if a particular
|
||||||
|
/// gate is holding up shutdown
|
||||||
|
name: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
||||||
|
/// not complete.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
|
||||||
|
|
||||||
|
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
|
||||||
|
async fn warn_if_stuck<Fut: std::future::Future>(
|
||||||
|
fut: Fut,
|
||||||
|
name: &str,
|
||||||
|
warn_period: std::time::Duration,
|
||||||
|
) -> <Fut as std::future::Future>::Output {
|
||||||
|
let started = std::time::Instant::now();
|
||||||
|
|
||||||
|
let mut fut = std::pin::pin!(fut);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match tokio::time::timeout(warn_period, &mut fut).await {
|
||||||
|
Ok(ret) => return ret,
|
||||||
|
Err(_) => {
|
||||||
|
tracing::warn!(
|
||||||
|
gate = name,
|
||||||
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
|
"still waiting, taking longer than expected..."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum GateError {
|
||||||
|
GateClosed,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Gate {
|
||||||
|
const MAX_UNITS: u32 = u32::MAX;
|
||||||
|
|
||||||
|
pub fn new(name: String) -> Self {
|
||||||
|
Self {
|
||||||
|
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
|
||||||
|
name,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Acquire a guard that will prevent close() calls from completing. If close()
|
||||||
|
/// was already called, this will return an error which should be interpreted
|
||||||
|
/// as "shutting down".
|
||||||
|
///
|
||||||
|
/// This function would typically be used from e.g. request handlers. While holding
|
||||||
|
/// the guard returned from this function, it is important to respect a CancellationToken
|
||||||
|
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
|
||||||
|
/// also contain a CancellationToken.
|
||||||
|
pub fn enter(&self) -> Result<GateGuard, GateError> {
|
||||||
|
self.sem
|
||||||
|
.clone()
|
||||||
|
.try_acquire_owned()
|
||||||
|
.map(GateGuard)
|
||||||
|
.map_err(|_| GateError::GateClosed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Types with a shutdown() method and a gate should call this method at the
|
||||||
|
/// end of shutdown, to ensure that all GateGuard holders are done.
|
||||||
|
///
|
||||||
|
/// This will wait for all guards to be destroyed. For this to complete promptly, it is
|
||||||
|
/// important that the holders of such guards are respecting a CancellationToken which has
|
||||||
|
/// been cancelled before entering this function.
|
||||||
|
pub async fn close(&self) {
|
||||||
|
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn do_close(&self) {
|
||||||
|
tracing::debug!(gate = self.name, "Closing Gate...");
|
||||||
|
match self.sem.acquire_many(Self::MAX_UNITS).await {
|
||||||
|
Ok(_units) => {
|
||||||
|
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
|
||||||
|
self.sem.close();
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
|
||||||
|
// This is legal. Timeline::shutdown for example is not protected from being called more than
|
||||||
|
// once.
|
||||||
|
tracing::debug!(gate = self.name, "Double close")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tracing::debug!(gate = self.name, "Closed Gate.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use futures::FutureExt;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_idle_gate() {
|
||||||
|
// Having taken no gates, we should not be blocked in close
|
||||||
|
let gate = Gate::new("test".to_string());
|
||||||
|
gate.close().await;
|
||||||
|
|
||||||
|
// If a guard is dropped before entering, close should not be blocked
|
||||||
|
let gate = Gate::new("test".to_string());
|
||||||
|
let guard = gate.enter().unwrap();
|
||||||
|
drop(guard);
|
||||||
|
gate.close().await;
|
||||||
|
|
||||||
|
// Entering a closed guard fails
|
||||||
|
gate.enter().expect_err("enter should fail after close");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_busy_gate() {
|
||||||
|
let gate = Gate::new("test".to_string());
|
||||||
|
|
||||||
|
let guard = gate.enter().unwrap();
|
||||||
|
|
||||||
|
let mut close_fut = std::pin::pin!(gate.close());
|
||||||
|
|
||||||
|
// Close should be blocked
|
||||||
|
assert!(close_fut.as_mut().now_or_never().is_none());
|
||||||
|
|
||||||
|
// Attempting to enter() should fail, even though close isn't done yet.
|
||||||
|
gate.enter()
|
||||||
|
.expect_err("enter should fail after entering close");
|
||||||
|
|
||||||
|
drop(guard);
|
||||||
|
|
||||||
|
// Guard is gone, close should finish
|
||||||
|
assert!(close_fut.as_mut().now_or_never().is_some());
|
||||||
|
|
||||||
|
// Attempting to enter() is still forbidden
|
||||||
|
gate.enter().expect_err("enter should fail finishing close");
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -403,7 +403,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
return (evicted_bytes, evictions_failed);
|
return (evicted_bytes, evictions_failed);
|
||||||
};
|
};
|
||||||
|
|
||||||
let results = timeline.evict_layers(&batch, &cancel).await;
|
let results = timeline.evict_layers(&batch).await;
|
||||||
|
|
||||||
match results {
|
match results {
|
||||||
Ok(results) => {
|
Ok(results) => {
|
||||||
@@ -554,6 +554,11 @@ async fn collect_eviction_candidates(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if tenant.cancel.is_cancelled() {
|
||||||
|
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// collect layers from all timelines in this tenant
|
// collect layers from all timelines in this tenant
|
||||||
//
|
//
|
||||||
// If one of the timelines becomes `!is_active()` during the iteration,
|
// If one of the timelines becomes `!is_active()` during the iteration,
|
||||||
|
|||||||
@@ -396,6 +396,9 @@ async fn timeline_create_handler(
|
|||||||
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
||||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
||||||
}
|
}
|
||||||
|
Err(tenant::CreateTimelineError::ShuttingDown) => {
|
||||||
|
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
|
||||||
|
}
|
||||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -61,14 +61,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
|||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
// Shut down any page service tasks.
|
|
||||||
timed(
|
|
||||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
|
||||||
"shutdown PageRequestHandlers",
|
|
||||||
Duration::from_secs(1),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// Shut down all the tenants. This flushes everything to disk and kills
|
// Shut down all the tenants. This flushes everything to disk and kills
|
||||||
// the checkpoint and GC tasks.
|
// the checkpoint and GC tasks.
|
||||||
timed(
|
timed(
|
||||||
@@ -78,6 +70,15 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
|||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||||
|
// should already have been canclled via mgr::shutdown_all_tenants
|
||||||
|
timed(
|
||||||
|
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||||
|
"shutdown PageRequestHandlers",
|
||||||
|
Duration::from_secs(1),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||||
if let Some(mut deletion_queue) = deletion_queue {
|
if let Some(mut deletion_queue) = deletion_queue {
|
||||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||||
|
|||||||
@@ -223,13 +223,7 @@ async fn page_service_conn_main(
|
|||||||
// and create a child per-query context when it invokes process_query.
|
// and create a child per-query context when it invokes process_query.
|
||||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||||
// and create the per-query context in process_query ourselves.
|
// and create the per-query context in process_query ourselves.
|
||||||
let mut conn_handler = PageServerHandler::new(
|
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
|
||||||
conf,
|
|
||||||
broker_client,
|
|
||||||
auth,
|
|
||||||
connection_ctx,
|
|
||||||
task_mgr::shutdown_token(),
|
|
||||||
);
|
|
||||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||||
|
|
||||||
match pgbackend
|
match pgbackend
|
||||||
@@ -263,10 +257,6 @@ struct PageServerHandler {
|
|||||||
/// For each query received over the connection,
|
/// For each query received over the connection,
|
||||||
/// `process_query` creates a child context from this one.
|
/// `process_query` creates a child context from this one.
|
||||||
connection_ctx: RequestContext,
|
connection_ctx: RequestContext,
|
||||||
|
|
||||||
/// A token that should fire when the tenant transitions from
|
|
||||||
/// attached state, or when the pageserver is shutting down.
|
|
||||||
cancel: CancellationToken,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PageServerHandler {
|
impl PageServerHandler {
|
||||||
@@ -275,7 +265,6 @@ impl PageServerHandler {
|
|||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
auth: Option<Arc<JwtAuth>>,
|
||||||
connection_ctx: RequestContext,
|
connection_ctx: RequestContext,
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> Self {
|
) -> Self {
|
||||||
PageServerHandler {
|
PageServerHandler {
|
||||||
_conf: conf,
|
_conf: conf,
|
||||||
@@ -283,7 +272,6 @@ impl PageServerHandler {
|
|||||||
auth,
|
auth,
|
||||||
claims: None,
|
claims: None,
|
||||||
connection_ctx,
|
connection_ctx,
|
||||||
cancel,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -291,7 +279,11 @@ impl PageServerHandler {
|
|||||||
/// this rather than naked flush() in order to shut down promptly. Without this, we would
|
/// this rather than naked flush() in order to shut down promptly. Without this, we would
|
||||||
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
|
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
|
||||||
/// in the flush.
|
/// in the flush.
|
||||||
async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
|
async fn flush_cancellable<IO>(
|
||||||
|
&self,
|
||||||
|
pgb: &mut PostgresBackend<IO>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<(), QueryError>
|
||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
@@ -299,7 +291,7 @@ impl PageServerHandler {
|
|||||||
flush_r = pgb.flush() => {
|
flush_r = pgb.flush() => {
|
||||||
Ok(flush_r?)
|
Ok(flush_r?)
|
||||||
},
|
},
|
||||||
_ = self.cancel.cancelled() => {
|
_ = cancel.cancelled() => {
|
||||||
Err(QueryError::Shutdown)
|
Err(QueryError::Shutdown)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -308,6 +300,7 @@ impl PageServerHandler {
|
|||||||
fn copyin_stream<'a, IO>(
|
fn copyin_stream<'a, IO>(
|
||||||
&'a self,
|
&'a self,
|
||||||
pgb: &'a mut PostgresBackend<IO>,
|
pgb: &'a mut PostgresBackend<IO>,
|
||||||
|
cancel: &'a CancellationToken,
|
||||||
) -> impl Stream<Item = io::Result<Bytes>> + 'a
|
) -> impl Stream<Item = io::Result<Bytes>> + 'a
|
||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
@@ -317,7 +310,7 @@ impl PageServerHandler {
|
|||||||
let msg = tokio::select! {
|
let msg = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
|
||||||
_ = self.cancel.cancelled() => {
|
_ = cancel.cancelled() => {
|
||||||
// We were requested to shut down.
|
// We were requested to shut down.
|
||||||
let msg = "pageserver is shutting down";
|
let msg = "pageserver is shutting down";
|
||||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||||
@@ -357,7 +350,7 @@ impl PageServerHandler {
|
|||||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||||
// error can't happen here, ErrorResponse serialization should be always ok
|
// error can't happen here, ErrorResponse serialization should be always ok
|
||||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||||
self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
||||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||||
}
|
}
|
||||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||||
@@ -384,10 +377,6 @@ impl PageServerHandler {
|
|||||||
{
|
{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// NOTE: pagerequests handler exits when connection is closed,
|
|
||||||
// so there is no need to reset the association
|
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
|
|
||||||
// Make request tracer if needed
|
// Make request tracer if needed
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||||
let mut tracer = if tenant.get_trace_read_requests() {
|
let mut tracer = if tenant.get_trace_read_requests() {
|
||||||
@@ -405,9 +394,14 @@ impl PageServerHandler {
|
|||||||
.get_timeline(timeline_id, true)
|
.get_timeline(timeline_id, true)
|
||||||
.map_err(|e| anyhow::anyhow!(e))?;
|
.map_err(|e| anyhow::anyhow!(e))?;
|
||||||
|
|
||||||
|
// Avoid starting new requests if the timeline has already started shutting down,
|
||||||
|
// and block timeline shutdown until this request is complete, or drops out due
|
||||||
|
// to cancellation.
|
||||||
|
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||||
|
|
||||||
// switch client to COPYBOTH
|
// switch client to COPYBOTH
|
||||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
|
|
||||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||||
|
|
||||||
@@ -415,7 +409,7 @@ impl PageServerHandler {
|
|||||||
let msg = tokio::select! {
|
let msg = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
|
||||||
_ = self.cancel.cancelled() => {
|
_ = timeline.cancel.cancelled() => {
|
||||||
// We were requested to shut down.
|
// We were requested to shut down.
|
||||||
info!("shutdown request received in page handler");
|
info!("shutdown request received in page handler");
|
||||||
return Err(QueryError::Shutdown)
|
return Err(QueryError::Shutdown)
|
||||||
@@ -490,9 +484,20 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if let Err(e) = &response {
|
||||||
|
if timeline.cancel.is_cancelled() {
|
||||||
|
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||||
|
// shutdown, then do not send the error to the client. Instead just drop the
|
||||||
|
// connection.
|
||||||
|
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
|
||||||
|
return Err(QueryError::Shutdown);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let response = response.unwrap_or_else(|e| {
|
let response = response.unwrap_or_else(|e| {
|
||||||
// print the all details to the log with {:#}, but for the client the
|
// print the all details to the log with {:#}, but for the client the
|
||||||
// error message is enough
|
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||||
|
// here includes cancellation which is not an error.
|
||||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||||
message: e.to_string(),
|
message: e.to_string(),
|
||||||
@@ -500,7 +505,7 @@ impl PageServerHandler {
|
|||||||
});
|
});
|
||||||
|
|
||||||
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -522,7 +527,6 @@ impl PageServerHandler {
|
|||||||
{
|
{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
// Create empty timeline
|
// Create empty timeline
|
||||||
info!("creating new timeline");
|
info!("creating new timeline");
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||||
@@ -543,9 +547,9 @@ impl PageServerHandler {
|
|||||||
// Import basebackup provided via CopyData
|
// Import basebackup provided via CopyData
|
||||||
info!("importing basebackup");
|
info!("importing basebackup");
|
||||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &tenant.cancel).await?;
|
||||||
|
|
||||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
|
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||||
timeline
|
timeline
|
||||||
.import_basebackup_from_tar(
|
.import_basebackup_from_tar(
|
||||||
&mut copyin_reader,
|
&mut copyin_reader,
|
||||||
@@ -582,7 +586,6 @@ impl PageServerHandler {
|
|||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
|
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||||
let last_record_lsn = timeline.get_last_record_lsn();
|
let last_record_lsn = timeline.get_last_record_lsn();
|
||||||
@@ -598,8 +601,8 @@ impl PageServerHandler {
|
|||||||
// Import wal provided via CopyData
|
// Import wal provided via CopyData
|
||||||
info!("importing wal");
|
info!("importing wal");
|
||||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
|
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
|
||||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||||
info!("wal import complete");
|
info!("wal import complete");
|
||||||
|
|
||||||
@@ -807,7 +810,7 @@ impl PageServerHandler {
|
|||||||
|
|
||||||
// switch client to COPYOUT
|
// switch client to COPYOUT
|
||||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
|
|
||||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||||
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||||
@@ -859,7 +862,7 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
|
|
||||||
let basebackup_after = started
|
let basebackup_after = started
|
||||||
.elapsed()
|
.elapsed()
|
||||||
|
|||||||
@@ -44,6 +44,17 @@ pub enum CalculateLogicalSizeError {
|
|||||||
Other(#[from] anyhow::Error),
|
Other(#[from] anyhow::Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||||
|
fn from(pre: PageReconstructError) -> Self {
|
||||||
|
match pre {
|
||||||
|
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
|
||||||
|
Self::Cancelled
|
||||||
|
}
|
||||||
|
_ => Self::Other(pre.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
pub enum RelationError {
|
pub enum RelationError {
|
||||||
#[error("Relation Already Exists")]
|
#[error("Relation Already Exists")]
|
||||||
@@ -573,7 +584,7 @@ impl Timeline {
|
|||||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// Fetch list of database dirs and iterate them
|
// Fetch list of database dirs and iterate them
|
||||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
|
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||||
|
|
||||||
let mut total_size: u64 = 0;
|
let mut total_size: u64 = 0;
|
||||||
@@ -587,10 +598,7 @@ impl Timeline {
|
|||||||
return Err(CalculateLogicalSizeError::Cancelled);
|
return Err(CalculateLogicalSizeError::Cancelled);
|
||||||
}
|
}
|
||||||
let relsize_key = rel_size_to_key(rel);
|
let relsize_key = rel_size_to_key(rel);
|
||||||
let mut buf = self
|
let mut buf = self.get(relsize_key, lsn, ctx).await?;
|
||||||
.get(relsize_key, lsn, ctx)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("read relation size of {rel:?}"))?;
|
|
||||||
let relsize = buf.get_u32_le();
|
let relsize = buf.get_u32_le();
|
||||||
|
|
||||||
total_size += relsize as u64;
|
total_size += relsize as u64;
|
||||||
|
|||||||
@@ -299,10 +299,6 @@ pub enum TaskKind {
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct MutableTaskState {
|
struct MutableTaskState {
|
||||||
/// Tenant and timeline that this task is associated with.
|
|
||||||
tenant_id: Option<TenantId>,
|
|
||||||
timeline_id: Option<TimelineId>,
|
|
||||||
|
|
||||||
/// Handle for waiting for the task to exit. It can be None, if the
|
/// Handle for waiting for the task to exit. It can be None, if the
|
||||||
/// the task has already exited.
|
/// the task has already exited.
|
||||||
join_handle: Option<JoinHandle<()>>,
|
join_handle: Option<JoinHandle<()>>,
|
||||||
@@ -319,6 +315,11 @@ struct PageServerTask {
|
|||||||
// To request task shutdown, just cancel this token.
|
// To request task shutdown, just cancel this token.
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
|
|
||||||
|
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||||
|
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||||
|
tenant_id: Option<TenantId>,
|
||||||
|
timeline_id: Option<TimelineId>,
|
||||||
|
|
||||||
mutable: Mutex<MutableTaskState>,
|
mutable: Mutex<MutableTaskState>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -344,11 +345,9 @@ where
|
|||||||
kind,
|
kind,
|
||||||
name: name.to_string(),
|
name: name.to_string(),
|
||||||
cancel: cancel.clone(),
|
cancel: cancel.clone(),
|
||||||
mutable: Mutex::new(MutableTaskState {
|
tenant_id,
|
||||||
tenant_id,
|
timeline_id,
|
||||||
timeline_id,
|
mutable: Mutex::new(MutableTaskState { join_handle: None }),
|
||||||
join_handle: None,
|
|
||||||
}),
|
|
||||||
});
|
});
|
||||||
|
|
||||||
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
||||||
@@ -418,8 +417,6 @@ async fn task_finish(
|
|||||||
|
|
||||||
let mut shutdown_process = false;
|
let mut shutdown_process = false;
|
||||||
{
|
{
|
||||||
let task_mut = task.mutable.lock().unwrap();
|
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(Ok(())) => {
|
Ok(Ok(())) => {
|
||||||
debug!("Task '{}' exited normally", task_name);
|
debug!("Task '{}' exited normally", task_name);
|
||||||
@@ -428,13 +425,13 @@ async fn task_finish(
|
|||||||
if shutdown_process_on_error {
|
if shutdown_process_on_error {
|
||||||
error!(
|
error!(
|
||||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
shutdown_process = true;
|
shutdown_process = true;
|
||||||
} else {
|
} else {
|
||||||
error!(
|
error!(
|
||||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -442,13 +439,13 @@ async fn task_finish(
|
|||||||
if shutdown_process_on_error {
|
if shutdown_process_on_error {
|
||||||
error!(
|
error!(
|
||||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
shutdown_process = true;
|
shutdown_process = true;
|
||||||
} else {
|
} else {
|
||||||
error!(
|
error!(
|
||||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -460,17 +457,6 @@ async fn task_finish(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// expected to be called from the task of the given id.
|
|
||||||
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
|
|
||||||
CURRENT_TASK.with(|ct| {
|
|
||||||
let mut task_mut = ct.mutable.lock().unwrap();
|
|
||||||
task_mut.tenant_id = tenant_id;
|
|
||||||
task_mut.timeline_id = timeline_id;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Is there a task running that matches the criteria
|
|
||||||
|
|
||||||
/// Signal and wait for tasks to shut down.
|
/// Signal and wait for tasks to shut down.
|
||||||
///
|
///
|
||||||
///
|
///
|
||||||
@@ -493,17 +479,16 @@ pub async fn shutdown_tasks(
|
|||||||
{
|
{
|
||||||
let tasks = TASKS.lock().unwrap();
|
let tasks = TASKS.lock().unwrap();
|
||||||
for task in tasks.values() {
|
for task in tasks.values() {
|
||||||
let task_mut = task.mutable.lock().unwrap();
|
|
||||||
if (kind.is_none() || Some(task.kind) == kind)
|
if (kind.is_none() || Some(task.kind) == kind)
|
||||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
|
||||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||||
{
|
{
|
||||||
task.cancel.cancel();
|
task.cancel.cancel();
|
||||||
victim_tasks.push((
|
victim_tasks.push((
|
||||||
Arc::clone(task),
|
Arc::clone(task),
|
||||||
task.kind,
|
task.kind,
|
||||||
task_mut.tenant_id,
|
task.tenant_id,
|
||||||
task_mut.timeline_id,
|
task.timeline_id,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ use tracing::*;
|
|||||||
use utils::completion;
|
use utils::completion;
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::fs_ext;
|
use utils::fs_ext;
|
||||||
|
use utils::sync::gate::Gate;
|
||||||
|
|
||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
@@ -252,6 +253,14 @@ pub struct Tenant {
|
|||||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||||
|
|
||||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||||
|
|
||||||
|
// Cancellation token fires when we have entered shutdown(). This is a parent of
|
||||||
|
// Timelines' cancellation token.
|
||||||
|
pub(crate) cancel: CancellationToken,
|
||||||
|
|
||||||
|
// Users of the Tenant such as the page service must take this Gate to avoid
|
||||||
|
// trying to use a Tenant which is shutting down.
|
||||||
|
pub(crate) gate: Gate,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) enum WalRedoManager {
|
pub(crate) enum WalRedoManager {
|
||||||
@@ -395,6 +404,8 @@ pub enum CreateTimelineError {
|
|||||||
AncestorLsn(anyhow::Error),
|
AncestorLsn(anyhow::Error),
|
||||||
#[error("ancestor timeline is not active")]
|
#[error("ancestor timeline is not active")]
|
||||||
AncestorNotActive,
|
AncestorNotActive,
|
||||||
|
#[error("tenant shutting down")]
|
||||||
|
ShuttingDown,
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Other(#[from] anyhow::Error),
|
Other(#[from] anyhow::Error),
|
||||||
}
|
}
|
||||||
@@ -1524,6 +1535,11 @@ impl Tenant {
|
|||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let _gate = self
|
||||||
|
.gate
|
||||||
|
.enter()
|
||||||
|
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
||||||
|
|
||||||
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
|
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
|
||||||
debug!("timeline {new_timeline_id} already exists");
|
debug!("timeline {new_timeline_id} already exists");
|
||||||
|
|
||||||
@@ -1808,6 +1824,7 @@ impl Tenant {
|
|||||||
freeze_and_flush: bool,
|
freeze_and_flush: bool,
|
||||||
) -> Result<(), completion::Barrier> {
|
) -> Result<(), completion::Barrier> {
|
||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
// Set tenant (and its timlines) to Stoppping state.
|
// Set tenant (and its timlines) to Stoppping state.
|
||||||
//
|
//
|
||||||
// Since we can only transition into Stopping state after activation is complete,
|
// Since we can only transition into Stopping state after activation is complete,
|
||||||
@@ -1846,6 +1863,7 @@ impl Tenant {
|
|||||||
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
|
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
|
||||||
})
|
})
|
||||||
};
|
};
|
||||||
|
tracing::info!("Waiting for timelines...");
|
||||||
while let Some(res) = js.join_next().await {
|
while let Some(res) = js.join_next().await {
|
||||||
match res {
|
match res {
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
@@ -1855,12 +1873,21 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
|
||||||
|
// them to continue to do work during their shutdown methods, e.g. flushing data.
|
||||||
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
|
self.cancel.cancel();
|
||||||
|
|
||||||
// shutdown all tenant and timeline tasks: gc, compaction, page service
|
// shutdown all tenant and timeline tasks: gc, compaction, page service
|
||||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||||
//
|
//
|
||||||
// this will additionally shutdown and await all timeline tasks.
|
// this will additionally shutdown and await all timeline tasks.
|
||||||
|
tracing::debug!("Waiting for tasks...");
|
||||||
task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;
|
task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;
|
||||||
|
|
||||||
|
// Wait for any in-flight operations to complete
|
||||||
|
self.gate.close().await;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2267,6 +2294,7 @@ impl Tenant {
|
|||||||
initial_logical_size_can_start.cloned(),
|
initial_logical_size_can_start.cloned(),
|
||||||
initial_logical_size_attempt.cloned().flatten(),
|
initial_logical_size_attempt.cloned().flatten(),
|
||||||
state,
|
state,
|
||||||
|
self.cancel.child_token(),
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(timeline)
|
Ok(timeline)
|
||||||
@@ -2356,6 +2384,8 @@ impl Tenant {
|
|||||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||||
|
cancel: CancellationToken::default(),
|
||||||
|
gate: Gate::new(format!("Tenant<{tenant_id}>")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -406,10 +406,12 @@ async fn fill_logical_sizes(
|
|||||||
have_any_error = true;
|
have_any_error = true;
|
||||||
}
|
}
|
||||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
|
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
|
||||||
warn!(
|
if !matches!(error, CalculateLogicalSizeError::Cancelled) {
|
||||||
timeline_id=%timeline.timeline_id,
|
warn!(
|
||||||
"failed to calculate logical size at {lsn}: {error:#}"
|
timeline_id=%timeline.timeline_id,
|
||||||
);
|
"failed to calculate logical size at {lsn}: {error:#}"
|
||||||
|
);
|
||||||
|
}
|
||||||
have_any_error = true;
|
have_any_error = true;
|
||||||
}
|
}
|
||||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
|
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ use tokio::{
|
|||||||
};
|
};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::{id::TenantTimelineId, sync::gate::Gate};
|
||||||
|
|
||||||
use std::cmp::{max, min, Ordering};
|
use std::cmp::{max, min, Ordering};
|
||||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||||
@@ -310,6 +310,13 @@ pub struct Timeline {
|
|||||||
/// Load or creation time information about the disk_consistent_lsn and when the loading
|
/// Load or creation time information about the disk_consistent_lsn and when the loading
|
||||||
/// happened. Used for consumption metrics.
|
/// happened. Used for consumption metrics.
|
||||||
pub(crate) loaded_at: (Lsn, SystemTime),
|
pub(crate) loaded_at: (Lsn, SystemTime),
|
||||||
|
|
||||||
|
/// Gate to prevent shutdown completing while I/O is still happening to this timeline's data
|
||||||
|
pub(crate) gate: Gate,
|
||||||
|
|
||||||
|
/// Cancellation token scoped to this timeline: anything doing long-running work relating
|
||||||
|
/// to the timeline should drop out when this token fires.
|
||||||
|
pub(crate) cancel: CancellationToken,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalReceiverInfo {
|
pub struct WalReceiverInfo {
|
||||||
@@ -786,7 +793,11 @@ impl Timeline {
|
|||||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||||
// error but continue.
|
// error but continue.
|
||||||
error!("could not compact, repartitioning keyspace failed: {err:?}");
|
//
|
||||||
|
// Suppress error when it's due to cancellation
|
||||||
|
if !self.cancel.is_cancelled() {
|
||||||
|
error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -884,7 +895,12 @@ impl Timeline {
|
|||||||
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
|
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
|
// Signal any subscribers to our cancellation token to drop out
|
||||||
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
|
self.cancel.cancel();
|
||||||
|
|
||||||
// prevent writes to the InMemoryLayer
|
// prevent writes to the InMemoryLayer
|
||||||
|
tracing::debug!("Waiting for WalReceiverManager...");
|
||||||
task_mgr::shutdown_tasks(
|
task_mgr::shutdown_tasks(
|
||||||
Some(TaskKind::WalReceiverManager),
|
Some(TaskKind::WalReceiverManager),
|
||||||
Some(self.tenant_id),
|
Some(self.tenant_id),
|
||||||
@@ -920,6 +936,16 @@ impl Timeline {
|
|||||||
warn!("failed to await for frozen and flushed uploads: {e:#}");
|
warn!("failed to await for frozen and flushed uploads: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
|
||||||
|
// while doing so.
|
||||||
|
self.last_record_lsn.shutdown();
|
||||||
|
|
||||||
|
tracing::debug!("Waiting for tasks...");
|
||||||
|
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
|
||||||
|
|
||||||
|
// Finally wait until any gate-holders are complete
|
||||||
|
self.gate.close().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_state(&self, new_state: TimelineState) {
|
pub fn set_state(&self, new_state: TimelineState) {
|
||||||
@@ -1048,6 +1074,11 @@ impl Timeline {
|
|||||||
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
|
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
|
||||||
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
||||||
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||||
|
let _gate = self
|
||||||
|
.gate
|
||||||
|
.enter()
|
||||||
|
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
|
||||||
|
|
||||||
let Some(local_layer) = self.find_layer(layer_file_name).await else {
|
let Some(local_layer) = self.find_layer(layer_file_name).await else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
@@ -1063,9 +1094,8 @@ impl Timeline {
|
|||||||
.as_ref()
|
.as_ref()
|
||||||
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
|
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
|
||||||
let results = self
|
let results = self
|
||||||
.evict_layer_batch(remote_client, &[local_layer], &cancel)
|
.evict_layer_batch(remote_client, &[local_layer])
|
||||||
.await?;
|
.await?;
|
||||||
assert_eq!(results.len(), 1);
|
assert_eq!(results.len(), 1);
|
||||||
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
|
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
|
||||||
@@ -1080,15 +1110,18 @@ impl Timeline {
|
|||||||
pub(crate) async fn evict_layers(
|
pub(crate) async fn evict_layers(
|
||||||
&self,
|
&self,
|
||||||
layers_to_evict: &[Layer],
|
layers_to_evict: &[Layer],
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
||||||
|
let _gate = self
|
||||||
|
.gate
|
||||||
|
.enter()
|
||||||
|
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
|
||||||
|
|
||||||
let remote_client = self
|
let remote_client = self
|
||||||
.remote_client
|
.remote_client
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.context("timeline must have RemoteTimelineClient")?;
|
.context("timeline must have RemoteTimelineClient")?;
|
||||||
|
|
||||||
self.evict_layer_batch(remote_client, layers_to_evict, cancel)
|
self.evict_layer_batch(remote_client, layers_to_evict).await
|
||||||
.await
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Evict multiple layers at once, continuing through errors.
|
/// Evict multiple layers at once, continuing through errors.
|
||||||
@@ -1109,7 +1142,6 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
remote_client: &Arc<RemoteTimelineClient>,
|
remote_client: &Arc<RemoteTimelineClient>,
|
||||||
layers_to_evict: &[Layer],
|
layers_to_evict: &[Layer],
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
||||||
// ensure that the layers have finished uploading
|
// ensure that the layers have finished uploading
|
||||||
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
||||||
@@ -1157,7 +1189,7 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = cancel.cancelled() => {},
|
_ = self.cancel.cancelled() => {},
|
||||||
_ = join => {}
|
_ = join => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1267,6 +1299,7 @@ impl Timeline {
|
|||||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||||
initial_logical_size_attempt: Option<completion::Completion>,
|
initial_logical_size_attempt: Option<completion::Completion>,
|
||||||
state: TimelineState,
|
state: TimelineState,
|
||||||
|
cancel: CancellationToken,
|
||||||
) -> Arc<Self> {
|
) -> Arc<Self> {
|
||||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||||
let (state, _) = watch::channel(state);
|
let (state, _) = watch::channel(state);
|
||||||
@@ -1367,6 +1400,8 @@ impl Timeline {
|
|||||||
|
|
||||||
initial_logical_size_can_start,
|
initial_logical_size_can_start,
|
||||||
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
||||||
|
cancel,
|
||||||
|
gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")),
|
||||||
};
|
};
|
||||||
result.repartition_threshold =
|
result.repartition_threshold =
|
||||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||||
@@ -2030,6 +2065,10 @@ impl Timeline {
|
|||||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||||
|
|
||||||
'outer: loop {
|
'outer: loop {
|
||||||
|
if self.cancel.is_cancelled() {
|
||||||
|
return Err(PageReconstructError::Cancelled);
|
||||||
|
}
|
||||||
|
|
||||||
// The function should have updated 'state'
|
// The function should have updated 'state'
|
||||||
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
||||||
match result {
|
match result {
|
||||||
@@ -4366,25 +4405,10 @@ mod tests {
|
|||||||
.expect("should had been resident")
|
.expect("should had been resident")
|
||||||
.drop_eviction_guard();
|
.drop_eviction_guard();
|
||||||
|
|
||||||
let cancel = tokio_util::sync::CancellationToken::new();
|
|
||||||
let batch = [layer];
|
let batch = [layer];
|
||||||
|
|
||||||
let first = {
|
let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
|
||||||
let cancel = cancel.child_token();
|
let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
|
||||||
async {
|
|
||||||
let cancel = cancel;
|
|
||||||
timeline
|
|
||||||
.evict_layer_batch(&rc, &batch, &cancel)
|
|
||||||
.await
|
|
||||||
.unwrap()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let second = async {
|
|
||||||
timeline
|
|
||||||
.evict_layer_batch(&rc, &batch, &cancel)
|
|
||||||
.await
|
|
||||||
.unwrap()
|
|
||||||
};
|
|
||||||
|
|
||||||
let (first, second) = tokio::join!(first, second);
|
let (first, second) = tokio::join!(first, second);
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ use crate::{
|
|||||||
deletion_queue::DeletionQueueClient,
|
deletion_queue::DeletionQueueClient,
|
||||||
task_mgr::{self, TaskKind},
|
task_mgr::{self, TaskKind},
|
||||||
tenant::{
|
tenant::{
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||||
metadata::TimelineMetadata,
|
metadata::TimelineMetadata,
|
||||||
remote_timeline_client::{
|
remote_timeline_client::{
|
||||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||||
@@ -30,6 +31,11 @@ use super::{Timeline, TimelineResources};
|
|||||||
|
|
||||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
// Notify any timeline work to drop out of loops/requests
|
||||||
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
|
timeline.cancel.cancel();
|
||||||
|
|
||||||
// Stop the walreceiver first.
|
// Stop the walreceiver first.
|
||||||
debug!("waiting for wal receiver to shutdown");
|
debug!("waiting for wal receiver to shutdown");
|
||||||
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
||||||
@@ -74,6 +80,11 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
|||||||
"failpoint: timeline-delete-before-index-deleted-at"
|
"failpoint: timeline-delete-before-index-deleted-at"
|
||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
|
tracing::debug!("Waiting for gate...");
|
||||||
|
timeline.gate.close().await;
|
||||||
|
tracing::debug!("Shutdown complete");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -277,10 +277,7 @@ impl Timeline {
|
|||||||
Some(c) => c,
|
Some(c) => c,
|
||||||
};
|
};
|
||||||
|
|
||||||
let results = match self
|
let results = match self.evict_layer_batch(remote_client, &candidates).await {
|
||||||
.evict_layer_batch(remote_client, &candidates, cancel)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Err(pre_err) => {
|
Err(pre_err) => {
|
||||||
stats.errors += candidates.len();
|
stats.errors += candidates.len();
|
||||||
error!("could not do any evictions: {pre_err:#}");
|
error!("could not do any evictions: {pre_err:#}");
|
||||||
|
|||||||
@@ -426,7 +426,7 @@ impl ConnectionManagerState {
|
|||||||
timeline,
|
timeline,
|
||||||
new_sk.wal_source_connconf,
|
new_sk.wal_source_connconf,
|
||||||
events_sender,
|
events_sender,
|
||||||
cancellation,
|
cancellation.clone(),
|
||||||
connect_timeout,
|
connect_timeout,
|
||||||
ctx,
|
ctx,
|
||||||
node_id,
|
node_id,
|
||||||
@@ -447,7 +447,14 @@ impl ConnectionManagerState {
|
|||||||
}
|
}
|
||||||
WalReceiverError::Other(e) => {
|
WalReceiverError::Other(e) => {
|
||||||
// give out an error to have task_mgr give it a really verbose logging
|
// give out an error to have task_mgr give it a really verbose logging
|
||||||
Err(e).context("walreceiver connection handling failure")
|
if cancellation.is_cancelled() {
|
||||||
|
// Ideally we would learn about this via some path other than Other, but
|
||||||
|
// that requires refactoring all the intermediate layers of ingest code
|
||||||
|
// that only emit anyhow::Error
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(e).context("walreceiver connection handling failure")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,10 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
|
|||||||
n_restarts = 10
|
n_restarts = 10
|
||||||
scale = 10
|
scale = 10
|
||||||
|
|
||||||
|
# Pageserver currently logs requests on non-active tenants at error level
|
||||||
|
# https://github.com/neondatabase/neon/issues/5784
|
||||||
|
env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*")
|
||||||
|
|
||||||
def run_pgbench(connstr: str):
|
def run_pgbench(connstr: str):
|
||||||
log.info(f"Start a pgbench workload on pg {connstr}")
|
log.info(f"Start a pgbench workload on pg {connstr}")
|
||||||
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
|
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
|
||||||
|
|||||||
Reference in New Issue
Block a user