fix(pageserver): flush deletion queue in reload shutdown mode (#9884)
## Problem

Closes https://github.com/neondatabase/neon/issues/9859

## Summary of changes

Ensure that the deletion queue gets fully flushed (i.e., the deletion lists get applied) during a graceful shutdown. An incomplete shutdown can still leave deletion lists behind and cause a race on the next startup, but we assume this is unlikely; even if it happens, the pageserver should already be in a tainted state and the tenant should be re-attached with a new generation number.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
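The flush itself is a oneshot handshake, visible in the mock changes below: the caller enqueues a `FlushOp { tx }` marker and waits on the receiver, and the consumer calls `flush_op.notify()` only after everything queued ahead of the marker has been processed. A minimal, self-contained sketch of that pattern follows; the `Msg`, `Delete`, and `Flush` names here are illustrative stand-ins, not the pageserver's actual types:

```rust
use tokio::sync::{mpsc, oneshot};

// Illustrative stand-in for the deletion queue's message type: either work or a flush marker.
enum Msg {
    Delete(String),
    Flush(oneshot::Sender<()>),
}

async fn consumer(mut rx: mpsc::Receiver<Msg>) {
    while let Some(msg) = rx.recv().await {
        match msg {
            Msg::Delete(path) => {
                // A real consumer would remove the object from remote storage here.
                println!("deleted {path}");
            }
            Msg::Flush(done) => {
                // Everything enqueued before this marker has already been handled, so ack it.
                let _ = done.send(());
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel(16);
    tokio::spawn(consumer(rx));

    tx.send(Msg::Delete("tenant/timeline/layer-000".into()))
        .await
        .unwrap();

    // "flush": enqueue a marker and wait until the consumer has reached it.
    let (done_tx, done_rx) = oneshot::channel();
    tx.send(Msg::Flush(done_tx)).await.unwrap();
    done_rx.await.unwrap();
    println!("queue drained");
}
```

Because the marker travels through the same channel as the deletion work, completion of the oneshot implies that every deletion submitted before the flush has been handled.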
```diff
@@ -1144,18 +1144,24 @@ pub(crate) mod mock {
         rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
         executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
         cancel: CancellationToken,
+        executed: Arc<AtomicUsize>,
     }
 
     impl ConsumerState {
-        async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
-            let mut executed = 0;
+        async fn consume(&mut self, remote_storage: &GenericRemoteStorage) {
+            info!("Executing all pending deletions");
 
-            // Transform all executor messages to generic frontend messages
-            while let Ok(msg) = self.executor_rx.try_recv() {
+            loop {
+                use either::Either;
+                let msg = tokio::select! {
+                    left = self.executor_rx.recv() => Either::Left(left),
+                    right = self.rx.recv() => Either::Right(right),
+                };
                 match msg {
-                    DeleterMessage::Delete(objects) => {
+                    Either::Left(None) => break,
+                    Either::Right(None) => break,
+                    Either::Left(Some(DeleterMessage::Delete(objects))) => {
                         for path in objects {
                             match remote_storage.delete(&path, &self.cancel).await {
                                 Ok(_) => {
@@ -1165,18 +1171,13 @@ pub(crate) mod mock {
                                     error!("Failed to delete {path}, leaking object! ({e})");
                                 }
                             }
-                            executed += 1;
+                            self.executed.fetch_add(1, Ordering::Relaxed);
                         }
                     }
-                    DeleterMessage::Flush(flush_op) => {
+                    Either::Left(Some(DeleterMessage::Flush(flush_op))) => {
                         flush_op.notify();
                     }
-                }
-            }
-
-            while let Ok(msg) = self.rx.try_recv() {
-                match msg {
-                    ListWriterQueueMessage::Delete(op) => {
+                    Either::Right(Some(ListWriterQueueMessage::Delete(op))) => {
                         let mut objects = op.objects;
                         for (layer, meta) in op.layers {
                             objects.push(remote_layer_path(
@@ -1198,33 +1199,27 @@ pub(crate) mod mock {
                                     error!("Failed to delete {path}, leaking object! ({e})");
                                 }
                             }
-                            executed += 1;
+                            self.executed.fetch_add(1, Ordering::Relaxed);
                         }
                     }
-                    ListWriterQueueMessage::Flush(op) => {
+                    Either::Right(Some(ListWriterQueueMessage::Flush(op))) => {
                         op.notify();
                     }
-                    ListWriterQueueMessage::FlushExecute(op) => {
+                    Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => {
                         // We have already executed all prior deletions because mock does them inline
                         op.notify();
                     }
-                    ListWriterQueueMessage::Recover(_) => {
+                    Either::Right(Some(ListWriterQueueMessage::Recover(_))) => {
                         // no-op in mock
                     }
                 }
+                info!("All pending deletions have been executed");
             }
-
-            executed
         }
     }
 
     pub struct MockDeletionQueue {
         tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
         executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
         executed: Arc<AtomicUsize>,
-        remote_storage: Option<GenericRemoteStorage>,
-        consumer: std::sync::Mutex<ConsumerState>,
         lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
     }
```
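The hunks above replace the mock consumer's two sequential `try_recv` drains with a single loop that waits on both queues at once and exits when a channel closes. The same shape in isolation, as a small sketch with placeholder `String` messages instead of the real `DeleterMessage`/`ListWriterQueueMessage` types:

```rust
use either::Either;
use tokio::sync::mpsc;

async fn drain(
    mut executor_rx: mpsc::Receiver<String>,
    mut frontend_rx: mpsc::UnboundedReceiver<String>,
) {
    loop {
        // Wait on both queues at once; `Either` records which channel produced the message.
        let msg = tokio::select! {
            left = executor_rx.recv() => Either::Left(left),
            right = frontend_rx.recv() => Either::Right(right),
        };
        match msg {
            // A closed channel ends the loop, mirroring the mock's shutdown path.
            Either::Left(None) | Either::Right(None) => break,
            Either::Left(Some(m)) => println!("executor message: {m}"),
            Either::Right(Some(m)) => println!("frontend message: {m}"),
        }
    }
}

#[tokio::main]
async fn main() {
    let (exec_tx, exec_rx) = mpsc::channel(8);
    let (front_tx, front_rx) = mpsc::unbounded_channel();

    exec_tx.send("delete layer A".to_string()).await.unwrap();
    front_tx.send("delete layer B".to_string()).unwrap();

    // Dropping the senders closes both channels, so drain() terminates.
    drop(exec_tx);
    drop(front_tx);
    drain(exec_rx, front_rx).await;
}
```

Note that, like the mock's loop, this exits on the first closed channel, so anything still queued on the other side at that moment is dropped.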
```diff
@@ -1235,29 +1230,34 @@ pub(crate) mod mock {
 
             let executed = Arc::new(AtomicUsize::new(0));
 
+            let mut consumer = ConsumerState {
+                rx,
+                executor_rx,
+                cancel: CancellationToken::new(),
+                executed: executed.clone(),
+            };
+
+            tokio::spawn(async move {
+                if let Some(remote_storage) = &remote_storage {
+                    consumer.consume(remote_storage).await;
+                }
+            });
+
             Self {
                 tx,
                 executor_tx,
                 executed,
-                remote_storage,
-                consumer: std::sync::Mutex::new(ConsumerState {
-                    rx,
-                    executor_rx,
-                    cancel: CancellationToken::new(),
-                }),
                 lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
             }
         }
 
-        #[allow(clippy::await_holding_lock)]
         pub async fn pump(&self) {
-            if let Some(remote_storage) = &self.remote_storage {
-                // Permit holding mutex across await, because this is only ever
-                // called once at a time in tests.
-                let mut locked = self.consumer.lock().unwrap();
-                let count = locked.consume(remote_storage).await;
-                self.executed.fetch_add(count, Ordering::Relaxed);
-            }
+            let (tx, rx) = tokio::sync::oneshot::channel();
+            self.executor_tx
+                .send(DeleterMessage::Flush(FlushOp { tx }))
+                .await
+                .expect("Failed to send flush message");
+            rx.await.ok();
         }
 
         pub(crate) fn new_client(&self) -> DeletionQueueClient {
```
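With the consumer moved into a spawned task (constructor hunk above), `consume()` can no longer return a count, so the executed tally is shared through an `Arc<AtomicUsize>`: the task bumps it with `fetch_add` and the owning side reads it. A small sketch of that counter-sharing pattern, using hypothetical names rather than the mock's actual fields:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let executed = Arc::new(AtomicUsize::new(0));
    let (tx, mut rx) = mpsc::unbounded_channel::<String>();

    // The spawned task owns a clone of the counter and bumps it once per processed item.
    let task_counter = executed.clone();
    let worker = tokio::spawn(async move {
        while let Some(_path) = rx.recv().await {
            // A real consumer would delete `_path` from remote storage here.
            task_counter.fetch_add(1, Ordering::Relaxed);
        }
    });

    for i in 0..3 {
        tx.send(format!("object-{i}")).unwrap();
    }
    drop(tx); // closing the channel lets the worker exit
    worker.await.unwrap();

    // The owning side only ever reads the shared counter.
    assert_eq!(executed.load(Ordering::Relaxed), 3);
}
```

Relaxed ordering suffices for a plain tally like this, since nothing else synchronizes through the counter.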
```diff
@@ -3215,6 +3215,18 @@ impl Tenant {
             }
         }
 
+        if let ShutdownMode::Reload = shutdown_mode {
+            tracing::info!("Flushing deletion queue");
+            if let Err(e) = self.deletion_queue_client.flush().await {
+                match e {
+                    DeletionQueueError::ShuttingDown => {
+                        // This is the only error we expect for now. In the future, if more error
+                        // variants are added, we should handle them here.
+                    }
+                }
+            }
+        }
+
         // We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
         // them to continue to do work during their shutdown methods, e.g. flushing data.
         tracing::debug!("Cancelling CancellationToken");
```
```diff
@@ -1960,7 +1960,7 @@ impl TenantManager {
         attempt.before_reset_tenant();
 
         let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Flush).await {
+        match tenant.shutdown(progress, ShutdownMode::Reload).await {
             Ok(()) => {
                 slot_guard.drop_old_value().expect("it was just shutdown");
             }
```
```diff
@@ -894,10 +894,11 @@ pub(crate) enum ShutdownMode {
     /// While we are flushing, we continue to accept read I/O for LSNs ingested before
     /// the call to [`Timeline::shutdown`].
     FreezeAndFlush,
-    /// Only flush the layers to the remote storage without freezing any open layers. This is the
-    /// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
-    /// the generation number.
-    Flush,
+    /// Only flush the layers to the remote storage without freezing any open layers. Flush the deletion
+    /// queue. This is the mode used by ancestor detach and any other operations that reloads a tenant
+    /// but not increasing the generation number. Note that this mode cannot be used at tenant shutdown,
+    /// as flushing the deletion queue at that time will cause shutdown-in-progress errors.
+    Reload,
     /// Shut down immediately, without waiting for any open layers to flush.
     Hard,
 }
```
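The doc comments above spell out the semantics of the remaining modes (`Flush` is renamed to `Reload` and now also flushes the deletion queue). As a rough, condensed sketch of those semantics, using a trimmed-down enum rather than the pageserver's real one:

```rust
/// Condensed stand-in for the shutdown mode enum after this change.
enum ShutdownMode {
    /// Freeze open layers, then flush them; reads of already-ingested LSNs keep working.
    FreezeAndFlush,
    /// Flush layers and the deletion queue without freezing open layers; used when a
    /// tenant is reloaded in place, keeping its generation number.
    Reload,
    /// Stop immediately, without waiting for open layers to flush.
    Hard,
}

fn describe(mode: &ShutdownMode) -> &'static str {
    match mode {
        ShutdownMode::FreezeAndFlush => "freeze and flush open layers",
        ShutdownMode::Reload => "flush layers and the deletion queue",
        ShutdownMode::Hard => "stop without flushing",
    }
}

fn main() {
    let modes = [
        ShutdownMode::FreezeAndFlush,
        ShutdownMode::Reload,
        ShutdownMode::Hard,
    ];
    for mode in &modes {
        println!("{}", describe(mode));
    }
}
```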
```diff
@@ -1818,7 +1819,7 @@ impl Timeline {
             }
         }
 
-        if let ShutdownMode::Flush = mode {
+        if let ShutdownMode::Reload = mode {
             // drain the upload queue
             self.remote_client.shutdown().await;
             if !self.remote_client.no_pending_work() {
```
```diff
@@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline(
     }
 
     // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-    timeline.shutdown(super::ShutdownMode::Flush).await;
+    timeline.shutdown(super::ShutdownMode::Reload).await;
 
     // TODO extend guard mechanism above with method
     // to make deletions possible while offloading is in progress
```