From 84ffdc8b4feff67e467ed8b104d4fbe538895a1e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 16 Jan 2023 17:22:23 +0400 Subject: [PATCH] Don't keep FDs open on cancelled timelines in safekeepers. Since PR #3300 we don't remove timelines completely until next restart, so this prevents leakage. fixes https://github.com/neondatabase/neon/issues/3336 --- safekeeper/src/timeline.rs | 15 +++++++-------- safekeeper/src/wal_storage.rs | 11 +++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 038c32afe0..43c395574f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -466,7 +466,7 @@ impl Timeline { Ok(_) => Ok(()), Err(e) => { // Bootstrap failed, cancel timeline and remove timeline directory. - self.cancel(); + self.cancel(shared_state); if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { warn!( @@ -487,20 +487,23 @@ impl Timeline { shared_state: &mut MutexGuard, ) -> Result<(bool, bool)> { let was_active = shared_state.active; - self.cancel(); + self.cancel(shared_state); let dir_existed = delete_dir(&self.timeline_dir)?; Ok((dir_existed, was_active)) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. - fn cancel(&self) { - info!("Timeline {} is cancelled", self.ttid); + fn cancel(&self, shared_state: &mut MutexGuard) { + info!("timeline {} is cancelled", self.ttid); let _ = self.cancellation_tx.send(true); let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); if let Err(e) = res { error!("Failed to send stop signal to wal_backup_launcher: {}", e); } + // Close associated FDs. Nobody will be able to touch timeline data once + // it is cancelled, so WAL storage won't be opened again. + shared_state.sk.wal_store.close(); } /// Returns if timeline is cancelled. @@ -537,10 +540,6 @@ impl Timeline { /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. pub fn on_compute_disconnect(&self) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - let is_wal_backup_action_pending: bool; { let mut shared_state = self.write_shared_state(); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 41457868fe..561104bd27 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -55,6 +55,12 @@ pub trait Storage { /// that without timeline lock. fn remove_up_to(&self) -> Box Result<()>>; + /// Release resources associated with the storage -- technically, close FDs. + /// Currently we don't remove timelines until restart (#3146), so need to + /// spare descriptors. This would be useful for temporary tli detach as + /// well. + fn close(&mut self) {} + /// Get metrics for this timeline. fn get_metrics(&self) -> WalStorageMetrics; } @@ -401,6 +407,11 @@ impl Storage for PhysicalStorage { }) } + fn close(&mut self) { + // close happens in destructor + let _open_file = self.file.take(); + } + fn get_metrics(&self) -> WalStorageMetrics { self.metrics.clone() }