From d7b29aace7eec730af45e7f12fbe5620545b48aa Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Feb 2024 16:20:02 +0100 Subject: [PATCH] refactor(walredo): don't create WalRedoManager for broken tenants (#6597) When we'll later introduce a global pool of pre-spawned walredo processes (https://github.com/neondatabase/neon/issues/6581), this refactoring avoids plumbing through the reference to the pool to all the places where we create a broken tenant. Builds atop the refactoring in #6583 --- pageserver/src/tenant.rs | 18 +++++++----------- pageserver/src/tenant/tasks.rs | 4 +++- pageserver/src/tenant/timeline.rs | 9 ++++++--- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fe85cf9753..f704f8c0dd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -278,7 +278,7 @@ pub struct Tenant { // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... gc_cs: tokio::sync::Mutex<()>, - walredo_mgr: Arc, + walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage pub(crate) remote_storage: Option, @@ -635,7 +635,7 @@ impl Tenant { conf, attached_conf, shard_identity, - wal_redo_manager, + Some(wal_redo_manager), tenant_shard_id, remote_storage.clone(), deletion_queue_client, @@ -1195,10 +1195,6 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); Arc::new(Tenant::new( TenantState::Broken { reason, @@ -1209,7 +1205,7 @@ impl Tenant { // Shard identity isn't meaningful for a broken tenant: it's just a placeholder // to occupy the slot for this TenantShardId. ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - wal_redo_manager, + None, tenant_shard_id, None, DeletionQueueClient::broken(), @@ -1978,7 +1974,7 @@ impl Tenant { } pub(crate) fn wal_redo_manager_status(&self) -> Option { - self.walredo_mgr.status() + self.walredo_mgr.as_ref().and_then(|mgr| mgr.status()) } /// Changes tenant status to active, unless shutdown was already requested. @@ -2613,7 +2609,7 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - Arc::clone(&self.walredo_mgr), + self.walredo_mgr.as_ref().map(Arc::clone), resources, pg_version, state, @@ -2631,7 +2627,7 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, tenant_shard_id: TenantShardId, remote_storage: Option, deletion_queue_client: DeletionQueueClient, @@ -4055,7 +4051,7 @@ pub(crate) mod harness { .unwrap(), // This is a legacy/test code path: sharding isn't supported here. ShardIdentity::unsharded(), - walredo_mgr, + Some(walredo_mgr), self.tenant_shard_id, Some(self.remote_storage.clone()), self.deletion_queue.new_client(), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5f39c46a84..950cc46e71 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -199,7 +199,9 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. - tenant.walredo_mgr.maybe_quiesce(period * 10); + if let Some(walredo_mgr) = &tenant.walredo_mgr { + walredo_mgr.maybe_quiesce(period * 10); + } // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 43aa178ab5..735b8003b4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -215,8 +215,8 @@ pub struct Timeline { // Atomic would be more appropriate here. last_freeze_ts: RwLock, - // WAL redo manager - walredo_mgr: Arc, + // WAL redo manager. `None` only for broken tenants. + walredo_mgr: Option>, /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. @@ -1427,7 +1427,7 @@ impl Timeline { tenant_shard_id: TenantShardId, generation: Generation, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, resources: TimelineResources, pg_version: u32, state: TimelineState, @@ -4457,6 +4457,9 @@ impl Timeline { let img = match self .walredo_mgr + .as_ref() + .context("timeline has no walredo manager") + .map_err(PageReconstructError::WalRedo)? .request_redo(key, request_lsn, data.img, data.records, self.pg_version) .await .context("reconstruct a page image")