From f73aa3eb3217c52adc7d04692c959419b292a60f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 2 Feb 2024 14:52:53 +0000 Subject: [PATCH] refactor(walredo): avoid the need for a WalRedoManager in broken tenants When we'll later introduce a global pool of pre-spawned walredo processes (https://github.com/neondatabase/neon/issues/6581), this refactoring avoids plumbing through the reference to the pool to all the places where we create a broken tenant. Builds atop the refactoring in #6583 --- pageserver/src/tenant.rs | 18 +++++++----------- pageserver/src/tenant/tasks.rs | 4 +++- pageserver/src/tenant/timeline.rs | 9 ++++++--- pageserver/src/walredo/process.rs | 4 +++- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 58af80238d..10b0237faf 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -276,7 +276,7 @@ pub struct Tenant { // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... gc_cs: tokio::sync::Mutex<()>, - walredo_mgr: Arc, + walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage pub(crate) remote_storage: Option, @@ -630,7 +630,7 @@ impl Tenant { conf, attached_conf, shard_identity, - wal_redo_manager, + Some(wal_redo_manager), tenant_shard_id, remote_storage.clone(), deletion_queue_client, @@ -1184,10 +1184,6 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); Arc::new(Tenant::new( TenantState::Broken { reason, @@ -1198,7 +1194,7 @@ impl Tenant { // Shard identity isn't meaningful for a broken tenant: it's just a placeholder // to occupy the slot for this TenantShardId. ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - wal_redo_manager, + None, tenant_shard_id, None, DeletionQueueClient::broken(), @@ -1967,7 +1963,7 @@ impl Tenant { } pub(crate) fn wal_redo_manager_status(&self) -> Option { - self.walredo_mgr.status() + self.walredo_mgr.as_ref().and_then(|mgr| mgr.status()) } /// Changes tenant status to active, unless shutdown was already requested. @@ -2607,7 +2603,7 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - Arc::clone(&self.walredo_mgr), + self.walredo_mgr.as_ref().map(Arc::clone), resources, pg_version, state, @@ -2625,7 +2621,7 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, tenant_shard_id: TenantShardId, remote_storage: Option, deletion_queue_client: DeletionQueueClient, @@ -4056,7 +4052,7 @@ pub(crate) mod harness { .unwrap(), // This is a legacy/test code path: sharding isn't supported here. ShardIdentity::unsharded(), - walredo_mgr, + Some(walredo_mgr), self.tenant_shard_id, Some(self.remote_storage.clone()), self.deletion_queue.new_client(), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5f39c46a84..950cc46e71 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -199,7 +199,9 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. - tenant.walredo_mgr.maybe_quiesce(period * 10); + if let Some(walredo_mgr) = &tenant.walredo_mgr { + walredo_mgr.maybe_quiesce(period * 10); + } // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 168e565edb..7b5adc4ca6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -215,8 +215,8 @@ pub struct Timeline { // Atomic would be more appropriate here. last_freeze_ts: RwLock, - // WAL redo manager - walredo_mgr: Arc, + // WAL redo manager. `None` only for broken tenants. + walredo_mgr: Option>, /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. @@ -1421,7 +1421,7 @@ impl Timeline { tenant_shard_id: TenantShardId, generation: Generation, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, resources: TimelineResources, pg_version: u32, state: TimelineState, @@ -4445,6 +4445,9 @@ impl Timeline { let img = match self .walredo_mgr + .as_ref() + .context("timeline has no walredo manager") + .map_err(PageReconstructError::WalRedo)? .request_redo(key, request_lsn, data.img, data.records, self.pg_version) .await .context("reconstruct a page image") diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 72f4151dca..85db3b4a4a 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -10,11 +10,13 @@ use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; use postgres_ffi::BLCKSZ; use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; use std::{ collections::VecDeque, io::{Read, Write}, process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{atomic::AtomicUsize, Mutex, MutexGuard}, + sync::{Mutex, MutexGuard}, time::Duration, }; use tracing::{debug, error, instrument, Instrument};