pageserver: shut down idle walredo processes (#5877)

The longer a pageserver runs, the more walredo processes it accumulates
from tenants that are touched intermittently (e.g. by availability
checks). This can lead to getting OOM killed.

Changes:
- Add an Instant recording the last use of the walredo process for a
tenant
- After compaction iteration in the background task, check for idleness
and stop the walredo process if idle for more than 10x compaction
period.

Cc: #3620

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Shany Pozin <shany@neon.tech>
This commit is contained in:
John Spray
2023-11-19 15:21:16 +01:00
committed by GitHub
parent 3b3f040be3
commit d22dce2e31
3 changed files with 33 additions and 0 deletions

View File

@@ -291,6 +291,16 @@ impl From<harness::TestRedoManager> for WalRedoManager {
}
impl WalRedoManager {
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
match self {
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
}
}
}
pub async fn request_redo(
&self,
key: crate::repository::Key,

View File

@@ -198,6 +198,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
tenant.walredo_mgr.maybe_quiesce(period * 10);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await

View File

@@ -91,6 +91,7 @@ struct ProcessOutput {
pub struct PostgresRedoManager {
tenant_id: TenantId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
}
@@ -187,10 +188,26 @@ impl PostgresRedoManager {
PostgresRedoManager {
tenant_id,
conf,
last_redo_at: std::sync::Mutex::default(),
redo_process: RwLock::new(None),
}
}
/// This type doesn't have its own background task to check for idleness: we
/// rely on our owner calling this function periodically in its own housekeeping
/// loops.
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
if let Ok(g) = self.last_redo_at.try_lock() {
if let Some(last_redo_at) = *g {
if last_redo_at.elapsed() >= idle_timeout {
drop(g);
let mut guard = self.redo_process.write().unwrap();
*guard = None;
}
}
}
}
///
/// Process one request for WAL redo using wal-redo postgres
///
@@ -205,6 +222,8 @@ impl PostgresRedoManager {
wal_redo_timeout: Duration,
pg_version: u32,
) -> anyhow::Result<Bytes> {
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;