From 8360307ea0acfa4069533eadbd284a2abf1d80e1 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Thu, 26 Oct 2023 14:00:26 +0100
Subject: [PATCH] pageserver: exponential backoff on compaction/GC failures
 (#5672)

Previously, if the walredo process crashed, we would try to spawn a fresh
one every 2 seconds. This is expensive in itself, but it also results in a
high I/O load, because the part of the compaction that ran before the
failure is re-run every 2 seconds.

Closes: https://github.com/neondatabase/neon/issues/5671
---
 pageserver/src/tenant/tasks.rs        | 40 ++++++++++++++++++++++-----
 test_runner/fixtures/neon_fixtures.py |  6 ++--
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index a845456181..f419d2e73a 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -12,7 +12,7 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::completion;
+use utils::{backoff, completion};
 
 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
     once_cell::sync::Lazy::new(|| {
@@ -139,7 +139,10 @@ pub fn start_background_loops(
 /// Compaction task's main loop
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -173,9 +176,19 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             } else {
                 // Run compaction
                 if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
-                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                 } else {
+                    error_run_count = 0;
                     period
                 }
             };
@@ -210,7 +223,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 /// GC task's main loop
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
         // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -249,9 +265,19 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                     .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
                     .await;
                 if let Err(e) = res {
-                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                 } else {
+                    error_run_count = 0;
                     period
                 }
             };
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0e9f823860..9184f0c43a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1616,16 +1616,16 @@ class NeonPageserver(PgProtocol):
             ".*wait for layer upload ops to complete.*",  # .*Caused by:.*wait_completion aborted because upload queue was stopped
             ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping",  # When gc checks timeline state after acquiring layer_removal_cs
             ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant",  # Tenant::gc precondition
-            ".*compaction_loop.*Compaction failed, retrying in.*timeline or pageserver is shutting down",  # When compaction checks timeline state after acquiring layer_removal_cs
+            ".*compaction_loop.*Compaction failed.*, retrying in.*timeline or pageserver is shutting down",  # When compaction checks timeline state after acquiring layer_removal_cs
             ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
             ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
             ".*task iteration took longer than the configured period.*",
             # this is until #3501
-            ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
+            ".*Compaction failed.*, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
             # these can happen anytime we do compactions from background task and shutdown pageserver
             r".*ERROR.*ancestor timeline \S+ is being stopped",
             # this is expected given our collaborative shutdown approach for the UploadQueue
-            ".*Compaction failed, retrying in .*: queue is in state Stopped.*",
+            ".*Compaction failed.*, retrying in .*: queue is in state Stopped.*",
             # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
             ".*Error processing HTTP request: NotFound: Timeline .* was not found",
             ".*took more than expected to complete.*",
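
The retry delay above comes from `utils::backoff::exponential_backoff_duration_seconds`, whose body is outside this diff. As a rough illustration only, a helper with the same signature and the behaviour implied by the call sites (a 1.0s base increment, waits growing with `error_run_count`, capped at `MAX_BACKOFF_SECS` = 300s) might look like the sketch below; the exact growth formula is an assumption, not the library's actual code.

// Sketch of an exponential backoff helper matching the signature called above.
// ASSUMPTION: the real utils::backoff implementation may use a different growth
// formula; only the signature, the zero wait for the first attempt, and the cap
// are implied by the call sites in this patch.
fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    if n == 0 {
        // No errors seen yet in this run: retry without extra delay.
        0.0
    } else {
        // Double the wait for each consecutive error, capped at max_seconds.
        (base_increment * 2f64.powi(n as i32 - 1)).min(max_seconds)
    }
}

fn main() {
    // With a 1.0s increment and a 300s cap (MAX_BACKOFF_SECS in the patch),
    // consecutive failures wait 0s, 1s, 2s, 4s, ... and saturate at 300s.
    for n in [0u32, 1, 2, 3, 5, 10, 20] {
        let wait = exponential_backoff_duration_seconds(n, 1.0, 300.0);
        println!("error_run_count = {n:2} -> wait {wait:.0}s");
    }
}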