pageserver: exponential backoff on compaction/GC failures (#5672)

Previously, if the walredo process crashed, we would try to spawn a fresh one every 2 seconds. That is expensive in itself, and it also causes a high I/O load, because the part of the compaction that runs before the failure is re-run every 2 seconds. Closes: https://github.com/neondatabase/neon/issues/5671
2026-06-02 13:00:37 +00:00 · 2023-10-26 14:00:26 +01:00
parent 6129077d31
commit 8360307ea0
2 changed files with 36 additions and 10 deletions
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -12,7 +12,7 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::completion;
+use utils::{backoff, completion};

 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
@@ -139,7 +139,10 @@ pub fn start_background_loops(
 /// Compaction task's main loop
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -173,9 +176,19 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            } else {
                // Run compaction
                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
-                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                } else {
+                    error_run_count = 0;
                    period
                }
            };
@@ -210,7 +223,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 /// GC task's main loop
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -249,9 +265,19 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
                    .await;
                if let Err(e) = res {
-                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                } else {
+                    error_run_count = 0;
                    period
                }
            };
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1616,16 +1616,16 @@ class NeonPageserver(PgProtocol):
            ".*wait for layer upload ops to complete.*",  # .*Caused by:.*wait_completion aborted because upload queue was stopped
            ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping",  # When gc checks timeline state after acquiring layer_removal_cs
            ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant",  # Tenant::gc precondition
-            ".*compaction_loop.*Compaction failed, retrying in.*timeline or pageserver is shutting down",  # When compaction checks timeline state after acquiring layer_removal_cs
+            ".*compaction_loop.*Compaction failed.*, retrying in.*timeline or pageserver is shutting down",  # When compaction checks timeline state after acquiring layer_removal_cs
            ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
            ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
            ".*task iteration took longer than the configured period.*",
            # this is until #3501
-            ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
+            ".*Compaction failed.*, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
            # these can happen anytime we do compactions from background task and shutdown pageserver
            r".*ERROR.*ancestor timeline \S+ is being stopped",
            # this is expected given our collaborative shutdown approach for the UploadQueue
-            ".*Compaction failed, retrying in .*: queue is in state Stopped.*",
+            ".*Compaction failed.*, retrying in .*: queue is in state Stopped.*",
            # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
            ".*Error processing HTTP request: NotFound: Timeline .* was not found",
            ".*took more than expected to complete.*",