pageserver: exponential backoff on compaction/GC failures (#5672)

Previously, if the walredo process crashed, we would try to spawn a fresh one
every 2 seconds. That is expensive in itself, and it also results in a
high I/O load, because the part of the compaction that runs prior to the
failure is re-run every 2 seconds.

Closes: https://github.com/neondatabase/neon/issues/5671
This commit is contained in:
John Spray
2023-10-26 14:00:26 +01:00
committed by GitHub
parent 6129077d31
commit 8360307ea0
2 changed files with 36 additions and 10 deletions

View File

@@ -12,7 +12,7 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::completion;
use utils::{backoff, completion};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
@@ -139,7 +139,10 @@ pub fn start_background_loops(
/// Compaction task's main loop
///
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
const MAX_BACKOFF_SECS: f64 = 300.0;
// How many errors we have seen consecutively
let mut error_run_count = 0;
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -173,9 +176,19 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
} else {
// Run compaction
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
error!(
"Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
wait_duration
);
Duration::from_secs_f64(wait_duration)
} else {
error_run_count = 0;
period
}
};
@@ -210,7 +223,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
/// GC task's main loop
///
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
const MAX_BACKOFF_SECS: f64 = 300.0;
// How many errors we have seen consecutively
let mut error_run_count = 0;
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
// GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -249,9 +265,19 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
.await;
if let Err(e) = res {
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
error!(
"Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
wait_duration
);
Duration::from_secs_f64(wait_duration)
} else {
error_run_count = 0;
period
}
};

View File

@@ -1616,16 +1616,16 @@ class NeonPageserver(PgProtocol):
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition
".*compaction_loop.*Compaction failed, retrying in.*timeline or pageserver is shutting down", # When compaction checks timeline state after acquiring layer_removal_cs
".*compaction_loop.*Compaction failed.*, retrying in.*timeline or pageserver is shutting down", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
# this is until #3501
".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
".*Compaction failed.*, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
# these can happen anytime we do compactions from background task and shutdown pageserver
r".*ERROR.*ancestor timeline \S+ is being stopped",
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed, retrying in .*: queue is in state Stopped.*",
".*Compaction failed.*, retrying in .*: queue is in state Stopped.*",
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
".*Error processing HTTP request: NotFound: Timeline .* was not found",
".*took more than expected to complete.*",