mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 09:00:37 +00:00
pageserver: circuit breaker on compaction (#8359)
## Problem

We already back off on compaction retries, but the impact of a failing compaction can be so great that backing off up to 300s isn't enough. The impact is consuming a lot of I/O+CPU in the case of image layer generation for large tenants, and potentially also leaking disk space.

Compaction failures are extremely rare and almost always indicate a bug, frequently a bug that will not let compaction proceed until it is fixed.

Related: https://github.com/neondatabase/neon/issues/6738

## Summary of changes

- Introduce a CircuitBreaker type.
- Add a circuit breaker for compaction, with a policy that after 5 failures, compaction will not be attempted again for 24 hours.
- Add metrics that we can alert on: any >0 value for `pageserver_circuit_breaker_broken_total` should generate an alert.
- Add a test that checks this works as intended.

A couple of notes to reviewers:

- Circuit breakers are intrinsically a defense-in-depth measure: this is not the solution to any underlying issues, it is just a general mitigation for "unknown unknowns" that might be encountered in future.
- This PR isn't primarily about writing a perfect CircuitBreaker type: the one in this PR is meant to be just enough to mitigate issues in compaction, and make it easy to monitor/alert on these failures. We can refine this type in future as/when we want to use it elsewhere.
This commit is contained in:
@@ -1,12 +1,14 @@
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.workload import Workload
|
||||
|
||||
AGGRESIVE_COMPACTION_TENANT_CONF = {
|
||||
@@ -257,3 +259,64 @@ def test_uploads_and_deletions(
|
||||
found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors)
|
||||
if not found_allowed_error:
|
||||
raise Exception("None of the allowed_errors occured in the log")
|
||||
|
||||
|
||||
def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder):
    """
    Verify that repeated compaction failures trip the circuit breaker, and
    that the breaker stays tripped afterwards.

    A failpoint is armed so that compaction cannot succeed, data is written
    to trigger compaction, and the test then waits for the "broken" log line
    and metric to appear.  After a further pause it re-checks the metrics and
    the log to confirm the breaker has not been reset.
    """
    failpoint_name = "delta-layer-writer-fail-before-finish"
    broken_log_pattern = ".*Circuit breaker broken!.*"
    broken_metric = "pageserver_circuit_breaker_broken_total"
    unbroken_metric = "pageserver_circuit_breaker_unbroken_total"

    tenant_conf = {
        # Run compaction very often so failures accumulate quickly
        "compaction_period": "100ms",
        # A small checkpoint distance yields many layers
        "checkpoint_distance": 1024 * 128,
        # Make small layers eligible for compaction
        "compaction_target_size": 1024 * 128,
        "image_creation_threshold": 1,
    }

    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)

    workload = Workload(env, env.initial_tenant, env.initial_timeline)
    workload.init()

    # Arm the failpoint so that compaction cannot complete successfully
    env.pageserver.http_client().configure_failpoints((failpoint_name, "return"))

    # Three rounds of writes to give compaction something to do
    for _ in range(3):
        workload.write_rows(1024, upload=False)

    def metric_or_zero(metric_name):
        # A metric that has not been reported yet is treated as zero
        reported = env.pageserver.http_client().get_metric_value(metric_name)
        return reported or 0

    def assert_metrics_show_broken():
        # Exactly one break recorded, and no recovery recorded
        assert metric_or_zero(broken_metric) == 1
        assert metric_or_zero(unbroken_metric) == 0

    def assert_broken():
        env.pageserver.assert_log_contains(broken_log_pattern)
        assert_metrics_show_broken()

    # Compaction failures are retried with backoff, so 5 retries take ~30s;
    # hence the generous wait before giving up on the breaker tripping.
    wait_until(60, 1, assert_broken)

    # Pause for a while; compaction is expected _not_ to be retried meanwhile
    time.sleep(10)

    assert_metrics_show_broken()
    assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")
|
||||
|
||||
Reference in New Issue
Block a user