mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 04:52:55 +00:00
storcon: Ignore stuck reconciles when considering optimizations (#12589)
## Problem

The `keep_failing_reconciles` counter was introduced in #12391, but there is a special case:

> if a reconciliation loop claims to have succeeded, but maybe_reconcile still thinks the tenant is in need of reconciliation, then that's a probable bug and we should activate a similar backoff to prevent flapping.

This PR redefines "flapping" to include not just repeated failures, but also consecutive reconciliations of any kind (success or failure).

## Summary of Changes

- Replace `keep_failing_reconciles` with a new `stuck_reconciles` metric
- Replace `MAX_CONSECUTIVE_RECONCILIATION_ERRORS` with `MAX_CONSECUTIVE_RECONCILES`, and increase it from 5 to 10
- Increment the consecutive reconciles counter for all reconciles, not just failures
- Reset the counter in `reconcile_all` when no reconcile is needed for a shard
- Improve and fix the related test

---------

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
This commit is contained in:
committed by
GitHub
parent
8862e7c4bf
commit
f0c0733a64
@@ -996,7 +996,7 @@ def test_storage_controller_compute_hook_retry(
|
||||
|
||||
|
||||
@run_only_on_default_postgres("postgres behavior is not relevant")
|
||||
def test_storage_controller_compute_hook_keep_failing(
|
||||
def test_storage_controller_compute_hook_stuck_reconciles(
|
||||
httpserver: HTTPServer,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
httpserver_listen_address: ListenAddress,
|
||||
@@ -1046,7 +1046,7 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
|
||||
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
|
||||
env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
|
||||
env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
|
||||
env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*")
|
||||
env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
|
||||
|
||||
# Migrate all allowed tenant shards to the first alive pageserver
|
||||
@@ -1061,7 +1061,7 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
|
||||
# Make some reconcile_all calls to trigger optimizations
|
||||
# RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILES
|
||||
RECONCILE_COUNT = 12
|
||||
RECONCILE_COUNT = 20
|
||||
for i in range(RECONCILE_COUNT):
|
||||
try:
|
||||
n = env.storage_controller.reconcile_all()
|
||||
@@ -1074,6 +1074,8 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
|
||||
time.sleep(2)
|
||||
|
||||
env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*")
|
||||
|
||||
# Check that the allowed tenant shards are optimized due to affinity rules
|
||||
locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
|
||||
not_optimized_shard_count = 0
|
||||
|
||||
Reference in New Issue
Block a user