Fix keep-failing reconciles test & add logs (#12497)

## Problem

The test is flaky due to the following warning in the logs:

```
Keeping extra secondaries: can't determine which of [NodeId(1), NodeId(2)] to remove (some nodes offline?)
```

Some nodes being offline is expected behavior in this test.

## Summary of changes

- Added `Keeping extra secondaries` to the storage controller's list of allowed errors (see the sketch after this list)
- Improved logging for a better debugging experience
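
For context, `allowed_errors` is a list of regex patterns that the test harness checks captured log lines against; a warning or error that matches no pattern fails the test, which is why the expected `Keeping extra secondaries` warning has to be listed explicitly. Below is a minimal sketch of that idea, assuming a simplified log format and a hypothetical helper name — an illustration of the mechanism, not the actual `neon_fixtures` implementation:

```python
import re

# Hypothetical helper illustrating the allowed-errors idea: every WARN/ERROR
# line must match at least one allowed regex, otherwise the test fails.
def assert_no_unexpected_errors(log_lines: list[str], allowed_errors: list[str]) -> None:
    patterns = [re.compile(p) for p in allowed_errors]
    unexpected = [
        line
        for line in log_lines
        if ("WARN" in line or "ERROR" in line)
        and not any(p.search(line) for p in patterns)
    ]
    assert not unexpected, f"Unexpected errors in log: {unexpected}"

# With ".*Keeping extra secondaries.*" allowed, the warning seen in this
# test no longer counts as unexpected:
allowed = [".*Keeping extra secondaries.*", ".*Shard reconciliation is keep-failing.*"]
logs = [
    "WARN Keeping extra secondaries: can't determine which of [NodeId(1), NodeId(2)] to remove (some nodes offline?)",
]
assert_no_unexpected_errors(logs, allowed)
```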

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>

```
@@ -1034,16 +1034,19 @@ def test_storage_controller_compute_hook_keep_failing(
    alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id]
    # Stop pageserver and ban tenant to trigger failed reconciliation
    log.info(f"Banning tenant {banned_tenant} and stopping pageserver {banned_tenant_ps.id}")
    status_by_tenant[banned_tenant] = 423
    banned_tenant_ps.stop()
    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
    env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
    env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
    env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
    # Migrate all allowed tenant shards to the first alive pageserver
    # to trigger storage controller optimizations due to affinity rules
    for shard_number in range(shard_count):
        log.info(f"Migrating shard {shard_number} of {allowed_tenant} to {alive_pageservers[0].id}")
        env.storage_controller.tenant_shard_migrate(
            TenantShardId(allowed_tenant, shard_number, shard_count),
            alive_pageservers[0].id,
```