Fix keep-failing reconciles test & add logs (#12497)

## Problem

The test is flaky due to the following warning in the logs:

```
Keeping extra secondaries: can't determine which of [NodeId(1), NodeId(2)] to remove (some nodes offline?)
```

Some nodes being offline is expected behavior in this test.

## Summary of changes

- Added `Keeping extra secondaries` to the storage controller's list of allowed errors (see the sketch after this list)
- Improved logging for a better debugging experience
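
For context, `allowed_errors` is a list of regex patterns that the test harness checks captured log lines against; a warning or error that matches no pattern fails the test, which is why the expected `Keeping extra secondaries` warning has to be listed explicitly. Below is a minimal sketch of that idea, assuming a simplified log format and a hypothetical helper name — an illustration of the mechanism, not the actual `neon_fixtures` implementation:

```python
import re

# Hypothetical helper illustrating the allowed-errors idea: every WARN/ERROR
# line must match at least one allowed regex, otherwise the test fails.
def assert_no_unexpected_errors(log_lines: list[str], allowed_errors: list[str]) -> None:
    patterns = [re.compile(p) for p in allowed_errors]
    unexpected = [
        line
        for line in log_lines
        if ("WARN" in line or "ERROR" in line)
        and not any(p.search(line) for p in patterns)
    ]
    assert not unexpected, f"Unexpected errors in log: {unexpected}"

# With ".*Keeping extra secondaries.*" allowed, the warning seen in this
# test no longer counts as unexpected:
allowed = [".*Keeping extra secondaries.*", ".*Shard reconciliation is keep-failing.*"]
logs = [
    "WARN Keeping extra secondaries: can't determine which of [NodeId(1), NodeId(2)] to remove (some nodes offline?)",
]
assert_no_unexpected_errors(logs, allowed)
```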

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>

```
@@ -1034,16 +1034,19 @@ def test_storage_controller_compute_hook_keep_failing(
    alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id]
    # Stop pageserver and ban tenant to trigger failed reconciliation
    log.info(f"Banning tenant {banned_tenant} and stopping pageserver {banned_tenant_ps.id}")
    status_by_tenant[banned_tenant] = 423
    banned_tenant_ps.stop()
    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
    env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
    env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
    env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
    # Migrate all allowed tenant shards to the first alive pageserver
    # to trigger storage controller optimizations due to affinity rules
    for shard_number in range(shard_count):
        log.info(f"Migrating shard {shard_number} of {allowed_tenant} to {alive_pageservers[0].id}")
        env.storage_controller.tenant_shard_migrate(
            TenantShardId(allowed_tenant, shard_number, shard_count),
            alive_pageservers[0].id,
```