Fix flaky test_sharding_split_failures (#12199)

## Problem `test_sharding_failures` is flaky due to interference from the `background_reconcile` process. The details are in the issue https://github.com/neondatabase/neon/issues/12029. ## Summary of changes - Use `reconcile_until_idle` to ensure a stable state before running test assertions - Added error tolerance in `reconcile_until_idle` test function (Failure cases: 1, 3, 19, 20) - Ignore the `Keeping extra secondaries` warning message since it i retryable (Failure case: 2) - Deduplicated code in `assert_rolled_back` and `assert_split_done` - Added a log message before printing plenty of Node `X` seen on pageserver `Y`
2026-05-23 16:10:37 +00:00 · 2025-06-18 17:27:41 +04:00
parent 7e711ede44
commit e6a404c66d
3 changed files with 45 additions and 48 deletions
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -8778,15 +8778,22 @@ impl Service {
        let waiter_count = waiters.len();
        match self.await_waiters(waiters, RECONCILE_TIMEOUT).await {
            Ok(()) => {}
-            Err(ReconcileWaitError::Failed(_, reconcile_error))
-                if matches!(*reconcile_error, ReconcileError::Cancel) =>
-            {
-                // Ignore reconciler cancel errors: this reconciler might have shut down
-                // because some other change superceded it.  We will return a nonzero number,
-                // so the caller knows they might have to call again to quiesce the system.
-            }
            Err(e) => {
-                return Err(e);
+                if let ReconcileWaitError::Failed(_, reconcile_error) = &e {
+                    match **reconcile_error {
+                        ReconcileError::Cancel
+                        | ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
+                            // Ignore reconciler cancel errors: this reconciler might have shut down
+                            // because some other change superceded it.  We will return a nonzero number,
+                            // so the caller knows they might have to call again to quiesce the system.
+                        }
+                        _ => {
+                            return Err(e);
+                        }
+                    }
+                } else {
+                    return Err(e);
+                }
            }
        };

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2360,6 +2360,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        delay_max = max_interval
        while n > 0:
            n = self.reconcile_all()
+
            if n == 0:
                break
            elif time.time() - start_at > timeout_secs:
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1337,7 +1337,7 @@ def test_sharding_split_failures(
    # Create bystander tenants with various shard counts. They should not be affected by the aborted
    # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589.
    bystanders = {}  # id → shard_count
-    for bystander_shard_count in [1, 2, 4, 8]:
+    for bystander_shard_count in [1, 2, 4]:
        id, _ = env.create_tenant(shard_count=bystander_shard_count)
        bystanders[id] = bystander_shard_count

@@ -1358,6 +1358,8 @@ def test_sharding_split_failures(
            ".*Reconcile error.*Cancelled.*",
            # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
            ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
+            # We didn't identify a secondary to remove.
+            ".*Keeping extra secondaries.*",
        ]
    )

@@ -1388,51 +1390,36 @@ def test_sharding_split_failures(
    with pytest.raises(failure.expect_exception()):
        env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)

+    def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None:
+        secondary_count = 0
+        attached_count = 0
+        log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count")
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                if tenant_shard_id.tenant_id != tenant_id:
+                    continue  # skip bystanders
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
+                assert tenant_shard_id.shard_count == shard_count
+                if loc[1]["mode"] == "Secondary":
+                    secondary_count += 1
+                else:
+                    attached_count += 1
+        assert secondary_count == shard_count
+        assert attached_count == shard_count
+
    # We expect that the overall operation will fail, but some split requests
    # will have succeeded: the net result should be to return to a clean state, including
    # detaching any child shards.
    def assert_rolled_back(exclude_ps_id=None) -> None:
-        secondary_count = 0
-        attached_count = 0
-        for ps in env.pageservers:
-            if exclude_ps_id is not None and ps.id == exclude_ps_id:
-                continue
-
-            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
-            for loc in locations:
-                tenant_shard_id = TenantShardId.parse(loc[0])
-                if tenant_shard_id.tenant_id != tenant_id:
-                    continue  # skip bystanders
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
-                assert tenant_shard_id.shard_count == initial_shard_count
-                if loc[1]["mode"] == "Secondary":
-                    secondary_count += 1
-                else:
-                    attached_count += 1
-
-        assert secondary_count == initial_shard_count
-        assert attached_count == initial_shard_count
+        assert_shard_count(initial_shard_count, exclude_ps_id)

    def assert_split_done(exclude_ps_id: int | None = None) -> None:
-        secondary_count = 0
-        attached_count = 0
-        for ps in env.pageservers:
-            if exclude_ps_id is not None and ps.id == exclude_ps_id:
-                continue
-
-            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
-            for loc in locations:
-                tenant_shard_id = TenantShardId.parse(loc[0])
-                if tenant_shard_id.tenant_id != tenant_id:
-                    continue  # skip bystanders
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
-                assert tenant_shard_id.shard_count == split_shard_count
-                if loc[1]["mode"] == "Secondary":
-                    secondary_count += 1
-                else:
-                    attached_count += 1
-        assert attached_count == split_shard_count
-        assert secondary_count == split_shard_count
+        assert_shard_count(split_shard_count, exclude_ps_id)

    def finish_split():
        # Having failed+rolled back, we should be able to split again
@@ -1468,6 +1455,7 @@ def test_sharding_split_failures(

        # The split should appear to be rolled back from the point of view of all pageservers
        # apart from the one that is offline
+        env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
        wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))

        finish_split()
@@ -1482,6 +1470,7 @@ def test_sharding_split_failures(
        log.info("Clearing failure...")
        failure.clear(env)

+        env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
        wait_until(assert_rolled_back)

        # Having rolled back, the tenant should be working