From e6a404c66d18fc966d93e3f3284fb587d29c5e68 Mon Sep 17 00:00:00 2001
From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com>
Date: Wed, 18 Jun 2025 17:27:41 +0400
Subject: [PATCH] Fix flaky test_sharding_split_failures (#12199)

## Problem

`test_sharding_failures` is flaky due to interference from the
`background_reconcile` process.

The details are in the issue
https://github.com/neondatabase/neon/issues/12029.

## Summary of changes

- Use `reconcile_until_idle` to ensure a stable state before running
test assertions
- Added error tolerance in `reconcile_until_idle` test function (Failure
cases: 1, 3, 19, 20)
- Ignore the `Keeping extra secondaries` warning message since it i
retryable (Failure case: 2)
- Deduplicated code in `assert_rolled_back` and `assert_split_done`
- Added a log message before printing plenty of Node `X` seen on
pageserver `Y`
---
 storage_controller/src/service.rs     | 23 +++++----
 test_runner/fixtures/neon_fixtures.py |  1 +
 test_runner/regress/test_sharding.py  | 69 +++++++++++----------------
 3 files changed, 45 insertions(+), 48 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 55f91cea09..14c81ccf59 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -8778,15 +8778,22 @@ impl Service {
         let waiter_count = waiters.len();
         match self.await_waiters(waiters, RECONCILE_TIMEOUT).await {
             Ok(()) => {}
-            Err(ReconcileWaitError::Failed(_, reconcile_error))
-                if matches!(*reconcile_error, ReconcileError::Cancel) =>
-            {
-                // Ignore reconciler cancel errors: this reconciler might have shut down
-                // because some other change superceded it.  We will return a nonzero number,
-                // so the caller knows they might have to call again to quiesce the system.
-            }
             Err(e) => {
-                return Err(e);
+                if let ReconcileWaitError::Failed(_, reconcile_error) = &e {
+                    match **reconcile_error {
+                        ReconcileError::Cancel
+                        | ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
+                            // Ignore reconciler cancel errors: this reconciler might have shut down
+                            // because some other change superceded it.  We will return a nonzero number,
+                            // so the caller knows they might have to call again to quiesce the system.
+                        }
+                        _ => {
+                            return Err(e);
+                        }
+                    }
+                } else {
+                    return Err(e);
+                }
             }
         };
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 970175a631..236c03e361 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2360,6 +2360,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         delay_max = max_interval
         while n > 0:
             n = self.reconcile_all()
+
             if n == 0:
                 break
             elif time.time() - start_at > timeout_secs:
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 522e257ea5..93c621f564 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1337,7 +1337,7 @@ def test_sharding_split_failures(
     # Create bystander tenants with various shard counts. They should not be affected by the aborted
     # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589.
     bystanders = {}  # id → shard_count
-    for bystander_shard_count in [1, 2, 4, 8]:
+    for bystander_shard_count in [1, 2, 4]:
         id, _ = env.create_tenant(shard_count=bystander_shard_count)
         bystanders[id] = bystander_shard_count
 
@@ -1358,6 +1358,8 @@ def test_sharding_split_failures(
             ".*Reconcile error.*Cancelled.*",
             # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
             ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
+            # We didn't identify a secondary to remove.
+            ".*Keeping extra secondaries.*",
         ]
     )
 
@@ -1388,51 +1390,36 @@ def test_sharding_split_failures(
     with pytest.raises(failure.expect_exception()):
         env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
 
+    def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None:
+        secondary_count = 0
+        attached_count = 0
+        log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count")
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                if tenant_shard_id.tenant_id != tenant_id:
+                    continue  # skip bystanders
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
+                assert tenant_shard_id.shard_count == shard_count
+                if loc[1]["mode"] == "Secondary":
+                    secondary_count += 1
+                else:
+                    attached_count += 1
+        assert secondary_count == shard_count
+        assert attached_count == shard_count
+
     # We expect that the overall operation will fail, but some split requests
     # will have succeeded: the net result should be to return to a clean state, including
     # detaching any child shards.
     def assert_rolled_back(exclude_ps_id=None) -> None:
-        secondary_count = 0
-        attached_count = 0
-        for ps in env.pageservers:
-            if exclude_ps_id is not None and ps.id == exclude_ps_id:
-                continue
-
-            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
-            for loc in locations:
-                tenant_shard_id = TenantShardId.parse(loc[0])
-                if tenant_shard_id.tenant_id != tenant_id:
-                    continue  # skip bystanders
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
-                assert tenant_shard_id.shard_count == initial_shard_count
-                if loc[1]["mode"] == "Secondary":
-                    secondary_count += 1
-                else:
-                    attached_count += 1
-
-        assert secondary_count == initial_shard_count
-        assert attached_count == initial_shard_count
+        assert_shard_count(initial_shard_count, exclude_ps_id)
 
     def assert_split_done(exclude_ps_id: int | None = None) -> None:
-        secondary_count = 0
-        attached_count = 0
-        for ps in env.pageservers:
-            if exclude_ps_id is not None and ps.id == exclude_ps_id:
-                continue
-
-            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
-            for loc in locations:
-                tenant_shard_id = TenantShardId.parse(loc[0])
-                if tenant_shard_id.tenant_id != tenant_id:
-                    continue  # skip bystanders
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
-                assert tenant_shard_id.shard_count == split_shard_count
-                if loc[1]["mode"] == "Secondary":
-                    secondary_count += 1
-                else:
-                    attached_count += 1
-        assert attached_count == split_shard_count
-        assert secondary_count == split_shard_count
+        assert_shard_count(split_shard_count, exclude_ps_id)
 
     def finish_split():
         # Having failed+rolled back, we should be able to split again
@@ -1468,6 +1455,7 @@ def test_sharding_split_failures(
 
         # The split should appear to be rolled back from the point of view of all pageservers
         # apart from the one that is offline
+        env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
         wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
 
         finish_split()
@@ -1482,6 +1470,7 @@ def test_sharding_split_failures(
         log.info("Clearing failure...")
         failure.clear(env)
 
+        env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
         wait_until(assert_rolled_back)
 
         # Having rolled back, the tenant should be working