From e6a404c66d18fc966d93e3f3284fb587d29c5e68 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Wed, 18 Jun 2025 17:27:41 +0400 Subject: [PATCH] Fix flaky test_sharding_split_failures (#12199) ## Problem `test_sharding_failures` is flaky due to interference from the `background_reconcile` process. The details are in the issue https://github.com/neondatabase/neon/issues/12029. ## Summary of changes - Use `reconcile_until_idle` to ensure a stable state before running test assertions - Added error tolerance in `reconcile_until_idle` test function (Failure cases: 1, 3, 19, 20) - Ignore the `Keeping extra secondaries` warning message since it i retryable (Failure case: 2) - Deduplicated code in `assert_rolled_back` and `assert_split_done` - Added a log message before printing plenty of Node `X` seen on pageserver `Y` --- storage_controller/src/service.rs | 23 +++++---- test_runner/fixtures/neon_fixtures.py | 1 + test_runner/regress/test_sharding.py | 69 +++++++++++---------------- 3 files changed, 45 insertions(+), 48 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 55f91cea09..14c81ccf59 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8778,15 +8778,22 @@ impl Service { let waiter_count = waiters.len(); match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { Ok(()) => {} - Err(ReconcileWaitError::Failed(_, reconcile_error)) - if matches!(*reconcile_error, ReconcileError::Cancel) => - { - // Ignore reconciler cancel errors: this reconciler might have shut down - // because some other change superceded it. We will return a nonzero number, - // so the caller knows they might have to call again to quiesce the system. - } Err(e) => { - return Err(e); + if let ReconcileWaitError::Failed(_, reconcile_error) = &e { + match **reconcile_error { + ReconcileError::Cancel + | ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superceded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. + } + _ => { + return Err(e); + } + } + } else { + return Err(e); + } } }; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 970175a631..236c03e361 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2360,6 +2360,7 @@ class NeonStorageController(MetricsGetter, LogUtils): delay_max = max_interval while n > 0: n = self.reconcile_all() + if n == 0: break elif time.time() - start_at > timeout_secs: diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 522e257ea5..93c621f564 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1337,7 +1337,7 @@ def test_sharding_split_failures( # Create bystander tenants with various shard counts. They should not be affected by the aborted # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589. bystanders = {} # id → shard_count - for bystander_shard_count in [1, 2, 4, 8]: + for bystander_shard_count in [1, 2, 4]: id, _ = env.create_tenant(shard_count=bystander_shard_count) bystanders[id] = bystander_shard_count @@ -1358,6 +1358,8 @@ def test_sharding_split_failures( ".*Reconcile error.*Cancelled.*", # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", + # We didn't identify a secondary to remove. + ".*Keeping extra secondaries.*", ] ) @@ -1388,51 +1390,36 @@ def test_sharding_split_failures( with pytest.raises(failure.expect_exception()): env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None: + secondary_count = 0 + attached_count = 0 + log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count") + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") + assert tenant_shard_id.shard_count == shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + assert secondary_count == shard_count + assert attached_count == shard_count + # We expect that the overall operation will fail, but some split requests # will have succeeded: the net result should be to return to a clean state, including # detaching any child shards. def assert_rolled_back(exclude_ps_id=None) -> None: - secondary_count = 0 - attached_count = 0 - for ps in env.pageservers: - if exclude_ps_id is not None and ps.id == exclude_ps_id: - continue - - locations = ps.http_client().tenant_list_locations()["tenant_shards"] - for loc in locations: - tenant_shard_id = TenantShardId.parse(loc[0]) - if tenant_shard_id.tenant_id != tenant_id: - continue # skip bystanders - log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") - assert tenant_shard_id.shard_count == initial_shard_count - if loc[1]["mode"] == "Secondary": - secondary_count += 1 - else: - attached_count += 1 - - assert secondary_count == initial_shard_count - assert attached_count == initial_shard_count + assert_shard_count(initial_shard_count, exclude_ps_id) def assert_split_done(exclude_ps_id: int | None = None) -> None: - secondary_count = 0 - attached_count = 0 - for ps in env.pageservers: - if exclude_ps_id is not None and ps.id == exclude_ps_id: - continue - - locations = ps.http_client().tenant_list_locations()["tenant_shards"] - for loc in locations: - tenant_shard_id = TenantShardId.parse(loc[0]) - if tenant_shard_id.tenant_id != tenant_id: - continue # skip bystanders - log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") - assert tenant_shard_id.shard_count == split_shard_count - if loc[1]["mode"] == "Secondary": - secondary_count += 1 - else: - attached_count += 1 - assert attached_count == split_shard_count - assert secondary_count == split_shard_count + assert_shard_count(split_shard_count, exclude_ps_id) def finish_split(): # Having failed+rolled back, we should be able to split again @@ -1468,6 +1455,7 @@ def test_sharding_split_failures( # The split should appear to be rolled back from the point of view of all pageservers # apart from the one that is offline + env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2) wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) finish_split() @@ -1482,6 +1470,7 @@ def test_sharding_split_failures( log.info("Clearing failure...") failure.clear(env) + env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2) wait_until(assert_rolled_back) # Having rolled back, the tenant should be working