mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-23 16:10:37 +00:00
Fix flaky test_sharding_split_failures (#12199)
## Problem `test_sharding_failures` is flaky due to interference from the `background_reconcile` process. The details are in the issue https://github.com/neondatabase/neon/issues/12029. ## Summary of changes - Use `reconcile_until_idle` to ensure a stable state before running test assertions - Added error tolerance in `reconcile_until_idle` test function (Failure cases: 1, 3, 19, 20) - Ignore the `Keeping extra secondaries` warning message since it i retryable (Failure case: 2) - Deduplicated code in `assert_rolled_back` and `assert_split_done` - Added a log message before printing plenty of Node `X` seen on pageserver `Y`
This commit is contained in:
committed by
GitHub
parent
7e711ede44
commit
e6a404c66d
@@ -8778,15 +8778,22 @@ impl Service {
|
||||
let waiter_count = waiters.len();
|
||||
match self.await_waiters(waiters, RECONCILE_TIMEOUT).await {
|
||||
Ok(()) => {}
|
||||
Err(ReconcileWaitError::Failed(_, reconcile_error))
|
||||
if matches!(*reconcile_error, ReconcileError::Cancel) =>
|
||||
{
|
||||
// Ignore reconciler cancel errors: this reconciler might have shut down
|
||||
// because some other change superceded it. We will return a nonzero number,
|
||||
// so the caller knows they might have to call again to quiesce the system.
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
if let ReconcileWaitError::Failed(_, reconcile_error) = &e {
|
||||
match **reconcile_error {
|
||||
ReconcileError::Cancel
|
||||
| ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
|
||||
// Ignore reconciler cancel errors: this reconciler might have shut down
|
||||
// because some other change superceded it. We will return a nonzero number,
|
||||
// so the caller knows they might have to call again to quiesce the system.
|
||||
}
|
||||
_ => {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -2360,6 +2360,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
delay_max = max_interval
|
||||
while n > 0:
|
||||
n = self.reconcile_all()
|
||||
|
||||
if n == 0:
|
||||
break
|
||||
elif time.time() - start_at > timeout_secs:
|
||||
|
||||
@@ -1337,7 +1337,7 @@ def test_sharding_split_failures(
|
||||
# Create bystander tenants with various shard counts. They should not be affected by the aborted
|
||||
# splits. Regression test for https://github.com/neondatabase/cloud/issues/28589.
|
||||
bystanders = {} # id → shard_count
|
||||
for bystander_shard_count in [1, 2, 4, 8]:
|
||||
for bystander_shard_count in [1, 2, 4]:
|
||||
id, _ = env.create_tenant(shard_count=bystander_shard_count)
|
||||
bystanders[id] = bystander_shard_count
|
||||
|
||||
@@ -1358,6 +1358,8 @@ def test_sharding_split_failures(
|
||||
".*Reconcile error.*Cancelled.*",
|
||||
# While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
|
||||
".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
|
||||
# We didn't identify a secondary to remove.
|
||||
".*Keeping extra secondaries.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -1388,51 +1390,36 @@ def test_sharding_split_failures(
|
||||
with pytest.raises(failure.expect_exception()):
|
||||
env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
|
||||
|
||||
def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None:
|
||||
secondary_count = 0
|
||||
attached_count = 0
|
||||
log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count")
|
||||
for ps in env.pageservers:
|
||||
if exclude_ps_id is not None and ps.id == exclude_ps_id:
|
||||
continue
|
||||
|
||||
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
for loc in locations:
|
||||
tenant_shard_id = TenantShardId.parse(loc[0])
|
||||
if tenant_shard_id.tenant_id != tenant_id:
|
||||
continue # skip bystanders
|
||||
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
|
||||
assert tenant_shard_id.shard_count == shard_count
|
||||
if loc[1]["mode"] == "Secondary":
|
||||
secondary_count += 1
|
||||
else:
|
||||
attached_count += 1
|
||||
assert secondary_count == shard_count
|
||||
assert attached_count == shard_count
|
||||
|
||||
# We expect that the overall operation will fail, but some split requests
|
||||
# will have succeeded: the net result should be to return to a clean state, including
|
||||
# detaching any child shards.
|
||||
def assert_rolled_back(exclude_ps_id=None) -> None:
|
||||
secondary_count = 0
|
||||
attached_count = 0
|
||||
for ps in env.pageservers:
|
||||
if exclude_ps_id is not None and ps.id == exclude_ps_id:
|
||||
continue
|
||||
|
||||
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
for loc in locations:
|
||||
tenant_shard_id = TenantShardId.parse(loc[0])
|
||||
if tenant_shard_id.tenant_id != tenant_id:
|
||||
continue # skip bystanders
|
||||
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
|
||||
assert tenant_shard_id.shard_count == initial_shard_count
|
||||
if loc[1]["mode"] == "Secondary":
|
||||
secondary_count += 1
|
||||
else:
|
||||
attached_count += 1
|
||||
|
||||
assert secondary_count == initial_shard_count
|
||||
assert attached_count == initial_shard_count
|
||||
assert_shard_count(initial_shard_count, exclude_ps_id)
|
||||
|
||||
def assert_split_done(exclude_ps_id: int | None = None) -> None:
|
||||
secondary_count = 0
|
||||
attached_count = 0
|
||||
for ps in env.pageservers:
|
||||
if exclude_ps_id is not None and ps.id == exclude_ps_id:
|
||||
continue
|
||||
|
||||
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
for loc in locations:
|
||||
tenant_shard_id = TenantShardId.parse(loc[0])
|
||||
if tenant_shard_id.tenant_id != tenant_id:
|
||||
continue # skip bystanders
|
||||
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
|
||||
assert tenant_shard_id.shard_count == split_shard_count
|
||||
if loc[1]["mode"] == "Secondary":
|
||||
secondary_count += 1
|
||||
else:
|
||||
attached_count += 1
|
||||
assert attached_count == split_shard_count
|
||||
assert secondary_count == split_shard_count
|
||||
assert_shard_count(split_shard_count, exclude_ps_id)
|
||||
|
||||
def finish_split():
|
||||
# Having failed+rolled back, we should be able to split again
|
||||
@@ -1468,6 +1455,7 @@ def test_sharding_split_failures(
|
||||
|
||||
# The split should appear to be rolled back from the point of view of all pageservers
|
||||
# apart from the one that is offline
|
||||
env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
|
||||
wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
|
||||
|
||||
finish_split()
|
||||
@@ -1482,6 +1470,7 @@ def test_sharding_split_failures(
|
||||
log.info("Clearing failure...")
|
||||
failure.clear(env)
|
||||
|
||||
env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
|
||||
wait_until(assert_rolled_back)
|
||||
|
||||
# Having rolled back, the tenant should be working
|
||||
|
||||
Reference in New Issue
Block a user