storcon: fix split aborts removing other tenants (#11837)
## Problem

When aborting a shard split, the code accidentally removes every other tenant's shards with the same shard count as the aborted split from the in-memory map, causing "tenant not found" errors for those tenants. The map recovers on a storcon restart, when the persisted state is reloaded. This issue has been present for at least a year.

Resolves https://github.com/neondatabase/cloud/issues/28589.

## Summary of changes

Only remove shards belonging to the relevant tenant when aborting a split. Also add a regression test.
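The offending cleanup filtered the storage controller's in-memory shard map by shard count alone, so aborting one tenant's split could evict another tenant's shards. A minimal Python sketch of the before/after filter (the real code lives in the Rust storage controller; the map layout and names here are assumptions for illustration only):

```python
from typing import NamedTuple


class TenantShardId(NamedTuple):
    """Toy stand-in for storcon's shard key; the real type is Rust."""

    tenant_id: str
    shard_number: int
    shard_count: int


def abort_split_buggy(tenants: dict[TenantShardId, object], new_shard_count: int) -> None:
    # Bug: evicts every shard whose count matches the aborted split's target
    # count, regardless of which tenant it belongs to.
    for tsid in list(tenants):
        if tsid.shard_count == new_shard_count:
            del tenants[tsid]


def abort_split_fixed(
    tenants: dict[TenantShardId, object], tenant_id: str, new_shard_count: int
) -> None:
    # Fix: only evict the child shards of the tenant whose split is aborted.
    for tsid in list(tenants):
        if tsid.tenant_id == tenant_id and tsid.shard_count == new_shard_count:
            del tenants[tsid]
```

A 4-shard bystander present while another tenant's split to 4 shards is aborted would be removed by the buggy variant but left untouched by the fixed one. The regression test below (hunks from `test_sharding_split_failures`) exercises exactly this by creating bystander tenants before forcing splits to fail.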
```diff
@@ -1334,6 +1334,13 @@ def test_sharding_split_failures(
         tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}'
     )
 
+    # Create bystander tenants with various shard counts. They should not be affected by the aborted
+    # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589.
+    bystanders = {}  # id → shard_count
+    for bystander_shard_count in [1, 2, 4, 8]:
+        id, _ = env.create_tenant(shard_count=bystander_shard_count)
+        bystanders[id] = bystander_shard_count
+
     env.storage_controller.allowed_errors.extend(
         [
             # All split failures log a warning when they enqueue the abort operation
```
```diff
@@ -1394,6 +1401,8 @@ def test_sharding_split_failures(
         locations = ps.http_client().tenant_list_locations()["tenant_shards"]
         for loc in locations:
             tenant_shard_id = TenantShardId.parse(loc[0])
+            if tenant_shard_id.tenant_id != tenant_id:
+                continue  # skip bystanders
             log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
             assert tenant_shard_id.shard_count == initial_shard_count
             if loc[1]["mode"] == "Secondary":
```
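The new guard compares only the tenant-ID component of each parsed shard ID, so the per-pageserver assertions ignore bystander shards. A standalone sketch of that filter, assuming the usual `<tenant_id_hex>-<shard_number:02x><shard_count:02x>` string encoding of `TenantShardId` (an assumption; tests should rely on the harness parser):

```python
def parse_tenant_shard_id(s: str) -> tuple[str, int, int]:
    # Split "<tenant_hex>-<NNCC>" into (tenant_id, shard_number, shard_count).
    tenant_hex, suffix = s.rsplit("-", 1)
    return tenant_hex, int(suffix[:2], 16), int(suffix[2:], 16)


tenant_id = "3fa85f6457174562b3fc2c963f66afa6"  # tenant under test (hypothetical)
seen = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-0102"  # shard 1 of 2, some bystander

observed_tenant, shard_number, shard_count = parse_tenant_shard_id(seen)
if observed_tenant != tenant_id:
    print("bystander shard, skipping")  # mirrors the `continue` in the hunk above
```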
```diff
@@ -1414,6 +1423,8 @@ def test_sharding_split_failures(
         locations = ps.http_client().tenant_list_locations()["tenant_shards"]
         for loc in locations:
             tenant_shard_id = TenantShardId.parse(loc[0])
+            if tenant_shard_id.tenant_id != tenant_id:
+                continue  # skip bystanders
             log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
             assert tenant_shard_id.shard_count == split_shard_count
             if loc[1]["mode"] == "Secondary":
```
```diff
@@ -1496,6 +1507,12 @@ def test_sharding_split_failures(
     # the scheduler reaches an idle state
     env.storage_controller.reconcile_until_idle(timeout_secs=30)
 
+    # Check that all bystanders are still around.
+    for bystander_id, bystander_shard_count in bystanders.items():
+        response = env.storage_controller.tenant_describe(bystander_id)
+        assert TenantId(response["tenant_id"]) == bystander_id
+        assert len(response["shards"]) == bystander_shard_count
+
     env.storage_controller.consistency_check()
```
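The survival check reads only two fields of the describe response, `tenant_id` and `shards`. A sketch of the shape those assertions imply, with hypothetical data (the real `tenant_describe` response carries more fields, and the entry layout under `shards` is assumed here):

```python
from uuid import uuid4

# Hypothetical 4-shard bystander; only the two fields the test reads are shown.
bystander_id = uuid4().hex
bystander_shard_count = 4
response = {
    "tenant_id": bystander_id,
    "shards": [{"tenant_shard_id": f"{bystander_id}-{n:02x}04"} for n in range(4)],
}

# Mirrors the two assertions in the test: identity and shard-count survival.
assert response["tenant_id"] == bystander_id
assert len(response["shards"]) == bystander_shard_count
```

Before the fix, a bystander with the same shard count as the aborted split's target would have vanished from the in-memory map, so this describe call would have failed with "tenant not found" until a storcon restart reloaded the persisted state.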