From 0e0ad073bf609fbc38e86f4030f1902c2632c5f7 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Tue, 6 May 2025 15:57:34 +0200
Subject: [PATCH] storcon: fix split aborts removing other tenants (#11837)

## Problem

When aborting a split, the code accidentally removes all other tenant shards
from the in-memory map that have the same shard count as the aborted split,
causing "tenant not found" errors. It will recover on a storcon restart, when
it loads the persisted state.

This issue has been present for at least a year.

Resolves https://github.com/neondatabase/cloud/issues/28589.

## Summary of changes

Only remove shards belonging to the relevant tenant when aborting a split.
Also adds a regression test.
---
 storage_controller/src/service.rs    |  3 ++-
 test_runner/regress/test_sharding.py | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 72379f0810..21c693af97 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5181,7 +5181,8 @@ impl Service {
             }
 
             // We don't expect any new_shard_count shards to exist here, but drop them just in case
-            tenants.retain(|_id, s| s.shard.count != *new_shard_count);
+            tenants
+                .retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count));
 
             detach_locations
         };
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 0bfc4b1d8c..4c9887fb92 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1334,6 +1334,13 @@ def test_sharding_split_failures(
         tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}'
     )
 
+    # Create bystander tenants with various shard counts. They should not be affected by the aborted
+    # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589.
+    bystanders = {}  # id → shard_count
+    for bystander_shard_count in [1, 2, 4, 8]:
+        id, _ = env.create_tenant(shard_count=bystander_shard_count)
+        bystanders[id] = bystander_shard_count
+
     env.storage_controller.allowed_errors.extend(
         [
             # All split failures log a warning when then enqueue the abort operation
@@ -1394,6 +1401,8 @@ def test_sharding_split_failures(
             locations = ps.http_client().tenant_list_locations()["tenant_shards"]
             for loc in locations:
                 tenant_shard_id = TenantShardId.parse(loc[0])
+                if tenant_shard_id.tenant_id != tenant_id:
+                    continue  # skip bystanders
                 log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
                 assert tenant_shard_id.shard_count == initial_shard_count
                 if loc[1]["mode"] == "Secondary":
@@ -1414,6 +1423,8 @@ def test_sharding_split_failures(
             locations = ps.http_client().tenant_list_locations()["tenant_shards"]
            for loc in locations:
                 tenant_shard_id = TenantShardId.parse(loc[0])
+                if tenant_shard_id.tenant_id != tenant_id:
+                    continue  # skip bystanders
                 log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
                 assert tenant_shard_id.shard_count == split_shard_count
                 if loc[1]["mode"] == "Secondary":
@@ -1496,6 +1507,12 @@ def test_sharding_split_failures(
     # the scheduler reaches an idle state
     env.storage_controller.reconcile_until_idle(timeout_secs=30)
 
+    # Check that all bystanders are still around.
+    for bystander_id, bystander_shard_count in bystanders.items():
+        response = env.storage_controller.tenant_describe(bystander_id)
+        assert TenantId(response["tenant_id"]) == bystander_id
+        assert len(response["shards"]) == bystander_shard_count
+
     env.storage_controller.consistency_check()
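
For readers unfamiliar with the shard map, here is a minimal, self-contained sketch of the bug and the fix. It is not storcon code: `TenantId`, `TenantShardId`, and `ShardCount` below are simplified stand-ins for the real types (the real map values carry full per-shard state and the predicate reads `s.shard.count`), but the `retain` semantics are the same: filtering on shard count alone drops bystander tenants' shards, while additionally matching the aborting tenant's ID preserves them.

```rust
// Illustrative sketch only; simplified stand-ins for the storcon types.
use std::collections::BTreeMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct TenantId(u32);

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct TenantShardId {
    tenant_id: TenantId,
    shard_number: u8,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ShardCount(u8);

fn main() {
    // Two tenants, each holding 4 shards of the new shard count;
    // only tenant 1's split is being aborted.
    let mut tenants: BTreeMap<TenantShardId, ShardCount> = BTreeMap::new();
    for tenant_id in [TenantId(1), TenantId(2)] {
        for shard_number in 0..4u8 {
            tenants.insert(TenantShardId { tenant_id, shard_number }, ShardCount(4));
        }
    }

    let aborting_tenant = TenantId(1);
    let new_shard_count = ShardCount(4);

    // Old predicate: keyed on shard count only, so tenant 2's shards are dropped too.
    let mut buggy = tenants.clone();
    buggy.retain(|_id, count| *count != new_shard_count);
    assert_eq!(buggy.len(), 0); // bystander tenant 2 was wiped from the map as well

    // Fixed predicate: only drop shards of the tenant whose split is being aborted.
    tenants.retain(|id, count| !(id.tenant_id == aborting_tenant && *count == new_shard_count));
    assert_eq!(tenants.len(), 4); // tenant 2's 4 shards survive

    println!("bystander shards remaining: {}", tenants.len());
}
```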