storcon: do not retry sk migration ops if the quorum is reached

This commit is contained in:
Dmitrii Kovalkov
2025-07-23 13:38:32 +04:00
parent fc242afcc2
commit 961835add6
3 changed files with 72 additions and 7 deletions

View File

@@ -1540,6 +1540,17 @@ class NeonEnv:
raise RuntimeError(f"Pageserver with ID {id} not found")
def get_safekeeper(self, id: int) -> Safekeeper:
    """
    Return the safekeeper of this environment whose ID equals `id`.

    Raises RuntimeError if none of `self.safekeepers` has that ID.
    """
    # First match wins, same as scanning the list in order.
    match = next((candidate for candidate in self.safekeepers if candidate.id == id), None)
    if match is None:
        raise RuntimeError(f"Safekeeper with ID {id} not found")
    return match
def get_tenant_pageserver(self, tenant_id: TenantId | TenantShardId):
"""
Get the NeonPageserver where this tenant shard is currently attached, according

View File

@@ -196,3 +196,33 @@ def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBui
assert (
f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text
)
def test_migrate_from_unavailable_sk(neon_env_builder: NeonEnvBuilder):
    """
    Check that a timeline can be migrated away from a safekeeper that is down,
    as long as the remaining members still form a quorum.
    """
    # Four safekeepers with three per timeline leaves exactly one spare
    # to migrate onto.
    neon_env_builder.num_safekeepers = 4
    neon_env_builder.storage_controller_config = {
        "timelines_onto_safekeepers": True,
        "timeline_safekeeper_count": 3,
    }
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)

    controller = env.storage_controller
    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline

    conf_before = controller.timeline_locate(tenant_id, timeline_id)
    current_sk_set = conf_before["sk_set"]
    assert len(current_sk_set) == 3

    # The safekeeper that is not yet a member of the timeline's set.
    spare_sk_ids = [sk.id for sk in env.safekeepers if sk.id not in current_sk_set]
    spare_sk_id = spare_sk_ids[0]

    # Take down the first member; the other two remain running.
    stopped_sk_id = current_sk_set[0]
    env.get_safekeeper(stopped_sk_id).stop()

    # Replace the stopped member with the spare one.
    target_sk_set = current_sk_set[1:] + [spare_sk_id]
    controller.migrate_safekeepers(tenant_id, timeline_id, target_sk_set)

    conf_after = controller.timeline_locate(tenant_id, timeline_id)
    assert conf_after["sk_set"] == target_sk_set
    # NOTE(review): generation 3 presumably reflects two configuration switches
    # starting from generation 1 — confirm against the storage controller.
    assert conf_after["generation"] == 3