storage controller: improve safety of shard splits coinciding with controller restarts (#11412)

## Problem

The graceful leadership transfer process involves calling `step_down` on
the old controller, but `step_down` did not wait for in-flight shard splits to
complete, so the new controller could end up trying to abort a shard split
that was still running on the old controller.

We already mitigated this in #11256 by avoiding the case where shard
split completion would update the database incorrectly, but that was a
fragile fix: it assumed the database update was the only problematic part
of letting the split keep running concurrently.

Precursors:
- #11290 
- #11256

Closes: #11254 

## Summary of changes

- Hold the reconciler gate for the duration of shard splits, so that `step_down`
waits for them to finish (see the sketch after this list). Splits should always be
fairly prompt, so it is okay to wait here.
- Defense in depth: if `step_down` times out (hardcoded 10-second limit),
fully terminate the controller process rather than letting it continue
running and potentially taking split-brain actions. This is safe because
the new controller always declares itself leader unilaterally when
`step_down` fails, so there is no benefit to leaving the old controller
running.
- Tests: extend
`test_storage_controller_leadership_transfer_during_split` to separately
exercise the case of a split holding up step_down, and the case where
the overall timeout on step_down is hit and the controller terminates.
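
To make the first two bullets concrete, here is a minimal sketch of the gate pattern they describe. This is hypothetical illustration code in Python (the real storage controller is Rust, and the `Gate`/`step_down` names and signatures below are invented for the example): a shard split holds a guard for its whole duration, `step_down` waits for every guard to be dropped, and if the hardcoded timeout expires the process terminates itself rather than risk split-brain behaviour.

```python
import os
import threading
from contextlib import contextmanager


class Gate:
    """Hypothetical stand-in for the controller's reconciler gate."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._empty = threading.Condition(self._lock)
        self._holders = 0

    @contextmanager
    def enter(self):
        # A shard split (or reconciler) holds the gate for its whole duration.
        with self._lock:
            self._holders += 1
        try:
            yield
        finally:
            with self._lock:
                self._holders -= 1
                if self._holders == 0:
                    self._empty.notify_all()

    def close(self, timeout: float) -> bool:
        # Block until every holder has dropped its guard; False if we timed out.
        with self._lock:
            return self._empty.wait_for(lambda: self._holders == 0, timeout=timeout)


def step_down(gate: Gate, timeout_seconds: float = 10.0) -> None:
    if not gate.close(timeout=timeout_seconds):
        # Defense in depth: if a split (or anything else holding the gate) does
        # not finish in time, terminate instead of running on as a possible
        # split-brain.
        os._exit(1)
    # ...otherwise proceed with the normal step-down handoff to the new leader.
```

In this sketch a split wraps its body in `with gate.enter(): ...`, so a concurrently issued `step_down` blocks until the split's database updates have landed, or gives up after the timeout and exits.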
Commit 52dee408dc (parent 5487a20b72), authored by John Spray on 2025-04-10 17:55:37 +01:00 and committed via GitHub.
3 changed files with 125 additions and 47 deletions.

@@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer(
 )
+@pytest.mark.parametrize("step_down_times_out", [False, True])
 def test_storage_controller_leadership_transfer_during_split(
     neon_env_builder: NeonEnvBuilder,
     storage_controller_proxy: StorageControllerProxy,
     port_distributor: PortDistributor,
+    step_down_times_out: bool,
 ):
     """
     Exercise a race between shard splitting and graceful leadership transfer. This is
@@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
     )
     env.storage_controller.reconcile_until_idle()
 
+    # We are testing scenarios where the step down API does not complete: either because it is stuck
+    # doing a shard split, or because it totally times out on some other failpoint.
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*step_down.*request was dropped before completing.*",
+            ".*step_down.*operation timed out.*",
+            ".*Send step down request failed, will retry.*",
+            ".*Send step down request still failed after.*retries.*",
+            ".*Leader .+ did not respond to step-down request.*",
+        ]
+    )
+
     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
         # Start a shard split
         env.storage_controller.allowed_errors.extend(
@@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
         )
         pause_failpoint = "shard-split-pre-complete"
         env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
+
+        if not step_down_times_out:
+            # Prevent the timeout self-terminate code from executing: we will block step down on the
+            # shard split itself
+            env.storage_controller.configure_failpoints(
+                ("step-down-delay-timeout", "return(3600000)")
+            )
+
         split_fut = executor.submit(
             env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
         )
@@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split(
             timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
         )
 
+        if step_down_times_out:
+            # Step down will time out, original controller will terminate itself
+            env.storage_controller.allowed_errors.extend([".*terminating process.*"])
+        else:
+            # Step down does not time out: original controller hits its shard split completion
+            # code path and realises that it must not purge the parent shards from the database.
+            env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
+
         def passed_split_abort():
             try:
                 log.info("Checking log for pattern...")
-                assert env.storage_controller.log_contains(
-                    ".*Using observed state received from leader.*"
-                )
+                # This log is indicative of entering startup_reconcile, which happens
+                # after the point we would abort shard splits
+                assert env.storage_controller.log_contains(".*Populating tenant shards.*")
             except Exception:
                 log.exception("Failed to find pattern in log")
                 raise
@@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split(
         wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
         assert env.storage_controller.log_contains(".*Aborting shard split.*")
 
-        # Proxy is still talking to original controller here: disable its pause failpoint so
-        # that its shard split can run to completion.
-        log.info("Disabling failpoint")
-        # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
-        # on handling the shard split request.
-        env.storage_controller.request(
-            "PUT",
-            f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
-            json=[{"name": "shard-split-pre-complete", "actions": "off"}],
-            headers=env.storage_controller.headers(TokenScope.ADMIN),
-        )
+        if step_down_times_out:
+            # We will let the old controller hit a timeout path where it terminates itself, rather than
+            # completing step_down and trying to complete a shard split
+            def old_controller_terminated():
+                assert env.storage_controller.log_contains(".*terminating process.*")
+
+            wait_until(old_controller_terminated)
+        else:
+            # Proxy is still talking to original controller here: disable its pause failpoint so
+            # that its shard split can run to completion.
+            log.info("Disabling failpoint")
+            # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
+            # on handling the shard split request.
+            env.storage_controller.request(
+                "PUT",
+                f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
+                json=[{"name": "shard-split-pre-complete", "actions": "off"}],
+                headers=env.storage_controller.headers(TokenScope.ADMIN),
+            )
 
-        def previous_stepped_down():
-            assert (
-                env.storage_controller.get_leadership_status()
-                == StorageControllerLeadershipStatus.STEPPED_DOWN
-            )
+            def previous_stepped_down():
+                assert (
+                    env.storage_controller.get_leadership_status()
+                    == StorageControllerLeadershipStatus.STEPPED_DOWN
+                )
 
-        log.info("Awaiting step down")
-        wait_until(previous_stepped_down)
+            log.info("Awaiting step down")
+            wait_until(previous_stepped_down)
 
-        # Let the shard split complete: this may happen _after_ the replacement has come up
-        # and tried to clean up the databases
-        log.info("Unblocking & awaiting shard split")
-        with pytest.raises(Exception, match="Unexpected child shard count"):
-            # This split fails when it tries to persist results, because it encounters
-            # changes already made by the new controller's abort-on-startup
-            split_fut.result()
+            # Let the shard split complete: this may happen _after_ the replacement has come up
+            # and tried to clean up the databases
+            log.info("Unblocking & awaiting shard split")
+            with pytest.raises(Exception, match="Unexpected child shard count"):
+                # This split fails when it tries to persist results, because it encounters
+                # changes already made by the new controller's abort-on-startup
+                split_fut.result()
 
         log.info("Routing to new leader")
         storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split(
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()
 
-    # Check that the stepped down instance forwards requests
-    # to the new leader while it's still running.
-    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
-    env.storage_controller.tenant_shard_dump()
-    env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
-    status = env.storage_controller.node_status(env.pageservers[0].id)
-    assert status["scheduling"] == "Pause"
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_shard_dump()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"
 
 
 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):