storage_controller: make leadership protocol more robust (#11703)
## Problem
We saw the following scenario in staging:
1. Pod A starts up, becomes leader, and cleanly steps down the previous pod.
2. Pod B starts up (deployment).
3. The step down request from pod B to pod A times out: pod A did not manage to stop its reconciliations within 10 seconds and exited with return code 1 ([code](7ba8519b43/storage_controller/src/service.rs (L8686-L8702))).
4. Pod B marks itself as the leader and finishes start-up.
5. k8s restarts pod A.
6. k8s marks pod B as ready.
7. Pod A sends a step down request to pod B. This succeeds, so pod A is now the leader.
8. k8s kills pod A because it thinks pod B is healthy and pod A is part of the old replica set.

We end up in a situation where the only pod we have (B) is stepped down and attempts to forward requests to a leader that doesn't exist. k8s can't detect that pod B is in a bad state since the /status endpoint simply returns 200 if the process is running.
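
To make that last point concrete, here is a minimal Python sketch (not the controller's actual Rust handler) contrasting the current behaviour with a leadership-aware liveness check. The `leadership_status()` helper and the `"stepped_down"` value are assumptions for illustration only:

```python
from http import HTTPStatus


def status_handler(controller) -> int:
    # Current /status behaviour: 200 whenever the process is running, so k8s
    # cannot tell that the only remaining pod (B) has been stepped down.
    return HTTPStatus.OK


def liveness_handler(controller) -> int:
    # One possible leadership-aware liveness policy: a controller that has
    # stepped down reports itself unhealthy, so k8s notices the bad state
    # instead of keeping the pod Ready on the strength of a bare 200.
    if controller.leadership_status() == "stepped_down":
        return HTTPStatus.SERVICE_UNAVAILABLE
    return HTTPStatus.OK
```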
## Summary of changes
This PR includes a number of robustness improvements to the leadership protocol (a rough sketch of how they fit together follows the list):
* use a single step down task per controller
* add a new endpoint to be used as the k8s liveness probe and check leadership status there
* handle restarts explicitly (i.e. don't step yourself down)
* increase the step down retry count
* don't kill the process on a long step down, since k8s will just restart it
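
The sketch below is hypothetical Python, not the Rust implementation: `read_leader_from_db`, `request_step_down`, `mark_leader_in_db` and the retry count are made up for illustration. It shows how the explicit restart handling, the higher retry count, and the no-self-kill behaviour combine on the start-up path:

```python
import logging
import time

log = logging.getLogger("leadership-sketch")

STEP_DOWN_RETRIES = 8  # illustrative only; the PR just says the retry count was increased


def become_leader(self_address, read_leader_from_db, request_step_down, mark_leader_in_db):
    """Start-up path of a controller instance, with the robustness fixes applied."""
    leader = read_leader_from_db()

    # Handle restarts explicitly: if the database already records *us* as the
    # leader (we were restarted in place), do not send a step down request to
    # ourselves.
    if leader is None or leader == self_address:
        mark_leader_in_db(self_address)
        return

    # Retry the step down request a bounded (and now larger) number of times.
    for attempt in range(STEP_DOWN_RETRIES):
        if request_step_down(leader):
            break
        log.warning("Send step down request failed, will retry (attempt %d)", attempt + 1)
        time.sleep(1)
    else:
        # The previous leader may simply be gone or wedged. We log and proceed;
        # on the other side, a leader whose reconciliations take too long to
        # stop no longer exits with code 1, it only logs a warning, since k8s
        # would just restart the process anyway.
        log.warning("Leader %s did not respond to step-down request", leader)

    mark_leader_in_db(self_address)
```

The warnings in the sketch mirror the log lines the updated test allows below ("Send step down request failed, will retry", "Leader ... did not respond to step-down request", "Stopping reconciliations during step down is taking too long").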
@@ -2894,12 +2894,10 @@ def test_storage_controller_leadership_transfer(
     )


-@pytest.mark.parametrize("step_down_times_out", [False, True])
 def test_storage_controller_leadership_transfer_during_split(
     neon_env_builder: NeonEnvBuilder,
     storage_controller_proxy: StorageControllerProxy,
     port_distributor: PortDistributor,
-    step_down_times_out: bool,
 ):
     """
     Exercise a race between shard splitting and graceful leadership transfer. This is
@@ -2940,8 +2938,8 @@ def test_storage_controller_leadership_transfer_during_split(
     )
     env.storage_controller.reconcile_until_idle()

-    # We are testing scenarios where the step down API does not complete: either because it is stuck
-    # doing a shard split, or because it totally times out on some other failpoint.
+    # We are testing scenarios where the step down API does not complete: it is stuck
+    # doing a shard split
     env.storage_controller.allowed_errors.extend(
         [
             ".*step_down.*request was dropped before completing.*",
@@ -2949,6 +2947,7 @@ def test_storage_controller_leadership_transfer_during_split(
             ".*Send step down request failed, will retry.*",
             ".*Send step down request still failed after.*retries.*",
             ".*Leader .+ did not respond to step-down request.*",
+            ".*Stopping reconciliations during step down is taking too long.*",
         ]
     )

@@ -2960,13 +2959,6 @@ def test_storage_controller_leadership_transfer_during_split(
         pause_failpoint = "shard-split-pre-complete"
         env.storage_controller.configure_failpoints((pause_failpoint, "pause"))

-        if not step_down_times_out:
-            # Prevent the timeout self-terminate code from executing: we will block step down on the
-            # shard split itself
-            env.storage_controller.configure_failpoints(
-                ("step-down-delay-timeout", "return(3600000)")
-            )
-
         split_fut = executor.submit(
             env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
         )
@@ -2985,13 +2977,9 @@ def test_storage_controller_leadership_transfer_during_split(
             timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
         )

-        if step_down_times_out:
-            # Step down will time out, original controller will terminate itself
-            env.storage_controller.allowed_errors.extend([".*terminating process.*"])
-        else:
-            # Step down does not time out: original controller hits its shard split completion
-            # code path and realises that it must not purge the parent shards from the database.
-            env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
+        # Step down does not time out: original controller hits its shard split completion
+        # code path and realises that it must not purge the parent shards from the database.
+        env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])

         def passed_split_abort():
             try:
@@ -3007,42 +2995,34 @@ def test_storage_controller_leadership_transfer_during_split(
         wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
         assert env.storage_controller.log_contains(".*Aborting shard split.*")

-        if step_down_times_out:
-            # We will let the old controller hit a timeout path where it terminates itself, rather than
-            # completing step_down and trying to complete a shard split
-            def old_controller_terminated():
-                assert env.storage_controller.log_contains(".*terminating process.*")
-
-            wait_until(old_controller_terminated)
-        else:
-            # Proxy is still talking to original controller here: disable its pause failpoint so
-            # that its shard split can run to completion.
-            log.info("Disabling failpoint")
-            # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
-            # on handling the shard split request.
-            env.storage_controller.request(
-                "PUT",
-                f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
-                json=[{"name": "shard-split-pre-complete", "actions": "off"}],
-                headers=env.storage_controller.headers(TokenScope.ADMIN),
-            )
-
-            def previous_stepped_down():
-                assert (
-                    env.storage_controller.get_leadership_status()
-                    == StorageControllerLeadershipStatus.STEPPED_DOWN
-                )
-
-            log.info("Awaiting step down")
-            wait_until(previous_stepped_down)
-
-            # Let the shard split complete: this may happen _after_ the replacement has come up
-            # and tried to clean up the databases
-            log.info("Unblocking & awaiting shard split")
-            with pytest.raises(Exception, match="Unexpected child shard count"):
-                # This split fails when it tries to persist results, because it encounters
-                # changes already made by the new controller's abort-on-startup
-                split_fut.result()
+        # Proxy is still talking to original controller here: disable its pause failpoint so
+        # that its shard split can run to completion.
+        log.info("Disabling failpoint")
+        # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
+        # on handling the shard split request.
+        env.storage_controller.request(
+            "PUT",
+            f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
+            json=[{"name": "shard-split-pre-complete", "actions": "off"}],
+            headers=env.storage_controller.headers(TokenScope.ADMIN),
+        )
+
+        def previous_stepped_down():
+            assert (
+                env.storage_controller.get_leadership_status()
+                == StorageControllerLeadershipStatus.STEPPED_DOWN
+            )
+
+        log.info("Awaiting step down")
+        wait_until(previous_stepped_down)
+
+        # Let the shard split complete: this may happen _after_ the replacement has come up
+        # and tried to clean up the databases
+        log.info("Unblocking & awaiting shard split")
+        with pytest.raises(Exception, match="Unexpected child shard count"):
+            # This split fails when it tries to persist results, because it encounters
+            # changes already made by the new controller's abort-on-startup
+            split_fut.result()

         log.info("Routing to new leader")
         storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -3060,14 +3040,13 @@ def test_storage_controller_leadership_transfer_during_split(
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()

-    if not step_down_times_out:
-        # Check that the stepped down instance forwards requests
-        # to the new leader while it's still running.
-        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
-        env.storage_controller.tenant_shard_dump()
-        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
-        status = env.storage_controller.node_status(env.pageservers[0].id)
-        assert status["scheduling"] == "Pause"
+    # Check that the stepped down instance forwards requests
+    # to the new leader while it's still running.
+    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+    env.storage_controller.tenant_shard_dump()
+    env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+    status = env.storage_controller.node_status(env.pageservers[0].id)
+    assert status["scheduling"] == "Pause"


 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):