tests: perform graceful rolling restarts in storcon scale test (#8173)

## Problem
The scale test doesn't exercise drain & fill.

## Summary of changes
Make the scale test exercise drain & fill by performing graceful rolling restarts of the pageservers. The retry and node-status polling helpers move from the test module onto the storage controller test client so they can be shared between tests.
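
At a high level, the restart loop drains each pageserver, waits for it to park in `PauseForRestart`, restarts it, and then fills it back up before moving on. Below is a minimal sketch of that flow, assembled from the `test_graceful_cluster_restart` hunks in this diff; `env`, `ps`, and the helper names all come from the existing test fixtures.

```python
# Sketch only: mirrors the flow in test_graceful_cluster_restart below.
# `env` is the NeonEnv provided by the test fixtures.
for ps in env.pageservers:
    # Drain the node and wait for it to reach "PauseForRestart".
    env.storage_controller.retryable_node_operation(
        lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
    )
    env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5)

    # Restart the pageserver and wait for it to become "Active" again.
    ps.restart()
    env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1)

    # Fill the node back up and wait for the fill to finish.
    env.storage_controller.retryable_node_operation(
        lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
    )
    env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5)
```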
Vlad Lazar authored on 2024-07-04 06:04:19 +01:00 (committed by GitHub)
parent 778787d8e9
commit bbb2fa7cdd
3 changed files with 171 additions and 59 deletions


@@ -1518,49 +1518,6 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
     workload.validate()
-def retryable_node_operation(op, ps_id, max_attempts, backoff):
-    while max_attempts > 0:
-        try:
-            op(ps_id)
-            return
-        except StorageControllerApiException as e:
-            max_attempts -= 1
-            log.info(f"Operation failed ({max_attempts} attempts left): {e}")
-            if max_attempts == 0:
-                raise e
-            time.sleep(backoff)
-def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff):
-    log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
-    while max_attempts > 0:
-        try:
-            status = env.storage_controller.node_status(node_id)
-            policy = status["scheduling"]
-            if policy == desired_scheduling_policy:
-                return
-            else:
-                max_attempts -= 1
-                log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
-                if max_attempts == 0:
-                    raise AssertionError(
-                        f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
-                    )
-                time.sleep(backoff)
-        except StorageControllerApiException as e:
-            max_attempts -= 1
-            log.info(f"Status call failed ({max_attempts} retries left): {e}")
-            if max_attempts == 0:
-                raise e
-            time.sleep(backoff)
 def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
     """
     Graceful restart of storage controller clusters uses the drain and
@@ -1601,10 +1558,10 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
     # Perform a graceful rolling restart
     for ps in env.pageservers:
-        retryable_node_operation(
+        env.storage_controller.retryable_node_operation(
             lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
         )
-        poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5)
+        env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5)
         shard_counts = get_node_shard_counts(env, tenant_ids)
         log.info(f"Shard counts after draining node {ps.id}: {shard_counts}")
@@ -1614,12 +1571,12 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
         assert sum(shard_counts.values()) == total_shards
         ps.restart()
-        poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1)
+        env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1)
-        retryable_node_operation(
+        env.storage_controller.retryable_node_operation(
             lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
         )
-        poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5)
+        env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5)
         shard_counts = get_node_shard_counts(env, tenant_ids)
         log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")
@@ -1657,15 +1614,15 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
     ps_id_to_drain = env.pageservers[0].id
-    retryable_node_operation(
+    env.storage_controller.retryable_node_operation(
         lambda ps_id: env.storage_controller.node_drain(ps_id),
         ps_id_to_drain,
         max_attempts=3,
         backoff=2,
     )
-    poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2)
+    env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2)
     env.storage_controller.cancel_node_drain(ps_id_to_drain)
-    poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2)
+    env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2)
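
The two helpers removed from the test module above now live on the storage controller test client; that change is in one of the other files in this commit and is not shown here. A plausible sketch of how they could be hosted there, mirroring the deleted module-level functions, is below. The class name `StorageControllerClient` is a hypothetical stand-in, and `log` and `StorageControllerApiException` are assumed to be the same names the deleted functions imported from the test fixtures.

```python
import time

class StorageControllerClient:  # hypothetical stand-in for the real fixture class
    def retryable_node_operation(self, op, ps_id, max_attempts, backoff):
        # Retry a node-level operation (e.g. drain or fill) on transient API errors.
        while max_attempts > 0:
            try:
                op(ps_id)
                return
            except StorageControllerApiException as e:  # assumed fixture import
                max_attempts -= 1
                log.info(f"Operation failed ({max_attempts} attempts left): {e}")
                if max_attempts == 0:
                    raise e
                time.sleep(backoff)

    def poll_node_status(self, node_id, desired_scheduling_policy, max_attempts, backoff):
        # Poll the node's scheduling policy until it matches, retrying API errors.
        log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
        while max_attempts > 0:
            try:
                status = self.node_status(node_id)
                policy = status["scheduling"]
                if policy == desired_scheduling_policy:
                    return
                max_attempts -= 1
                log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
                if max_attempts == 0:
                    raise AssertionError(
                        f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
                    )
                time.sleep(backoff)
            except StorageControllerApiException as e:
                max_attempts -= 1
                log.info(f"Status call failed ({max_attempts} retries left): {e}")
                if max_attempts == 0:
                    raise e
                time.sleep(backoff)
```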