diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6fdad2188c..6c87dbbfc2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2213,6 +2213,30 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def node_drain(self, node_id): + log.info(f"node_drain({node_id})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + headers=self.headers(TokenScope.ADMIN), + ) + + def node_fill(self, node_id): + log.info(f"node_fill({node_id})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + headers=self.headers(TokenScope.ADMIN), + ) + + def node_status(self, node_id): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + def node_list(self): response = self.request( "GET", diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2031feaa83..5c2bfed6e5 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1477,3 +1477,120 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload = Workload(env, tenant_id, timeline, branch_name=branch) workload.expect_rows = expect_rows workload.validate() + + +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): + """ + Graceful reststart of storage controller clusters use the drain and + fill hooks in order to migrate attachments away from pageservers before + restarting. In practice, Ansible will drive this process. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 5 + shard_count_per_tenant = 8 + total_shards = tenant_count * shard_count_per_tenant + tenant_ids = [] + + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + # Give things a chance to settle. + # A call to `reconcile_until_idle` could be used here instead, + # however since all attachments are placed on the same node, + # we'd have to wait for a long time (2 minutes-ish) for optimizations + # to quiesce. + # TODO: once the initial attachment selection is fixed, update this + # to use `reconcile_until_idle`. + time.sleep(2) + + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff): + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = env.storage_controller.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): + # Assert that all nodes have some attached shards + assert len(shard_counts) == len(env.pageservers) + + min_shard_count = min(shard_counts.values()) + max_shard_count = max(shard_counts.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + # Perform a graceful rolling restart + for ps in env.pageservers: + retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) + + retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards) + + # Now check that shards are reasonably balanced + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after rolling restart: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards)