From fed137a5dc4bab5271d7c40cf2c6e9d324e6ddc6 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 17 Jun 2024 09:10:41 +0100
Subject: [PATCH] wip stress tests

---
 .../regress/test_storage_controller_stress.py | 66 ++++++++++++++-----
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/test_runner/regress/test_storage_controller_stress.py b/test_runner/regress/test_storage_controller_stress.py
index ebe7a3aa96..99ebea76bd 100644
--- a/test_runner/regress/test_storage_controller_stress.py
+++ b/test_runner/regress/test_storage_controller_stress.py
@@ -2,6 +2,7 @@
 import concurrent.futures
 import random
 from collections import defaultdict
+import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
@@ -29,9 +30,22 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids):
     return total, attached


+FAILPOINT_API_503 = ("api-503", "5%1000*return(1)")
+FAILPOINT_API_500 = ("api-500", "5%1000*return(1)")
+FAILPOINT_API_HANG = ("api-hang", "5%1000*return(60000)")
+
+
+@pytest.mark.parametrize(
+    "failpoints",
+    [
+        # [],
+        # [FAILPOINT_API_503, FAILPOINT_API_500],
+        # [FAILPOINT_API_HANG],
+        [FAILPOINT_API_503, FAILPOINT_API_500, FAILPOINT_API_HANG],
+    ],
+)
 def test_storcon_rolling_failures(
-    neon_env_builder: NeonEnvBuilder,
-    compute_reconfigure_listener: ComputeReconfigure,
+    neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure, failpoints
 ):

     neon_env_builder.num_pageservers = 8
@@ -43,24 +57,30 @@ def test_storcon_rolling_failures(

     env = neon_env_builder.init_start()

+    env.storage_controller.allowed_errors.extend(
+        [
+            # This log is emitted when a node comes online but then fails to respond to a request: this is
+            # expected, because we do API-level failure injection.
+            ".*Failed to query node location configs, cannot activate.*"
+        ]
+    )
+
     for ps in env.pageservers:
         # We will do unclean detaches
         ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

-    n_tenants = 32
+    n_tenants = 8
     tenants = [(env.initial_tenant, env.initial_timeline)]
     for i in range(0, n_tenants - 1):
         tenant_id = TenantId.generate()
         timeline_id = TimelineId.generate()
         shard_count = [1, 2, 4][i % 3]
         env.neon_cli.create_tenant(
-            tenant_id, timeline_id, shard_count=shard_count, placement_policy='{"Double":1}'
+            tenant_id, timeline_id, shard_count=shard_count, placement_policy='{"Attached":1}'
         )
         tenants.append((tenant_id, timeline_id))

     # Background pain:
-    # - TODO: some fraction of pageserver API requests hang
-    #   (this requires implementing wrap of location_conf calls with proper timeline/cancel)
     # - TODO: continuous tenant/timeline creation/destruction over a different ID range than
     #   the ones we're using for availability checks.

@@ -81,11 +101,13 @@ def test_storcon_rolling_failures(
             psid = shard["node_id"]
             tsid = TenantShardId.parse(shard["shard_id"])
             status = env.get_pageserver(psid).http_client().tenant_status(tenant_id=tsid)
+            log.info(f"Shard {tsid} status on node {psid}: {status}")
             assert status["state"]["slug"] == "Active"
             log.info(f"Shard {tsid} active on node {psid}")

-    failpoints = ("api-503", "5%1000*return(1)")
-    failpoints_str = f"{failpoints[0]}={failpoints[1]}"
+    failpoints = [("api-503", "5%1000*return(1)")]
+    # failpoints_str = f"{failpoints[0]}={failpoints[1]}"
+    failpoints_str = ",".join(f"{f[0]}={f[1]}" for f in failpoints)

     for ps in env.pageservers:
         ps.http_client().configure_failpoints(failpoints)
@@ -156,8 +178,8 @@ def test_storcon_rolling_failures(

     for_all_workloads(init_one, timeout=60)

-    for i in range(0, 20):
-        mode = rng.choice([0, 1, 2])
+    for i in range(0, 4):
+        mode = rng.choice([0, 1, 2, 3])
         log.info(f"Iteration {i}, mode {mode}")
         if mode == 0:
             # Traffic interval: sometimes, instead of a failure, just let the clients
@@ -165,10 +187,23 @@ def test_storcon_rolling_failures(
             # small quantities of data in flight.
             traffic()
         elif mode == 1:
-            clean_fail_restore()
+            # Consistency check: quiesce the controller and check that runtime state matches
+            # database. We intentionally do _not_ do this on every iteration, so that we sometimes leave
+            # some background reconciliations running across iterations, rather than entering each iteration
+            # in a pristine state.
+            env.storage_controller.reconcile_until_idle()
+            env.storage_controller.consistency_check()
         elif mode == 2:
+            clean_fail_restore()
+        elif mode == 3:
             hard_fail_restore()

+        # For convenience when developing, we surface unexpected log errors at every iteration rather than waiting
+        # for the end of the test.
+        env.storage_controller.assert_no_errors()
+        for ps in env.pageservers:
+            ps.assert_no_errors()
+
     # Fail and restart: hard-kill one node. Notify the storage controller that it is offline.
     # Success criteria:
     # - New attach locations should activate within bounded time
@@ -180,9 +215,6 @@ def test_storcon_rolling_failures(
     # - New attach locations should activate within bounded time
     # - New secondary locations should fill up with data within bounded time

-    # TODO: somehow need to wait for reconciles to complete before doing consistency check
-    # (or make the check wait).
-
-    # Do consistency check on every iteration, not just at the end: this makes it more obvious
-    # which change caused an issue.
-    env.storage_controller.consistency_check()
+    # Final check that we can reconcile to a clean state
+    env.storage_controller.reconcile_until_idle()
+    env.storage_controller.consistency_check()
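
A note on the failpoint strings introduced above, with a minimal runnable sketch.
Hedges: the "[p%][cnt*]action[(arg)]" grammar is the fail crate's failpoint
syntax, so "5%1000*return(1)" reads as "with 5% probability, at most 1000 times,
take the return branch"; interpreting the 60000 in the api-hang action as a hang
duration in milliseconds is an inference from the value, not something the patch
states. The render_failpoints helper below is hypothetical and only illustrates
the ",".join(f"{f[0]}={f[1]}" for f in failpoints) expression the patch adds
for failpoints_str.

    # Sketch only, not part of the patch: turn (name, action) failpoint pairs
    # into the comma-separated "name=action" string built for failpoints_str.

    FAILPOINT_API_503 = ("api-503", "5%1000*return(1)")  # 5% of hits, at most 1000 times: error path
    FAILPOINT_API_HANG = ("api-hang", "5%1000*return(60000)")  # assumed: 60000 is a hang duration in ms

    def render_failpoints(failpoints: list[tuple[str, str]]) -> str:
        # Same shape as the patch's ",".join(f"{f[0]}={f[1]}" for f in failpoints)
        return ",".join(f"{name}={action}" for name, action in failpoints)

    assert (
        render_failpoints([FAILPOINT_API_503, FAILPOINT_API_HANG])
        == "api-503=5%1000*return(1),api-hang=5%1000*return(60000)"
    )

Joining several "name=action" pairs into one string lets a single configuration
call arm multiple failpoints at once, which appears to be why the patch swaps
the single tuple for a list before building failpoints_str.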