From d3fa0f6b9efd750904a145036cf306fde32fa4f3 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 7 Jan 2025 16:30:32 +0100 Subject: [PATCH] storage_controller: rename failpoint and make it pausable The same failpoint is used for a new test by a follow up commit and that needs a pausable failpoint. --- storage_controller/src/reconciler.rs | 3 +-- test_runner/regress/test_storage_controller.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 475f91eff4..60ef3de67f 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -14,7 +14,6 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; -use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -824,7 +823,7 @@ impl Reconciler { .handle_detach(self.tenant_shard_id, self.shard.stripe_size); } - failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + pausable_failpoint!("reconciler-epilogue"); Ok(()) } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7062c35e05..b84e97b5a5 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2406,7 +2406,14 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): env.storage_controller.tenant_create(tid) env.storage_controller.reconcile_until_idle() - env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "pause")) + + def unpause_failpoint(): + time.sleep(2) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "off")) + + thread = threading.Thread(target=unpause_failpoint) + thread.start() # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -2421,6 +2428,8 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): observed_state = env.storage_controller.step_down() log.info(f"Storage controller stepped down with {observed_state=}") + thread.join() + # Validate that we waited for the slow reconcile to complete # and updated the observed state in the storcon before stepping down. node_id = str(env.pageserver.id)