From 75310fe441b87d399213e365f1364aa9f08aa40d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 4 Sep 2024 10:09:41 +0100 Subject: [PATCH] storcon: make hb interval an argument and speed up tests (#8880) ## Problem Each test might wait for up to 5s in order to HB the pageserver. ## Summary of changes Make the heartbeat interval configurable and use a really tight one for neon local => startup quicker --- control_plane/src/local_env.rs | 7 +++++++ control_plane/src/storage_controller.rs | 2 ++ storage_controller/src/main.rs | 12 ++++++++++-- storage_controller/src/service.rs | 9 ++++++--- test_runner/regress/test_tenants.py | 4 +++- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 74caba2b56..5dbc3bcbbc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf { pub split_threshold: Option, pub max_secondary_lag_bytes: Option, + + #[serde(with = "humantime_serde")] + pub heartbeat_interval: Duration, } impl NeonStorageControllerConf { @@ -172,6 +175,9 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); + + // Very tight heartbeat interval to speed up tests + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); } impl Default for NeonStorageControllerConf { @@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf { database_url: None, split_threshold: None, max_secondary_lag_bytes: None, + heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 27d8e2de0c..c715d6b789 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -437,6 +437,8 @@ impl StorageController { &humantime::Duration::from(self.config.max_offline).to_string(), "--max-warming-up-interval", &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--heartbeat-interval", + &humantime::Duration::from(self.config.heartbeat_interval).to_string(), "--address-for-peers", &address_for_peers.to_string(), ] diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index e3f29b84e7..00e90f4467 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, + MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -104,6 +104,10 @@ struct Cli { // a pageserver #[arg(long)] max_secondary_lag_bytes: Option, + + // Period with which to send heartbeats to registered nodes + #[arg(long)] + heartbeat_interval: Option, } enum StrictMode { @@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> { split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, + heartbeat_interval: args + .heartbeat_interval + .map(humantime::Duration::into) + .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT), address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 95821827e2..49253cb4e0 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -121,6 +121,9 @@ pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); /// being handled on the pageserver side. pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +/// How often to send heartbeats to registered nodes? +pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5); + #[derive(Clone, strum_macros::Display)] enum TenantOperations { Create, @@ -326,6 +329,8 @@ pub struct Config { // upgraded to primary. pub max_secondary_lag_bytes: Option, + pub heartbeat_interval: Duration, + pub address_for_peers: Option, pub start_as_candidate: bool, @@ -909,9 +914,7 @@ impl Service { async fn spawn_heartbeat_driver(&self) { self.startup_complete.clone().wait().await; - const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); - - let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + let mut interval = tokio::time::interval(self.config.heartbeat_interval); while !self.cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { } diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0ebf714de0..b63ff7f6bd 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): tenant_id: TenantId = env.initial_tenant timeline_id = env.initial_timeline - # Multiple creation requests which race will generate this error + # Multiple creation requests which race will generate this error on the pageserver + # and storage controller respectively env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") + env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*") # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order.