storcon: make hb interval an argument and speed up tests (#8880)

## Problem
Each test might wait for up to 5s in order to HB the pageserver.

## Summary of changes
Make the heartbeat interval configurable and use a really tight one for
neon local => startup quicker
This commit is contained in:
Vlad Lazar
2024-09-04 10:09:41 +01:00
committed by GitHub
parent ecfa3d9de9
commit 75310fe441
5 changed files with 28 additions and 6 deletions

View File

@@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf {
pub split_threshold: Option<u64>,
pub max_secondary_lag_bytes: Option<u64>,
#[serde(with = "humantime_serde")]
pub heartbeat_interval: Duration,
}
impl NeonStorageControllerConf {
@@ -172,6 +175,9 @@ impl NeonStorageControllerConf {
const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
// Very tight heartbeat interval to speed up tests
const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
}
impl Default for NeonStorageControllerConf {
@@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf {
database_url: None,
split_threshold: None,
max_secondary_lag_bytes: None,
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
}
}
}

View File

@@ -437,6 +437,8 @@ impl StorageController {
&humantime::Duration::from(self.config.max_offline).to_string(),
"--max-warming-up-interval",
&humantime::Duration::from(self.config.max_warming_up).to_string(),
"--heartbeat-interval",
&humantime::Duration::from(self.config.heartbeat_interval).to_string(),
"--address-for-peers",
&address_for_peers.to_string(),
]

View File

@@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::chaos_injector::ChaosInjector;
use storage_controller::service::{
Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
RECONCILER_CONCURRENCY_DEFAULT,
Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT,
MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
@@ -104,6 +104,10 @@ struct Cli {
// a pageserver
#[arg(long)]
max_secondary_lag_bytes: Option<u64>,
// Period with which to send heartbeats to registered nodes
#[arg(long)]
heartbeat_interval: Option<humantime::Duration>,
}
enum StrictMode {
@@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> {
split_threshold: args.split_threshold,
neon_local_repo_dir: args.neon_local_repo_dir,
max_secondary_lag_bytes: args.max_secondary_lag_bytes,
heartbeat_interval: args
.heartbeat_interval
.map(humantime::Duration::into)
.unwrap_or(HEARTBEAT_INTERVAL_DEFAULT),
address_for_peers: args.address_for_peers,
start_as_candidate: args.start_as_candidate,
http_service_port: args.listen.port() as i32,

View File

@@ -121,6 +121,9 @@ pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
/// being handled on the pageserver side.
pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
/// How often to send heartbeats to registered nodes?
pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5);
#[derive(Clone, strum_macros::Display)]
enum TenantOperations {
Create,
@@ -326,6 +329,8 @@ pub struct Config {
// upgraded to primary.
pub max_secondary_lag_bytes: Option<u64>,
pub heartbeat_interval: Duration,
pub address_for_peers: Option<Uri>,
pub start_as_candidate: bool,
@@ -909,9 +914,7 @@ impl Service {
async fn spawn_heartbeat_driver(&self) {
self.startup_complete.clone().wait().await;
const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL);
let mut interval = tokio::time::interval(self.config.heartbeat_interval);
while !self.cancel.is_cancelled() {
tokio::select! {
_ = interval.tick() => { }

View File

@@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
tenant_id: TenantId = env.initial_tenant
timeline_id = env.initial_timeline
# Multiple creation requests which race will generate this error
# Multiple creation requests which race will generate this error on the pageserver
# and storage controller respectively
env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
# Tenant creation requests which arrive out of order will generate complaints about
# generation nubmers out of order.