From 8426fb886bcb19e509b2d4d40a0682316163685f Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Apr 2024 14:20:12 +0100 Subject: [PATCH] storage_controller: wait for db on startup (#7479) ## Problem In some dev/test environments, there aren't health checks to guarantee the database is available before starting the controller. This creates friction for the developer. ## Summary of changes - Wait up to 5 seconds for the database to become available on startup --- storage_controller/src/main.rs | 3 +++ storage_controller/src/persistence.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 6466b9f7a3..ca55d6c593 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,6 +5,7 @@ use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -245,6 +246,8 @@ async fn async_main() -> anyhow::Result<()> { }; // After loading secrets & config, but before starting anything else, apply database migrations + Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?; + migration_run(&secrets.database_url) .await .context("Running database migrations")?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 5312e1e218..dca37166ba 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,6 +2,7 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use std::time::Instant; use self::split_state::SplitState; use camino::Utf8Path; @@ -144,6 +145,31 @@ impl Persistence { } } + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... ({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where