storage_controller: wait for db on startup (#7479)

## Problem

In some dev/test environments, there aren't health checks to guarantee
the database is available before starting the controller. This creates
friction for the developer.

## Summary of changes

- Wait up to 5 seconds for the database to become available on startup
This commit is contained in:
John Spray
2024-04-23 14:20:12 +01:00
committed by GitHub
parent 28e7fa98c4
commit 8426fb886b
2 changed files with 29 additions and 0 deletions

View File

@@ -5,6 +5,7 @@ use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc;
use std::time::Duration;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
@@ -245,6 +246,8 @@ async fn async_main() -> anyhow::Result<()> {
};
// After loading secrets & config, but before starting anything else, apply database migrations
Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?;
migration_run(&secrets.database_url)
.await
.context("Running database migrations")?;

View File

@@ -2,6 +2,7 @@ pub(crate) mod split_state;
use std::collections::HashMap;
use std::str::FromStr;
use std::time::Duration;
use std::time::Instant;
use self::split_state::SplitState;
use camino::Utf8Path;
@@ -144,6 +145,31 @@ impl Persistence {
}
}
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
/// database and the storage controller, therefore the database might not be available right away
pub async fn await_connection(
database_url: &str,
timeout: Duration,
) -> Result<(), diesel::ConnectionError> {
let started_at = Instant::now();
loop {
match PgConnection::establish(database_url) {
Ok(_) => {
tracing::info!("Connected to database.");
return Ok(());
}
Err(e) => {
if started_at.elapsed() > timeout {
return Err(e);
} else {
tracing::info!("Database not yet available, waiting... ({e})");
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
}
}
}
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where