From bab09fffb6742dd699ae8caea4319088b3357fa4 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 25 Jul 2025 18:48:17 +0100 Subject: [PATCH] storcon: add persistence configuration options --- storage_controller/src/main.rs | 27 ++++++++++++++++-- storage_controller/src/persistence.rs | 40 ++++++++++++++++++++++++--- storage_controller/src/service.rs | 4 ++- 3 files changed, 64 insertions(+), 7 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 34d4ac6fba..1bf6ca5803 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -16,7 +16,7 @@ use pageserver_api::config::PostHogConfig; use reqwest::Certificate; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; -use storage_controller::persistence::Persistence; +use storage_controller::persistence::{Persistence, PersistenceConfig}; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::feature_flag::FeatureFlagService; use storage_controller::service::{ @@ -229,6 +229,15 @@ struct Cli { /// **Feature Flag** Whether the storage controller should act to rectify pageserver-reported local disk loss. #[arg(long, default_value = "false")] handle_ps_local_disk_loss: bool, + + #[arg(long)] + db_max_connections: Option, + + #[arg(long)] + db_idle_connection_timeout: Option, + + #[arg(long)] + db_max_connection_lifetime: Option, } enum StrictMode { @@ -429,6 +438,19 @@ async fn async_main() -> anyhow::Result<()> { None }; + let db_idle_connection_timeout: Option = args + .db_idle_connection_timeout + .map(humantime::Duration::into); + let db_max_connection_lifetime: Option = args + .db_max_connection_lifetime + .map(humantime::Duration::into); + + let persistence_config = PersistenceConfig::new( + args.db_max_connections, + db_idle_connection_timeout, + db_max_connection_lifetime, + ); + let config = Config { pageserver_jwt_token: secrets.pageserver_jwt_token, safekeeper_jwt_token: secrets.safekeeper_jwt_token, @@ -482,12 +504,13 @@ async fn async_main() -> anyhow::Result<()> { .map(humantime::Duration::into) .unwrap_or(Duration::MAX), handle_ps_local_disk_loss: args.handle_ps_local_disk_loss, + persistence_config, }; // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url).await); + let persistence = Arc::new(Persistence::new(secrets.database_url, persistence_config).await); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 1797ae3c32..0fe47e342e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -87,6 +87,38 @@ pub struct Persistence { connection_pool: Pool, } +#[derive(Copy, Clone)] +pub struct PersistenceConfig { + max_connections: u32, + idle_connection_timeout: Duration, + max_connection_lifetime: Duration, +} + +impl PersistenceConfig { + pub fn new( + max_connections: Option, + idle_connection_timeout: Option, + max_connection_lifetime: Option, + ) -> Self { + // If unspecified, use neon.com defaults + // + // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under + // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. + pub const MAX_CONNECTIONS_DEFAULT: u32 = 99; + // We don't want to keep a lot of connections alive: close them down promptly if they aren't being used. + pub const IDLE_CONNECTION_TIMEOUT_DEFAULT: Duration = Duration::from_secs(10); + pub const MAX_CONNECTION_LIFETIME_DEFAULT: Duration = Duration::from_secs(60); + + PersistenceConfig { + max_connections: max_connections.unwrap_or(MAX_CONNECTIONS_DEFAULT), + idle_connection_timeout: idle_connection_timeout + .unwrap_or(IDLE_CONNECTION_TIMEOUT_DEFAULT), + max_connection_lifetime: max_connection_lifetime + .unwrap_or(MAX_CONNECTION_LIFETIME_DEFAULT), + } + } +} + /// Legacy format, for use in JSON compat objects in test environment #[derive(Serialize, Deserialize)] struct JsonPersistence { @@ -207,7 +239,7 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub async fn new(database_url: String) -> Self { + pub async fn new(database_url: String, config: PersistenceConfig) -> Self { let mut mgr_config = ManagerConfig::default(); mgr_config.custom_setup = Box::new(establish_connection_rustls); @@ -219,9 +251,9 @@ impl Persistence { // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). let connection_pool = Pool::builder() - .max_size(Self::MAX_CONNECTIONS) - .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) - .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) + .max_size(config.max_connections) + .max_lifetime(Some(config.max_connection_lifetime)) + .idle_timeout(Some(config.idle_connection_timeout)) // Always keep at least one connection ready to go .min_idle(Some(1)) .test_on_check_out(true) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4c011033af..270242b9fa 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -85,7 +85,7 @@ use crate::peer_client::GlobalObservedState; use crate::persistence::split_state::SplitState; use crate::persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, - MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, + MetadataHealthPersistence, Persistence, PersistenceConfig, ShardGenerationState, TenantFilter, TenantShardPersistence, }; use crate::reconciler::{ @@ -490,6 +490,8 @@ pub struct Config { // Feature flag: Whether the storage controller should act to rectify pageserver-reported local disk loss. pub handle_ps_local_disk_loss: bool, + + pub persistence_config: PersistenceConfig, } impl From for ApiError {