storcon: add persistence configuration options

This commit is contained in:
Vlad Lazar
2025-07-25 18:48:17 +01:00
parent 7bd73eba72
commit bab09fffb6
3 changed files with 64 additions and 7 deletions

View File

@@ -16,7 +16,7 @@ use pageserver_api::config::PostHogConfig;
use reqwest::Certificate;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::persistence::{Persistence, PersistenceConfig};
use storage_controller::service::chaos_injector::ChaosInjector;
use storage_controller::service::feature_flag::FeatureFlagService;
use storage_controller::service::{
@@ -229,6 +229,15 @@ struct Cli {
/// **Feature Flag** Whether the storage controller should act to rectify pageserver-reported local disk loss.
#[arg(long, default_value = "false")]
handle_ps_local_disk_loss: bool,
#[arg(long)]
db_max_connections: Option<u32>,
#[arg(long)]
db_idle_connection_timeout: Option<humantime::Duration>,
#[arg(long)]
db_max_connection_lifetime: Option<humantime::Duration>,
}
enum StrictMode {
@@ -429,6 +438,19 @@ async fn async_main() -> anyhow::Result<()> {
None
};
let db_idle_connection_timeout: Option<Duration> = args
.db_idle_connection_timeout
.map(humantime::Duration::into);
let db_max_connection_lifetime: Option<Duration> = args
.db_max_connection_lifetime
.map(humantime::Duration::into);
let persistence_config = PersistenceConfig::new(
args.db_max_connections,
db_idle_connection_timeout,
db_max_connection_lifetime,
);
let config = Config {
pageserver_jwt_token: secrets.pageserver_jwt_token,
safekeeper_jwt_token: secrets.safekeeper_jwt_token,
@@ -482,12 +504,13 @@ async fn async_main() -> anyhow::Result<()> {
.map(humantime::Duration::into)
.unwrap_or(Duration::MAX),
handle_ps_local_disk_loss: args.handle_ps_local_disk_loss,
persistence_config,
};
// Validate that we can connect to the database
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
let persistence = Arc::new(Persistence::new(secrets.database_url).await);
let persistence = Arc::new(Persistence::new(secrets.database_url, persistence_config).await);
let service = Service::spawn(config, persistence.clone()).await?;

View File

@@ -87,6 +87,38 @@ pub struct Persistence {
connection_pool: Pool<AsyncPgConnection>,
}
#[derive(Copy, Clone)]
pub struct PersistenceConfig {
max_connections: u32,
idle_connection_timeout: Duration,
max_connection_lifetime: Duration,
}
impl PersistenceConfig {
pub fn new(
max_connections: Option<u32>,
idle_connection_timeout: Option<Duration>,
max_connection_lifetime: Option<Duration>,
) -> Self {
// If unspecified, use neon.com defaults
//
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
pub const MAX_CONNECTIONS_DEFAULT: u32 = 99;
// We don't want to keep a lot of connections alive: close them down promptly if they aren't being used.
pub const IDLE_CONNECTION_TIMEOUT_DEFAULT: Duration = Duration::from_secs(10);
pub const MAX_CONNECTION_LIFETIME_DEFAULT: Duration = Duration::from_secs(60);
PersistenceConfig {
max_connections: max_connections.unwrap_or(MAX_CONNECTIONS_DEFAULT),
idle_connection_timeout: idle_connection_timeout
.unwrap_or(IDLE_CONNECTION_TIMEOUT_DEFAULT),
max_connection_lifetime: max_connection_lifetime
.unwrap_or(MAX_CONNECTION_LIFETIME_DEFAULT),
}
}
}
/// Legacy format, for use in JSON compat objects in test environment
#[derive(Serialize, Deserialize)]
struct JsonPersistence {
@@ -207,7 +239,7 @@ impl Persistence {
const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10);
const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60);
pub async fn new(database_url: String) -> Self {
pub async fn new(database_url: String, config: PersistenceConfig) -> Self {
let mut mgr_config = ManagerConfig::default();
mgr_config.custom_setup = Box::new(establish_connection_rustls);
@@ -219,9 +251,9 @@ impl Persistence {
// We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time
// to execute queries (database queries are not generally on latency-sensitive paths).
let connection_pool = Pool::builder()
.max_size(Self::MAX_CONNECTIONS)
.max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME))
.idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT))
.max_size(config.max_connections)
.max_lifetime(Some(config.max_connection_lifetime))
.idle_timeout(Some(config.idle_connection_timeout))
// Always keep at least one connection ready to go
.min_idle(Some(1))
.test_on_check_out(true)

View File

@@ -85,7 +85,7 @@ use crate::peer_client::GlobalObservedState;
use crate::persistence::split_state::SplitState;
use crate::persistence::{
AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult,
MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter,
MetadataHealthPersistence, Persistence, PersistenceConfig, ShardGenerationState, TenantFilter,
TenantShardPersistence,
};
use crate::reconciler::{
@@ -490,6 +490,8 @@ pub struct Config {
// Feature flag: Whether the storage controller should act to rectify pageserver-reported local disk loss.
pub handle_ps_local_disk_loss: bool,
pub persistence_config: PersistenceConfig,
}
impl From<DatabaseError> for ApiError {