From 13e810574029953ab4f5002724ad853fc2c39922 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 5 Dec 2024 18:57:25 +0100 Subject: [PATCH] feat(compute): Allow specifying the reconfiguration concurrency (#10006) ## Problem We need a higher concurrency during reconfiguration in case of many DBs, but the instance is already running and used by the client. We can easily get out of `max_connections` limit, and the current code won't handle that. ## Summary of changes Default to 1, but also allow control plane to override this value for specific projects. It's also recommended to bump `superuser_reserved_connections` += `reconfigure_concurrency` for such projects to ensure that we always have enough spare connections for reconfiguration process to succeed. Quick workaround for neondatabase/cloud#17846 --- compute_tools/src/compute.rs | 7 +------ control_plane/src/endpoint.rs | 1 + libs/compute_api/src/spec.rs | 25 ++++++++++++++++++++++--- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0d1e6d680f..d72a04f2f9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1243,12 +1243,7 @@ impl ComputeNode { let postgresql_conf_path = pgdata_path.join("postgresql.conf"); config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; - // TODO(ololobus): We need a concurrency during reconfiguration as well, - // but DB is already running and used by user. We can easily get out of - // `max_connections` limit, and the current code won't handle that. 
- // let compute_state = self.state.lock().unwrap().clone(); - // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec); - let max_concurrent_connections = 1; + let max_concurrent_connections = spec.reconfigure_concurrency; // Temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 1ca6dc43c4..360857f365 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -618,6 +618,7 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, + reconfigure_concurrency: 1, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 8a447563dc..6d9c353cda 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -19,6 +19,10 @@ pub type PgIdent = String; /// String type alias representing Postgres extension version pub type ExtVersion = String; +fn default_reconfigure_concurrency() -> usize { + 1 +} + /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -67,7 +71,7 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option<Vec<DeltaOp>>, - /// An optinal hint that can be passed to speed up startup time if we know + /// An optional hint that can be passed to speed up startup time if we know /// that no pg catalog mutations (like role creation, database creation, /// extension creation) need to be done on the actual database to start. #[serde(default)] // Default false @@ -86,9 +90,7 @@ pub struct ComputeSpec { // etc. GUCs in cluster.settings. 
TODO: Once the control plane has been // updated to fill these fields, we can make these non optional. pub tenant_id: Option<TenantId>, - pub timeline_id: Option<TimelineId>, - pub pageserver_connstring: Option<String>, #[serde(default)] @@ -113,6 +115,20 @@ pub struct ComputeSpec { /// Local Proxy configuration used for JWT authentication #[serde(default)] pub local_proxy_config: Option<LocalProxySpec>, + + /// Number of concurrent connections during the parallel RunInEachDatabase + /// phase of the apply config process. + /// + /// We need a higher concurrency during reconfiguration in case of many DBs, + /// but instance is already running and used by client. We can easily get out of + /// `max_connections` limit, and the current code won't handle that. + /// + /// Default is 1, but also allow control plane to override this value for specific + /// projects. It's also recommended to bump `superuser_reserved_connections` += + /// `reconfigure_concurrency` for such projects to ensure that we always have + /// enough spare connections for reconfiguration process to succeed. + #[serde(default = "default_reconfigure_concurrency")] + pub reconfigure_concurrency: usize, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -315,6 +331,9 @@ mod tests { // Features list defaults to empty vector. assert!(spec.features.is_empty()); + + // Reconfigure concurrency defaults to 1. + assert_eq!(spec.reconfigure_concurrency, 1); } #[test]