diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index d7ff381f1b..db7746b8eb 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -36,6 +36,8 @@ use std::ffi::OsString; use std::fs::File; use std::process::exit; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; use std::sync::mpsc; use std::thread; use std::time::Duration; @@ -190,7 +192,9 @@ fn main() -> Result<()> { cgroup: cli.cgroup, #[cfg(target_os = "linux")] vm_monitor_addr: cli.vm_monitor_addr, - installed_extensions_collection_interval: cli.installed_extensions_collection_interval, + installed_extensions_collection_interval: Arc::new(AtomicU64::new( + cli.installed_extensions_collection_interval, + )), }, config, )?; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 381f2d45ba..9dcd4fc17c 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -25,7 +25,7 @@ use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; @@ -70,6 +70,7 @@ pub static BUILD_TAG: Lazy = Lazy::new(|| { .unwrap_or(BUILD_TAG_DEFAULT) .to_string() }); +const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600; /// Static configuration params that don't change after startup. These mostly /// come from the CLI args, or are derived from them. @@ -103,7 +104,7 @@ pub struct ComputeNodeParams { pub remote_ext_base_url: Option, /// Interval for installed extensions collection - pub installed_extensions_collection_interval: u64, + pub installed_extensions_collection_interval: Arc, } /// Compute node info shared across several `compute_ctl` threads. @@ -126,6 +127,9 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, pub compute_ctl_config: ComputeCtlConfig, + + /// Handle to the extension stats collection task + extension_stats_task: Mutex>>, } // store some metrics about download size that might impact startup time @@ -428,6 +432,7 @@ impl ComputeNode { state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), compute_ctl_config: config.compute_ctl_config, + extension_stats_task: Mutex::new(None), }) } @@ -515,6 +520,9 @@ impl ComputeNode { None }; + // Terminate the extension stats collection task + this.terminate_extension_stats_task(); + // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. @@ -1671,6 +1679,8 @@ impl ComputeNode { tls_config = self.compute_ctl_config.tls.clone(); } + self.update_installed_extensions_collection_interval(&spec); + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. @@ -1735,6 +1745,8 @@ impl ComputeNode { let tls_config = self.tls_config(&spec); + self.update_installed_extensions_collection_interval(&spec); + if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); @@ -2339,10 +2351,20 @@ LIMIT 100", } pub fn spawn_extension_stats_task(&self) { + // Cancel any existing task + if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { + handle.abort(); + } + let conf = self.tokio_conn_conf.clone(); - let installed_extensions_collection_interval = - self.params.installed_extensions_collection_interval; - tokio::spawn(async move { + let atomic_interval = self.params.installed_extensions_collection_interval.clone(); + let mut installed_extensions_collection_interval = + 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst); + info!( + "[NEON_EXT_SPAWN] Spawning background installed extensions worker with Timeout: {}", + installed_extensions_collection_interval + ); + let handle = tokio::spawn(async move { // An initial sleep is added to ensure that two collections don't happen at the same time. // The first collection happens during compute startup. tokio::time::sleep(tokio::time::Duration::from_secs( @@ -2355,8 +2377,48 @@ LIMIT 100", loop { interval.tick().await; let _ = installed_extensions(conf.clone()).await; + // Acquire a read lock on the compute spec and then update the interval if necessary + interval = tokio::time::interval(tokio::time::Duration::from_secs(std::cmp::max( + installed_extensions_collection_interval, + 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst), + ))); + installed_extensions_collection_interval = interval.period().as_secs(); } }); + + // Store the new task handle + *self.extension_stats_task.lock().unwrap() = Some(handle); + } + + fn terminate_extension_stats_task(&self) { + if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { + handle.abort(); + } + } + + fn update_installed_extensions_collection_interval(&self, spec: &ComputeSpec) { + // Update the interval for collecting installed extensions statistics + // If the value is -1, we never suspend so set the value to default collection. + // If the value is 0, it means default, we will just continue to use the default. + if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { + info!( + "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}", + spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL + ); + self.params.installed_extensions_collection_interval.store( + DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, + std::sync::atomic::Ordering::SeqCst, + ); + } else { + info!( + "[NEON_EXT_INT_UPD] Spec Timeout: {}", + spec.suspend_timeout_seconds + ); + self.params.installed_extensions_collection_interval.store( + spec.suspend_timeout_seconds as u64, + std::sync::atomic::Ordering::SeqCst, + ); + } } } diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 5655a94de4..439d80c057 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -3,7 +3,8 @@ "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", - + "suspend_timeout_seconds": 3600, + "cluster": { "cluster_id": "test-cluster-42", "name": "Zenith Test", diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index e6fe7d90a2..424101b9a4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -780,6 +780,7 @@ impl Endpoint { endpoint_storage_addr: Some(endpoint_storage_addr), endpoint_storage_token: Some(endpoint_storage_token), autoprewarm: false, + suspend_timeout_seconds: -1, // Only used in neon_local. }; // this strange code is needed to support respec() in tests diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json index 21caf3800c..60e232425b 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json +++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json @@ -4,6 +4,7 @@ "timestamp": "2022-10-12T18:00:00.000Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + "suspend_timeout_seconds": -1, "cluster": { "cluster_id": "docker_compose", diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 508040c5e5..6b2caa9d3a 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -184,6 +184,11 @@ pub struct ComputeSpec { /// Download LFC state from endpoint_storage and pass it to Postgres on startup #[serde(default)] pub autoprewarm: bool, + + /// Suspend timeout in seconds. + /// + /// We use this value to derive other values, such as the installed extensions metric. + pub suspend_timeout_seconds: i64, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 2dd2aae015..94d7f1e081 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -3,6 +3,7 @@ "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", + "suspend_timeout_seconds": 3600, "cluster": { "cluster_id": "test-cluster-42",