diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 43c63e7ef4..b70bd2e1b5 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -20,7 +20,16 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; +use std::{ + ffi::OsStr, + fs, + net::SocketAddr, + path::PathBuf, + process::ExitStatus, + str::FromStr, + sync::OnceLock, + time::{Duration, Instant}, +}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -168,16 +177,6 @@ impl StorageController { .expect("non-Unicode path") } - /// PIDFile for the postgres instance used to store storage controller state - fn postgres_pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf( - self.env - .base_data_dir - .join("storage_controller_postgres.pid"), - ) - .expect("non-Unicode path") - } - /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back @@ -296,6 +295,31 @@ impl StorageController { .map_err(anyhow::Error::new) } + /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres + async fn pg_ctl(&self, args: I) -> ExitStatus + where + I: IntoIterator, + S: AsRef, + { + let pg_bin_dir = self.get_pg_bin_dir().await.unwrap(); + let bin_path = pg_bin_dir.join("pg_ctl"); + + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + + Command::new(bin_path) + .args(args) + .envs(envs) + .spawn() + .expect("Failed to spawn pg_ctl, binary_missing?") + .wait() + .await + .expect("Failed to wait for pg_ctl termination") + } + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); if let Err(err) = tokio::fs::create_dir(&instance_dir).await { @@ -404,20 +428,34 @@ impl StorageController { db_start_args ); - background_process::start_process( - "storage_controller_db", - &self.env.base_data_dir, - pg_bin_dir.join("pg_ctl").as_std_path(), - db_start_args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], - background_process::InitialPidFile::Create(self.postgres_pid_file()), - &start_args.start_timeout, - || self.pg_isready(&pg_bin_dir, postgres_port), - ) - .await?; + let db_start_status = self.pg_ctl(db_start_args).await; + let start_timeout: Duration = start_args.start_timeout.into(); + let db_start_deadline = Instant::now() + start_timeout; + if !db_start_status.success() { + return Err(anyhow::anyhow!( + "Failed to start postgres {}", + db_start_status.code().unwrap() + )); + } + + loop { + if Instant::now() > db_start_deadline { + return Err(anyhow::anyhow!("Timed out waiting for postgres to start")); + } + + match self.pg_isready(&pg_bin_dir, postgres_port).await { + Ok(true) => { + tracing::info!("storage controller postgres is now ready"); + break; + } + Ok(false) => { + tokio::time::sleep(Duration::from_millis(100)).await; + } + Err(e) => { + tracing::warn!("Failed to check postgres status: {e}") + } + } + } self.setup_database(postgres_port).await?; } @@ -583,15 +621,10 @@ impl StorageController { } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; println!("Stopping storage controller database..."); let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; - let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_stop_args) - .spawn()? - .wait() - .await?; + let stop_status = self.pg_ctl(pg_stop_args).await; if !stop_status.success() { match self.is_postgres_running().await { Ok(false) => { @@ -612,14 +645,9 @@ impl StorageController { async fn is_postgres_running(&self) -> anyhow::Result { let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; - let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_status_args) - .spawn()? - .wait() - .await?; + let status_exitcode = self.pg_ctl(pg_status_args).await; // pg_ctl status returns this exit code if postgres is not running: in this case it is // fine that stop failed. Otherwise it is an error that stop failed.