From e3ff87ce3bfa99662a9b2d299b78be1bfa35b8cd Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 23 Oct 2024 17:29:55 +0100 Subject: [PATCH] tests: avoid using background_process when invoking pg_ctl (#9469) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Occasionally, we get failures to start the storage controller's db with errors like: ``` aborting due to panic at /__w/neon/neon/control_plane/src/background_process.rs:349:67: claim pid file: lock file Caused by: file is already locked ``` e.g. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9428/11380574562/index.html#/testresult/1c68d413ea9ecd4a This is happening in a stop,start cycle during a test. Presumably the pidfile from the startup background process is still held at the point we stop, because we let pg_ctl keep running in the background. ## Summary of changes - Refactor pg_ctl invocations into a helper - In the controller's `start` function, use pg_ctl & a wait loop for pg_isready, instead of using background_process --------- Co-authored-by: Arpad Müller --- control_plane/src/storage_controller.rs | 102 +++++++++++++++--------- 1 file changed, 65 insertions(+), 37 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 43c63e7ef4..b70bd2e1b5 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -20,7 +20,16 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; +use std::{ + ffi::OsStr, + fs, + net::SocketAddr, + path::PathBuf, + process::ExitStatus, + str::FromStr, + sync::OnceLock, + time::{Duration, Instant}, +}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -168,16 +177,6 @@ impl StorageController { .expect("non-Unicode path") } - /// PIDFile for the postgres instance used to store storage controller state - fn postgres_pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf( - self.env - .base_data_dir - .join("storage_controller_postgres.pid"), - ) - .expect("non-Unicode path") - } - /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back @@ -296,6 +295,31 @@ impl StorageController { .map_err(anyhow::Error::new) } + /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres + async fn pg_ctl(&self, args: I) -> ExitStatus + where + I: IntoIterator, + S: AsRef, + { + let pg_bin_dir = self.get_pg_bin_dir().await.unwrap(); + let bin_path = pg_bin_dir.join("pg_ctl"); + + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + + Command::new(bin_path) + .args(args) + .envs(envs) + .spawn() + .expect("Failed to spawn pg_ctl, binary_missing?") + .wait() + .await + .expect("Failed to wait for pg_ctl termination") + } + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); if let Err(err) = tokio::fs::create_dir(&instance_dir).await { @@ -404,20 +428,34 @@ impl StorageController { db_start_args ); - background_process::start_process( - "storage_controller_db", - &self.env.base_data_dir, - pg_bin_dir.join("pg_ctl").as_std_path(), - db_start_args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], - background_process::InitialPidFile::Create(self.postgres_pid_file()), - &start_args.start_timeout, - || self.pg_isready(&pg_bin_dir, postgres_port), - ) - .await?; + let db_start_status = self.pg_ctl(db_start_args).await; + let start_timeout: Duration = start_args.start_timeout.into(); + let db_start_deadline = Instant::now() + start_timeout; + if !db_start_status.success() { + return Err(anyhow::anyhow!( + "Failed to start postgres {}", + db_start_status.code().unwrap() + )); + } + + loop { + if Instant::now() > db_start_deadline { + return Err(anyhow::anyhow!("Timed out waiting for postgres to start")); + } + + match self.pg_isready(&pg_bin_dir, postgres_port).await { + Ok(true) => { + tracing::info!("storage controller postgres is now ready"); + break; + } + Ok(false) => { + tokio::time::sleep(Duration::from_millis(100)).await; + } + Err(e) => { + tracing::warn!("Failed to check postgres status: {e}") + } + } + } self.setup_database(postgres_port).await?; } @@ -583,15 +621,10 @@ impl StorageController { } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; println!("Stopping storage controller database..."); let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; - let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_stop_args) - .spawn()? - .wait() - .await?; + let stop_status = self.pg_ctl(pg_stop_args).await; if !stop_status.success() { match self.is_postgres_running().await { Ok(false) => { @@ -612,14 +645,9 @@ impl StorageController { async fn is_postgres_running(&self) -> anyhow::Result { let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; - let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_status_args) - .spawn()? - .wait() - .await?; + let status_exitcode = self.pg_ctl(pg_status_args).await; // pg_ctl status returns this exit code if postgres is not running: in this case it is // fine that stop failed. Otherwise it is an error that stop failed.