From 150bddb9298e6b9bd3ac4a28d0d2eec029e53eea Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 16 Nov 2022 18:56:39 +0200 Subject: [PATCH] Clean up process start/stop handling * Poll more frequently when waiting for process start/stop. This speeds up startup and shutdown in tests. We did this already in commit 52ce1c9d53, which reduced the interval to 100 ms, but it was inadvertently increased back to 500 ms in commit d42700280f. Reduce it to 100 ms again, for both start and stop operations. * Harmonize the start and stop loops, printing the dots and notices the same way in both. I considered extracting the logic to a separate retry-function that takes a closure as argument that does the polling, but as long as we only have two copies, the code duplication isn't that bad. * Remove newline after "Starting pageserver" and "Starting etcd" messages, so that the progress-indicator dots that are printed once a second are printed on the same line. Before: Starting pageserver at '127.0.0.1:64000' in '.neon' ... pageserver started, pid: 2538937 After: Starting pageserver at '127.0.0.1:64000' in '.neon'... pageserver started, pid: 2538937 The "Starting safekeeper" message already got this right. * Update example output in README.md to match --- README.md | 23 ++++++------ control_plane/src/background_process.rs | 50 +++++++++++++++++-------- control_plane/src/etcd.rs | 2 +- control_plane/src/pageserver.rs | 2 +- 4 files changed, 47 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index e9c30668e0..01de24cdf6 100644 --- a/README.md +++ b/README.md @@ -125,24 +125,23 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > ./target/debug/neon_local init -Starting pageserver at '127.0.0.1:64000' in '.neon' - -Pageserver started -Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7 -Stopping pageserver gracefully...done! +Starting pageserver at '127.0.0.1:64000' in '.neon'. +pageserver started, pid: 2545906 +Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9 +Stopped pageserver 1 process with pid 2545906 # start pageserver and safekeeper > ./target/debug/neon_local start -Starting etcd broker using /usr/bin/etcd -Starting pageserver at '127.0.0.1:64000' in '.neon' - -Pageserver started -Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1' -Safekeeper started +Starting etcd broker using "/usr/bin/etcd" +etcd started, pid: 2545996 +Starting pageserver at '127.0.0.1:64000' in '.neon'. +pageserver started, pid: 2546005 +Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. +safekeeper 1 started, pid: 2546041 # start postgres compute node > ./target/debug/neon_local pg start main -Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... +Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ... Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 2f8098b7c9..c558c09854 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -26,8 +26,18 @@ use nix::unistd::Pid; use utils::lock_file; -const RETRIES: u32 = 15; -const RETRY_TIMEOUT_MILLIS: u64 = 500; +// These constants control the loop used to poll for process start / stop. +// +// The loop waits for at most 10 seconds, polling every 100 ms. +// Once a second, it prints a dot ("."), to give the user an indication that +// it's waiting. If the process hasn't started/stopped after 5 seconds, +// it prints a notice that it's taking long, but keeps waiting. +// +const RETRY_UNTIL_SECS: u64 = 10; +const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS; +const RETRY_INTERVAL_MILLIS: u64 = 100; +const DOT_EVERY_RETRIES: u64 = 10; +const NOTICE_AFTER_RETRIES: u64 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. @@ -107,16 +117,16 @@ where return Ok(spawned_process); } Ok(false) => { - if retries < 5 { + if retries == NOTICE_AFTER_RETRIES { + // The process is taking a long time to start up. Keep waiting, but + // print a message + print!("\n{process_name} has not started yet, continuing to wait"); + } + if retries % DOT_EVERY_RETRIES == 0 { print!("."); io::stdout().flush().unwrap(); - } else { - if retries == 5 { - println!() // put a line break after dots for second message - } - println!("{process_name} has not started yet, retrying ({retries})..."); } - thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS)); + thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); } Err(e) => { println!("{process_name} failed to start: {e:#}"); @@ -127,7 +137,8 @@ where } } } - anyhow::bail!("{process_name} could not start in {RETRIES} attempts"); + println!(); + anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds"); } /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. @@ -158,7 +169,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any } // Wait until process is gone - for _ in 0..RETRIES { + for retries in 0..RETRIES { match process_has_stopped(pid) { Ok(true) => { println!("\n{process_name} stopped"); @@ -170,9 +181,16 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any return Ok(()); } Ok(false) => { - print!("."); - io::stdout().flush().unwrap(); - thread::sleep(Duration::from_secs(1)) + if retries == NOTICE_AFTER_RETRIES { + // The process is taking a long time to start up. Keep waiting, but + // print a message + print!("\n{process_name} has not stopped yet, continuing to wait"); + } + if retries % DOT_EVERY_RETRIES == 0 { + print!("."); + io::stdout().flush().unwrap(); + } + thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); } Err(e) => { println!("{process_name} with pid {pid} failed to stop: {e:#}"); @@ -180,8 +198,8 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any } } } - - anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts"); + println!(); + anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds"); } fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index 4c15914e24..60aa5da780 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -6,7 +6,7 @@ use crate::{background_process, local_env}; pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let etcd_broker = &env.etcd_broker; - println!( + print!( "Starting etcd broker using {:?}", etcd_broker.etcd_binary_path ); diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 18d6aee68d..aec6f5bc2c 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -237,7 +237,7 @@ impl PageServerNode { datadir: &Path, update_config: bool, ) -> anyhow::Result { - println!( + print!( "Starting pageserver at '{}' in '{}'", self.pg_connection_config.raw_address(), datadir.display()