Clean up process start/stop handling

* Poll more frequently when waiting for process start/stop. This
  speeds up startup and shutdown in tests. We did this already in
  commit 52ce1c9d53, which reduced the interval to 100 ms, but it was
  inadvertently increased back to 500 ms in commit d42700280f. Reduce
  it to 100 ms again, for both start and stop operations.

* Harmonize the start and stop loops, printing the dots and notices
  the same way in both. I considered extracting the logic to a
  separate retry-function that takes a closure as argument that does
  the polling, but as long as we only have two copies, the code
  duplication isn't that bad.

* Remove newline after "Starting pageserver" and "Starting etcd"
  messages, so that the progress-indicator dots that are printed once
  a second are printed on the same line. Before:

    Starting pageserver at '127.0.0.1:64000' in '.neon'
    ...
    pageserver started, pid: 2538937

  After:

    Starting pageserver at '127.0.0.1:64000' in '.neon'...
    pageserver started, pid: 2538937

  The "Starting safekeeper" message already got this right.

* Update example output in README.md to match
This commit is contained in:
Heikki Linnakangas
2022-11-16 18:56:39 +02:00
committed by Heikki Linnakangas
parent 2b728bc69e
commit 150bddb929
4 changed files with 47 additions and 30 deletions

View File

@@ -26,8 +26,18 @@ use nix::unistd::Pid;
use utils::lock_file;
const RETRIES: u32 = 15;
const RETRY_TIMEOUT_MILLIS: u64 = 500;
// These constants control the loop used to poll for process start / stop.
//
// The loop waits for at most 10 seconds, polling every 100 ms.
// Once a second, it prints a dot ("."), to give the user an indication that
// it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting.
//
const RETRY_UNTIL_SECS: u64 = 10;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10;
const NOTICE_AFTER_RETRIES: u64 = 50;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
@@ -107,16 +117,16 @@ where
return Ok(spawned_process);
}
Ok(false) => {
if retries < 5 {
if retries == NOTICE_AFTER_RETRIES {
// The process is taking a long time to start up. Keep waiting, but
// print a message
print!("\n{process_name} has not started yet, continuing to wait");
}
if retries % DOT_EVERY_RETRIES == 0 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("{process_name} has not started yet, retrying ({retries})...");
}
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("{process_name} failed to start: {e:#}");
@@ -127,7 +137,8 @@ where
}
}
}
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
println!();
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -158,7 +169,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
// Wait until process is gone
for _ in 0..RETRIES {
for retries in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
@@ -170,9 +181,16 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
return Ok(());
}
Ok(false) => {
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1))
if retries == NOTICE_AFTER_RETRIES {
// The process is taking a long time to start up. Keep waiting, but
// print a message
print!("\n{process_name} has not stopped yet, continuing to wait");
}
if retries % DOT_EVERY_RETRIES == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -180,8 +198,8 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
}
}
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
println!();
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {

View File

@@ -6,7 +6,7 @@ use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
println!(
print!(
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);

View File

@@ -237,7 +237,7 @@ impl PageServerNode {
datadir: &Path,
update_config: bool,
) -> anyhow::Result<Child> {
println!(
print!(
"Starting pageserver at '{}' in '{}'",
self.pg_connection_config.raw_address(),
datadir.display()