From aac913f9dc1dde105d2b53f58c2379ffe2faee91 Mon Sep 17 00:00:00 2001 From: Eric Seppanen Date: Mon, 3 May 2021 20:43:45 -0700 Subject: [PATCH] use nix kill instead of spawning a process Since we are now calling the syscall directly, read_pidfile can now parse an integer. We also verify the pid is >= 1, because calling kill on 0 or negative values goes straight to crazytown. --- Cargo.lock | 14 ++++++ control_plane/Cargo.toml | 3 +- control_plane/src/storage.rs | 83 +++++++++++++++--------------------- 3 files changed, 51 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2bac054279..ff93a9aa2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -404,6 +404,7 @@ dependencies = [ "fs_extra", "hex", "lazy_static", + "nix", "pageserver", "postgres", "postgres_ffi", @@ -411,6 +412,7 @@ dependencies = [ "regex", "serde", "tar", + "thiserror", "tokio-postgres", "toml", "walkeeper", @@ -1163,6 +1165,18 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nix" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa9b4819da1bc61c0ea48b63b7bc8604064dd43013e7cc325df098d49cd7c18a" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", +] + [[package]] name = "nom" version = "5.1.2" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index f408763b01..71fdf918d0 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -11,7 +11,6 @@ rand = "0.8.3" tar = "0.4.33" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } - serde = { version = "1.0", features = ["derive"] } toml = "0.5" lazy_static = "1.4" @@ -20,6 +19,8 @@ anyhow = "1.0" hex = "0.4.3" bytes = "1.0.1" fs_extra = "1.2.0" +nix = "0.20" +thiserror = "1" pageserver = { path = "../pageserver" } walkeeper = { path = "../walkeeper" } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 09a941603d..0af74ace0d 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,8 +1,9 @@ -use anyhow::Result; +use anyhow::{anyhow, bail, Context, Result}; +use nix::sys::signal::{kill, Signal}; +use nix::unistd::Pid; +use std::convert::TryInto; use std::fs; -use std::io; -use std::net::SocketAddr; -use std::net::TcpStream; +use std::net::{SocketAddr, TcpStream}; use std::path::{Path, PathBuf}; use std::process::Command; use std::str::FromStr; @@ -171,7 +172,7 @@ impl PageServerNode { .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); if !cmd.status()?.success() { - anyhow::bail!( + bail!( "Pageserver failed to start. See '{}' for details.", self.repo_path().join("pageserver.log").display() ); @@ -192,20 +193,13 @@ impl PageServerNode { } pub fn stop(&self) -> Result<()> { - let pidfile = self.pid_file(); - let pid = read_pidfile(&pidfile)?; - - let status = Command::new("kill") - .arg(&pid) - .env_clear() - .status() - .expect("failed to execute kill"); - - if !status.success() { - anyhow::bail!("Failed to kill pageserver with pid {}", pid); + let pid = read_pidfile(&self.pid_file())?; + let pid = Pid::from_raw(pid); + if kill(pid, Signal::SIGTERM).is_err() { + bail!("Failed to kill pageserver with pid {}", pid); } - // await for pageserver stop + // wait for pageserver stop for _ in 0..5 { let stream = TcpStream::connect(self.address()); if let Err(_e) = stream { @@ -215,12 +209,7 @@ impl PageServerNode { thread::sleep(Duration::from_secs(1)); } - // ok, we failed to stop pageserver, let's panic - if !status.success() { - anyhow::bail!("Failed to stop pageserver with pid {}", pid); - } else { - Ok(()) - } + bail!("Failed to stop pageserver with pid {}", pid); } pub fn page_server_psql(&self, sql: &str) -> Vec { @@ -304,24 +293,22 @@ impl WalAcceptorNode { } } - pub fn stop(&self) -> std::result::Result<(), io::Error> { + pub fn stop(&self) -> Result<()> { println!("Stopping wal acceptor on {}", self.listen); let pidfile = self.data_dir.join("wal_acceptor.pid"); let pid = read_pidfile(&pidfile)?; - // Ignores any failures when running this command - let _status = Command::new("kill") - .arg(pid) - .env_clear() - .status() - .expect("failed to execute kill"); - + let pid = Pid::from_raw(pid); + if kill(pid, Signal::SIGTERM).is_err() { + bail!("Failed to kill wal_acceptor with pid {}", pid); + } Ok(()) } } impl Drop for WalAcceptorNode { fn drop(&mut self) { - self.stop().unwrap(); + // Ignore errors. + let _ = self.stop(); } } @@ -333,15 +320,10 @@ pub struct WalProposerNode { impl WalProposerNode { pub fn stop(&self) { - let status = Command::new("kill") - .arg(self.pid.to_string()) - .env_clear() - .status() - .expect("failed to execute kill"); - - if !status.success() { - panic!("kill start failed"); - } + // std::process::Child::id() returns u32, we need i32. + let pid: i32 = self.pid.try_into().unwrap(); + let pid = Pid::from_raw(pid); + kill(pid, Signal::SIGTERM).expect("failed to execute kill"); } } @@ -353,11 +335,16 @@ impl Drop for WalProposerNode { /// Read a PID file /// -/// This should contain an unsigned integer, but we return it as a String -/// because our callers only want to pass it back into a subcommand. -fn read_pidfile(pidfile: &Path) -> std::result::Result { - fs::read_to_string(pidfile).map_err(|err| { - eprintln!("failed to read pidfile {:?}: {:?}", pidfile, err); - err - }) +/// We expect a file that contains a single integer. +/// We return an i32 for compatibility with libc and nix. +fn read_pidfile(pidfile: &Path) -> Result { + let pid_str = fs::read_to_string(pidfile) + .with_context(|| format!("failed to read pidfile {:?}", pidfile))?; + let pid: i32 = pid_str + .parse() + .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?; + if pid < 1 { + bail!("pidfile {:?} contained bad value '{}'", pidfile, pid); + } + Ok(pid) }