diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index d21a939cb7..1a5ac1e2fe 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -14,17 +14,19 @@
use std::ffi::OsStr;
use std::io::Write;
-use std::path::Path;
+use std::os::unix::prelude::AsRawFd;
+use std::os::unix::process::CommandExt;
+use std::path::{Path, PathBuf};
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::Context;
use nix::errno::Errno;
+use nix::fcntl::{FcntlArg, FdFlag};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
-
-use utils::lock_file;
+use utils::pid_file::{self, PidFileRead};
// These constants control the loop used to poll for process start / stop.
//
@@ -86,6 +88,14 @@ where
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
+ let pid_file_to_check = match initial_pid_file {
+ InitialPidFile::Create(path) => {
+ pre_exec_create_pidfile(filled_cmd, path);
+ path
+ }
+ InitialPidFile::Expect(path) => path,
+ };
+
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
@@ -95,29 +105,8 @@ where
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
- let pid_file_to_check = match initial_pid_file {
- InitialPidFile::Create(target_pid_file_path) => {
- match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
- lock_file::LockCreationResult::Created { .. } => {
- // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
- // as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
- }
- lock_file::LockCreationResult::AlreadyLocked { .. } => {
- anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
- }
- lock_file::LockCreationResult::CreationFailed(e) => {
- return Err(e.context(format!(
- "Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
- )))
- }
- }
- None
- }
- InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
- };
-
for retries in 0..RETRIES {
- match process_started(pid, pid_file_to_check, &process_status_check) {
+ match process_started(pid, Some(pid_file_to_check), &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
@@ -165,12 +154,27 @@ pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
- if !pid_file.exists() {
- println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
- return Ok(());
- }
- let pid = read_pidfile(pid_file)?;
+ let pid = match pid_file::read(pid_file)
+ .with_context(|| format!("read pid_file {pid_file:?}"))?
+ {
+ PidFileRead::NotExist => {
+ println!("{process_name} is already stopped: no pid file present at {pid_file:?}");
+ return Ok(());
+ }
+ PidFileRead::NotHeldByAnyProcess(_) => {
+ // Don't try to kill according to file contents beacuse the pid might have been re-used by another process.
+ // Don't delete the file either, it can race with new pid file creation.
+ // Read `pid_file` module comment for details.
+ println!(
+ "No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}"
+ );
+ return Ok(());
+ }
+ PidFileRead::LockedByOtherProcess(pid) => pid,
+ };
+ // XXX the pid could become invalid (and recycled) at any time before the kill() below.
+ // send signal
let sig = if immediate {
print!("Stopping {process_name} with pid {pid} immediately..");
Signal::SIGQUIT
@@ -182,8 +186,9 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
match kill(pid, sig) {
Ok(()) => (),
Err(Errno::ESRCH) => {
+ // Again, don't delete the pid file. The unlink can race with a new pid file being created.
println!(
- "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
+ "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone."
);
return Ok(());
}
@@ -252,6 +257,69 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
+/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
+/// 1. Claims a pidfile with a fcntl lock on it and
+/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
+/// will remain held until the cmd exits.
+fn pre_exec_create_pidfile
(cmd: &mut Command, path: P) -> &mut Command
+where
+ P: Into,
+{
+ let path: PathBuf = path.into();
+ // SAFETY
+ // pre_exec is marked unsafe because it runs between fork and exec.
+ // Why is that dangerous in various ways?
+ // Long answer: https://github.com/rust-lang/rust/issues/39575
+ // Short answer: in a multi-threaded program, other threads may have
+ // been inside of critical sections at the time of fork. In the
+ // original process, that was allright, assuming they protected
+ // the critical sections appropriately, e.g., through locks.
+ // Fork adds another process to the mix that
+ // 1. Has a single thread T
+ // 2. In an exact copy of the address space at the time of fork.
+ // A variety of problems scan occur now:
+ // 1. T tries to grab a lock that was locked at the time of fork.
+ // It will wait forever since in its address space, the lock
+ // is in state 'taken' but the thread that would unlock it is
+ // not there.
+ // 2. A rust object that represented some external resource in the
+ // parent now got implicitly copied by the the fork, even though
+ // the object's type is not `Copy`. The parent program may use
+ // non-copyability as way to enforce unique ownership of an
+ // external resource in the typesystem. The fork breaks that
+ // assumption, as now both parent and child process have an
+ // owned instance of the object that represents the same
+ // underlying resource.
+ // While these seem like niche problems, (1) in particular is
+ // highly relevant. For example, `malloc()` may grab a mutex internally,
+ // and so, if we forked while another thread was mallocing' and our
+ // pre_exec closure allocates as well, it will block on the malloc
+ // mutex forever
+ //
+ // The proper solution is to only use C library functions that are marked
+ // "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html
+ //
+ // With this specific pre_exec() closure, the non-error path doesn't allocate.
+ // The error path uses `anyhow`, and hence does allocate.
+ // We take our chances there, hoping that any potential disaster is constrained
+ // to the child process (e.g., malloc has no state ourside of the child process).
+ // Last, `expect` prints to stderr, and stdio is not async-signal-safe.
+ // Again, we take our chances, making the same assumptions as for malloc.
+ unsafe {
+ cmd.pre_exec(move || {
+ let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
+ // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
+ // remains locked after exec.
+ nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
+ .expect("remove FD_CLOEXEC");
+ // Don't run drop(file), it would close the file before we actually exec.
+ std::mem::forget(file);
+ Ok(())
+ });
+ }
+ cmd
+}
+
fn process_started(
pid: Pid,
pid_file_to_check: Option<&Path>,
@@ -262,14 +330,11 @@ where
{
match status_check() {
Ok(true) => match pid_file_to_check {
- Some(pid_file_path) => {
- if pid_file_path.exists() {
- let pid_in_file = read_pidfile(pid_file_path)?;
- Ok(pid_in_file == pid)
- } else {
- Ok(false)
- }
- }
+ Some(pid_file_path) => match pid_file::read(pid_file_path)? {
+ PidFileRead::NotExist => Ok(false),
+ PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
+ PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
+ },
None => Ok(true),
},
Ok(false) => Ok(false),
@@ -277,21 +342,6 @@ where
}
}
-/// Read a PID file
-///
-/// We expect a file that contains a single integer.
-fn read_pidfile(pidfile: &Path) -> Result {
- let pid_str = fs::read_to_string(pidfile)
- .with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
- let pid: i32 = pid_str
- .parse()
- .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
- if pid < 1 {
- bail!("pidfile {pidfile:?} contained bad value '{pid}'");
- }
- Ok(Pid::from_raw(pid))
-}
-
fn process_has_stopped(pid: Pid) -> anyhow::Result {
match kill(pid, None) {
// Process exists, keep waiting
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 42a9199037..99ddae862d 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -324,7 +324,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result {
pg_version,
)
.unwrap_or_else(|e| {
- eprintln!("pageserver init failed: {e}");
+ eprintln!("pageserver init failed: {e:?}");
exit(1);
});
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index b93afb0a59..6d35fd9f7b 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -34,6 +34,7 @@ pub mod sock_split;
pub mod logging;
pub mod lock_file;
+pub mod pid_file;
// Misc
pub mod accum;
diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs
index 4fef65852b..adbf47eb7a 100644
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,81 +1,133 @@
-//! A module to create and read lock files. A lock file ensures that only one
-//! process is running at a time, in a particular directory.
+//! A module to create and read lock files.
//!
-//! File locking is done using [`fcntl::flock`], which means that holding the
-//! lock on file only prevents acquiring another lock on it; all other
-//! operations are still possible on files. Other process can still open, read,
-//! write, or remove the file, for example.
-//! If the file is removed while a process is holding a lock on it,
-//! the process that holds the lock does not get any error or notification.
-//! Furthermore, you can create a new file with the same name and lock the new file,
-//! while the old process is still running.
-//! Deleting the lock file while the locking process is still running is a bad idea!
+//! File locking is done using [`fcntl::flock`] exclusive locks.
+//! The only consumer of this module is currently [`pid_file`].
+//! See the module-level comment there for potential pitfalls
+//! with lock files that are used to store PIDs (pidfiles).
-use std::{fs, os::unix::prelude::AsRawFd, path::Path};
+use std::{
+ fs,
+ io::{Read, Write},
+ ops::Deref,
+ os::unix::prelude::AsRawFd,
+ path::{Path, PathBuf},
+};
use anyhow::Context;
-use nix::fcntl;
+use nix::{errno::Errno::EAGAIN, fcntl};
use crate::crashsafe;
-pub enum LockCreationResult {
- Created {
- new_lock_contents: String,
- file: fs::File,
- },
- AlreadyLocked {
- existing_lock_contents: String,
- },
- CreationFailed(anyhow::Error),
+/// A handle to an open and unlocked, but not-yet-written lock file.
+/// Returned by [`create_exclusive`].
+#[must_use]
+pub struct UnwrittenLockFile {
+ path: PathBuf,
+ file: fs::File,
}
-/// Creates a lock file in the path given and writes the given contents into the file.
-/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
-pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
- let lock_file = match fs::OpenOptions::new()
+/// Returned by [`UnwrittenLockFile::write_content`].
+#[must_use]
+pub struct LockFileGuard(fs::File);
+
+impl Deref for LockFileGuard {
+ type Target = fs::File;
+
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+impl UnwrittenLockFile {
+ /// Replace the content of this lock file with the byte representation of `contents`.
+ pub fn write_content(mut self, contents: String) -> anyhow::Result {
+ self.file
+ .set_len(0)
+ .context("Failed to truncate lockfile")?;
+ self.file
+ .write_all(contents.as_bytes())
+ .with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
+ crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
+ Ok(LockFileGuard(self.file))
+ }
+}
+
+/// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns
+/// a handle that allows overwriting the locked file's content.
+///
+/// The exclusive lock is released when dropping the returned handle.
+///
+/// It is not an error if the file already exists.
+/// It is an error if the file is already locked.
+pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result {
+ let lock_file = fs::OpenOptions::new()
.create(true) // O_CREAT
.write(true)
.open(lock_file_path)
- .context("Failed to open lock file")
- {
- Ok(file) => file,
- Err(e) => return LockCreationResult::CreationFailed(e),
- };
+ .context("open lock file")?;
- match fcntl::flock(
+ let res = fcntl::flock(
lock_file.as_raw_fd(),
fcntl::FlockArg::LockExclusiveNonblock,
- ) {
- Ok(()) => {
- match lock_file
- .set_len(0)
- .context("Failed to truncate lockfile")
- .and_then(|()| {
- fs::write(lock_file_path, &contents).with_context(|| {
- format!("Failed to write '{contents}' contents into lockfile")
- })
- })
- .and_then(|()| {
- crashsafe::fsync_file_and_parent(lock_file_path)
- .context("Failed to fsync lockfile")
- }) {
- Ok(()) => LockCreationResult::Created {
- new_lock_contents: contents,
- file: lock_file,
- },
- Err(e) => LockCreationResult::CreationFailed(e),
- }
- }
- Err(nix::errno::Errno::EAGAIN) => {
- match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
- Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
- existing_lock_contents,
- },
- Err(e) => LockCreationResult::CreationFailed(e),
- }
- }
- Err(e) => {
- LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
- }
+ );
+ match res {
+ Ok(()) => Ok(UnwrittenLockFile {
+ path: lock_file_path.to_owned(),
+ file: lock_file,
+ }),
+ Err(EAGAIN) => anyhow::bail!("file is already locked"),
+ Err(e) => Err(e).context("flock error"),
+ }
+}
+
+/// Returned by [`read_and_hold_lock_file`].
+/// Check out the [`pid_file`] module for what the variants mean
+/// and potential caveats if the lock files that are used to store PIDs.
+pub enum LockFileRead {
+ /// No file exists at the given path.
+ NotExist,
+ /// No other process held the lock file, so we grabbed an flock
+ /// on it and read its contents.
+ /// Release the flock by dropping the [`LockFileGuard`].
+ NotHeldByAnyProcess(LockFileGuard, String),
+ /// The file exists but another process was holding an flock on it.
+ LockedByOtherProcess {
+ not_locked_file: fs::File,
+ content: String,
+ },
+}
+
+/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
+/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
+/// Check the [`LockFileRead`] variants for details.
+pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result {
+ let res = fs::OpenOptions::new().read(true).open(path);
+ let mut lock_file = match res {
+ Ok(f) => f,
+ Err(e) => match e.kind() {
+ std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
+ _ => return Err(e).context("open lock file"),
+ },
+ };
+ let res = fcntl::flock(
+ lock_file.as_raw_fd(),
+ fcntl::FlockArg::LockExclusiveNonblock,
+ );
+ // We need the content regardless of lock success / failure.
+ // But, read it after flock so that, if it succeeded, the content is consistent.
+ let mut content = String::new();
+ lock_file
+ .read_to_string(&mut content)
+ .context("read lock file")?;
+ match res {
+ Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
+ LockFileGuard(lock_file),
+ content,
+ )),
+ Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
+ not_locked_file: lock_file,
+ content,
+ }),
+ Err(e) => Err(e).context("flock error"),
}
}
diff --git a/libs/utils/src/pid_file.rs b/libs/utils/src/pid_file.rs
new file mode 100644
index 0000000000..e634b08f2a
--- /dev/null
+++ b/libs/utils/src/pid_file.rs
@@ -0,0 +1,165 @@
+//! Abstraction to create & read pidfiles.
+//!
+//! A pidfile is a file in the filesystem that stores a process's PID.
+//! Its purpose is to implement a singleton behavior where only
+//! one process of some "kind" is supposed to be running at a given time.
+//! The "kind" is identified by the pidfile.
+//!
+//! During process startup, the process that is supposed to be a singleton
+//! must [claim][`claim_for_current_process`] the pidfile first.
+//! If that is unsuccessful, the process must not act as the singleton, i.e.,
+//! it must not access any of the resources that only the singleton may access.
+//!
+//! A common need is to signal a running singleton process, e.g., to make
+//! it shut down and exit.
+//! For that, we have to [`read`] the pidfile. The result of the `read` operation
+//! tells us if there is any singleton process, and if so, what PID it has.
+//! We can then proceed to signal it, although some caveats still apply.
+//! Read the function-level documentation of [`read`] for that.
+//!
+//! ## Never Remove Pidfiles
+//!
+//! It would be natural to assume that the process who claimed the pidfile
+//! should remove it upon exit to avoid leaving a stale pidfile in place.
+//! However, we already have a reliable way to detect staleness of the pidfile,
+//! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
+//!
+//! And further, removing pidfiles would introduce a **catastrophic race condition**
+//! where two processes are running that are supposed to be singletons.
+//! Suppose we were to remove our pidfile during process shutdown.
+//! Here is how the race plays out:
+//! - Suppose we have a service called `myservice` with pidfile `myservice.pidfile`.
+//! - Process `A` starts to shut down.
+//! - Process `B` is just starting up
+//! - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file
+//! - It blocks on `flock`
+//! - Process `A` removes the pidfile as the last step of its shutdown procedure
+//! - `unlink("myservice.pid")
+//! - Process `A` exits
+//! - This releases its `flock` and unblocks `B`
+//! - Process `B` still has the file descriptor for `myservice.pid` open
+//! - Process `B` writes its PID into `myservice.pid`.
+//! - But the `myservice.pid` file has been unlinked, so, there is `myservice.pid`
+//! in the directory.
+//! - Process `C` starts
+//! - It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode)
+//! - It `flock`s the file, which, since it's a different file, does not block
+//! - It writes its PID into the file
+//!
+//! At this point, `B` and `C` are running, which is hazardous.
+//! Morale of the story: don't unlink pidfiles, ever.
+
+use std::{ops::Deref, path::Path};
+
+use anyhow::Context;
+use nix::unistd::Pid;
+
+use crate::lock_file::{self, LockFileRead};
+
+/// Keeps a claim on a pidfile alive until it is dropped.
+/// Returned by [`claim_for_current_process`].
+#[must_use]
+pub struct PidFileGuard(lock_file::LockFileGuard);
+
+impl Deref for PidFileGuard {
+ type Target = lock_file::LockFileGuard;
+
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+/// Try to claim `path` as a pidfile for the current process.
+///
+/// If another process has already claimed the pidfile, and it is still running,
+/// this function returns ane error.
+/// Otherwise, the function `flock`s the file and updates its contents to the
+/// current process's PID.
+/// If the update fails, the flock is released and an error returned.
+/// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
+///
+/// ### Maintaining A Claim
+///
+/// It is the caller's responsibility to maintain the claim.
+/// The claim ends as soon as the returned guard object is dropped.
+/// To maintain the claim for the remaining lifetime of the current process,
+/// use [`std::mem::forget`] or similar.
+pub fn claim_for_current_process(path: &Path) -> anyhow::Result {
+ let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
+ // if any of the next steps fail, we drop the file descriptor and thereby release the lock
+ let guard = unwritten_lock_file
+ .write_content(Pid::this().to_string())
+ .context("write pid to lock file")?;
+ Ok(PidFileGuard(guard))
+}
+
+/// Returned by [`read`].
+pub enum PidFileRead {
+ /// No file exists at the given path.
+ NotExist,
+ /// The given pidfile is currently not claimed by any process.
+ /// To determine this, the [`read`] operation acquired
+ /// an exclusive flock on the file. The lock is still held and responsibility
+ /// to release it is returned through the guard object.
+ /// Before releasing it, other [`claim_for_current_process`] or [`read`] calls
+ /// will fail.
+ ///
+ /// ### Caveats
+ ///
+ /// Do not unlink the pidfile from the filesystem. See module-comment for why.
+ NotHeldByAnyProcess(PidFileGuard),
+ /// The given pidfile is still claimed by another process whose PID is given
+ /// as part of this variant.
+ ///
+ /// ### Caveats
+ ///
+ /// 1. The other process might exit at any time, turning the given PID stale.
+ /// 2. There is a small window in which `claim_for_current_process` has already
+ /// locked the file but not yet updates its contents. [`read`] will return
+ /// this variant here, but with the old file contents, i.e., a stale PID.
+ ///
+ /// The kernel is free to recycle PID once it has been `wait(2)`ed upon by
+ /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
+ /// system call on it, bears the risk of killing an unrelated process.
+ /// This is an inherent limitation of using pidfiles.
+ /// The only race-free solution is to have a supervisor-process with a lifetime
+ /// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
+ LockedByOtherProcess(Pid),
+}
+
+/// Try to read the file at the given path as a pidfile that was previously created
+/// through [`claim_for_current_process`].
+///
+/// On success, this function returns a [`PidFileRead`].
+/// Check its docs for a description of the meaning of its different variants.
+pub fn read(pidfile: &Path) -> anyhow::Result {
+ let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
+ let ret = match res {
+ LockFileRead::NotExist => PidFileRead::NotExist,
+ LockFileRead::NotHeldByAnyProcess(guard, _) => {
+ PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard))
+ }
+ LockFileRead::LockedByOtherProcess {
+ not_locked_file: _not_locked_file,
+ content,
+ } => {
+ // XXX the read races with the write in claim_pid_file_for_pid().
+ // But pids are smaller than a page, so the kernel page cache will lock for us.
+ // The only problem is that we might get the old contents here.
+ // Can only fix that by implementing some scheme that downgrades the
+ // exclusive lock to shared lock in claim_pid_file_for_pid().
+ PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?)
+ }
+ };
+ Ok(ret)
+}
+
+fn parse_pidfile_content(content: &str) -> anyhow::Result {
+ let pid: i32 = content
+ .parse()
+ .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?;
+ if pid < 1 {
+ anyhow::bail!("bad value in pidfile '{pid}'");
+ }
+ Ok(Pid::from_raw(pid))
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index f55fe0886a..3995229e03 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -7,7 +7,6 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
-use nix::unistd::Pid;
use tracing::*;
use metrics::set_build_info_metric;
@@ -23,7 +22,7 @@ use pageserver::{
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
- lock_file, logging,
+ logging,
postgres_backend::AuthType,
project_git_version,
sentry_init::{init_sentry, release_name},
@@ -220,28 +219,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
}
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
- let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
- lock_file::LockCreationResult::Created {
- new_lock_contents,
- file,
- } => {
- info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
- file
- }
- lock_file::LockCreationResult::AlreadyLocked {
- existing_lock_contents,
- } => anyhow::bail!(
- "Could not lock pid file; pageserver is already running in {:?} with PID {}",
- conf.workdir,
- existing_lock_contents
- ),
- lock_file::LockCreationResult::CreationFailed(e) => {
- return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
- }
- };
+ let lock_file =
+ utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+ info!("Claimed pid file at {lock_file_path:?}");
+
// ensure that the lock file is held even if the main thread of the process is panics
// we need to release the lock file only when the current process is gone
- let _ = Box::leak(Box::new(lock_file));
+ std::mem::forget(lock_file);
// TODO: Check that it looks like a valid repository before going further
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 8a2894b32d..fcd3065c65 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -4,7 +4,6 @@
use anyhow::{bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, Command};
use const_format::formatcp;
-use nix::unistd::Pid;
use remote_storage::RemoteStorageConfig;
use std::fs::{self, File};
use std::io::{ErrorKind, Write};
@@ -15,7 +14,7 @@ use tokio::sync::mpsc;
use toml_edit::Document;
use tracing::*;
use url::{ParseError, Url};
-use utils::lock_file;
+use utils::pid_file;
use metrics::set_build_info_metric;
use safekeeper::broker;
@@ -147,28 +146,13 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo
// Prevent running multiple safekeepers on the same directory
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
- let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
- lock_file::LockCreationResult::Created {
- new_lock_contents,
- file,
- } => {
- info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
- file
- }
- lock_file::LockCreationResult::AlreadyLocked {
- existing_lock_contents,
- } => anyhow::bail!(
- "Could not lock pid file; safekeeper is already running in {:?} with PID {}",
- conf.workdir,
- existing_lock_contents
- ),
- lock_file::LockCreationResult::CreationFailed(e) => {
- return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
- }
- };
+ let lock_file =
+ pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+ info!("Claimed pid file at {lock_file_path:?}");
+
// ensure that the lock file is held even if the main thread of the process is panics
// we need to release the lock file only when the current process is gone
- let _ = Box::leak(Box::new(lock_file));
+ std::mem::forget(lock_file);
// Set or read our ID.
set_id(&mut conf, given_id)?;