improve pidfile handling

This patch centralize the logic of creating & reading pid files into the new pid_file module and improves upon / makes explicit a few race conditions that existed with the previous code. Starting Processes / Creating Pidfiles ====================================== Before this patch, we had three places that had very similar-looking match lock_file::create_lock_file { ... } blocks. After this change, they can use a straight-forward call provided by the pid_file: pid_file::claim_pid_file_for_pid() Stopping Processes / Reading Pidfiles ===================================== The new pid_file module provides a function to read a pidfile, called read_pidfile(), that returns a pub enum PidFileRead { NotExist, NotHeldByAnyProcess(PidFileGuard), LockedByOtherProcess(Pid), } If we get back NotExist, there is nothing to kill. If we get back NotHeldByAnyProcess, the pid file is stale and we must ignore its contents. If it's LockedByOtherProcess, it's either another pidfile reader or, more likely, the daemon that is still running. In this case, we can read the pid in the pidfile and kill it. There's still a small window where this is racy, but it's not a regression compared to what we have before. The NotHeldByAnyProcess is an improvement over what we had before this patch. Before, we would blindly read the pidfile contents and kill, even if no other process held the flock. If the pidfile was stale (NotHeldByAnyProcess), then that kill would either result in ESRCH or hit some other unrelated process on the system. This patch avoids the latter cacse by grabbing an exclusive flock before reading the pidfile, and returning the flock to the caller in the form of a guard object, to avoid concurrent reads / kills. It's hopefully irrelevant in practice, but it's a little robustness that we get for free here. Maintain flock on Pidfile of ETCD / any InitialPidFile::Create() ================================================================ Pageserver and safekeeper create their pidfiles themselves. But for etcd, neon_local creates the pidfile (InitialPidFile::Create()). Before this change, we would unlock the etcd pidfile as soon as `neon_local start` exits, simply because no-one else kept the FD open. During `neon_local stop`, that results in a stale pid file, aka, NotHeldByAnyProcess, and it would henceforth not trust that the PID stored in the file is still valid. With this patch, we make the etcd process inherit the pidfile FD, thereby keeping the flock held until it exits.
2026-01-07 05:22:56 +00:00 · 2022-12-01 07:05:20 -05:00
parent 6dfd7cb1d0
commit ac0c167a85
7 changed files with 400 additions and 164 deletions
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -14,17 +14,19 @@

 use std::ffi::OsStr;
 use std::io::Write;
-use std::path::Path;
+use std::os::unix::prelude::AsRawFd;
+use std::os::unix::process::CommandExt;
+use std::path::{Path, PathBuf};
 use std::process::{Child, Command};
 use std::time::Duration;
 use std::{fs, io, thread};

-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::Context;
 use nix::errno::Errno;
+use nix::fcntl::{FcntlArg, FdFlag};
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
-
-use utils::lock_file;
+use utils::pid_file::{self, PidFileRead};

 // These constants control the loop used to poll for process start / stop.
 //
@@ -86,6 +88,14 @@ where
    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

+    let pid_file_to_check = match initial_pid_file {
+        InitialPidFile::Create(path) => {
+            pre_exec_create_pidfile(filled_cmd, path);
+            path
+        }
+        InitialPidFile::Expect(path) => path,
+    };
+
    let mut spawned_process = filled_cmd.spawn().with_context(|| {
        format!("Could not spawn {process_name}, see console output and log files for details.")
    })?;
@@ -95,29 +105,8 @@ where
            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
    );

-    let pid_file_to_check = match initial_pid_file {
-        InitialPidFile::Create(target_pid_file_path) => {
-            match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
-                lock_file::LockCreationResult::Created { .. } => {
-                    // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
-                    // as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
-                }
-                lock_file::LockCreationResult::AlreadyLocked { .. } => {
-                    anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
-                }
-                lock_file::LockCreationResult::CreationFailed(e) => {
-                    return Err(e.context(format!(
-                    "Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
-                )))
-                }
-            }
-            None
-        }
-        InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
-    };
-
    for retries in 0..RETRIES {
-        match process_started(pid, pid_file_to_check, &process_status_check) {
+        match process_started(pid, Some(pid_file_to_check), &process_status_check) {
            Ok(true) => {
                println!("\n{process_name} started, pid: {pid}");
                return Ok(spawned_process);
@@ -165,12 +154,27 @@ pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
 pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
-    if !pid_file.exists() {
-        println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
-        return Ok(());
-    }
-    let pid = read_pidfile(pid_file)?;
+    let pid = match pid_file::read(pid_file)
+        .with_context(|| format!("read pid_file {pid_file:?}"))?
+    {
+        PidFileRead::NotExist => {
+            println!("{process_name} is already stopped: no pid file present at {pid_file:?}");
+            return Ok(());
+        }
+        PidFileRead::NotHeldByAnyProcess(_) => {
+            // Don't try to kill according to file contents beacuse the pid might have been re-used by another process.
+            // Don't delete the file either, it can race with new pid file creation.
+            // Read `pid_file` module comment for details.
+            println!(
+                "No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}"
+            );
+            return Ok(());
+        }
+        PidFileRead::LockedByOtherProcess(pid) => pid,
+    };
+    // XXX the pid could become invalid (and recycled) at any time before the kill() below.

+    // send signal
    let sig = if immediate {
        print!("Stopping {process_name} with pid {pid} immediately..");
        Signal::SIGQUIT
@@ -182,8 +186,9 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
    match kill(pid, sig) {
        Ok(()) => (),
        Err(Errno::ESRCH) => {
+            // Again, don't delete the pid file. The unlink can race with a new pid file being created.
            println!(
-                "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
+                "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone."
            );
            return Ok(());
        }
@@ -252,6 +257,69 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    cmd
 }

+/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
+/// 1. Claims a pidfile with a fcntl lock on it and
+/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
+///    will remain held until the cmd exits.
+fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
+where
+    P: Into<PathBuf>,
+{
+    let path: PathBuf = path.into();
+    // SAFETY
+    // pre_exec is marked unsafe because it runs between fork and exec.
+    // Why is that dangerous in various ways?
+    // Long answer:  https://github.com/rust-lang/rust/issues/39575
+    // Short answer: in a multi-threaded program, other threads may have
+    // been inside of critical sections at the time of fork. In the
+    // original process, that was allright, assuming they protected
+    // the critical sections appropriately, e.g., through locks.
+    // Fork adds another process to the mix that
+    //   1. Has a single thread T
+    //   2. In an exact copy of the address space at the time of fork.
+    // A variety of problems scan occur now:
+    //   1. T tries to grab a lock that was locked at the time of fork.
+    //      It will wait forever since in its address space, the lock
+    //      is in state 'taken' but the thread that would unlock it is
+    //      not there.
+    //   2. A rust object that represented some external resource in the
+    //      parent now got implicitly copied by the the fork, even though
+    //      the object's type is not `Copy`. The parent program may use
+    //      non-copyability as way to enforce unique ownership of an
+    //      external resource in the typesystem. The fork breaks that
+    //      assumption, as now both parent and child process have an
+    //      owned instance of the object that represents the same
+    //      underlying resource.
+    // While these seem like niche problems, (1) in particular is
+    // highly relevant. For example, `malloc()` may grab a mutex internally,
+    // and so, if we forked while another thread was mallocing' and our
+    // pre_exec closure allocates as well, it will block on the malloc
+    // mutex forever
+    //
+    // The proper solution is to only use C library functions that are marked
+    // "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html
+    //
+    // With this specific pre_exec() closure, the non-error path doesn't allocate.
+    // The error path uses `anyhow`, and hence does allocate.
+    // We take our chances there, hoping that any potential disaster is constrained
+    // to the child process (e.g., malloc has no state ourside of the child process).
+    // Last, `expect` prints to stderr, and stdio is not async-signal-safe.
+    // Again, we take our chances, making the same assumptions as for malloc.
+    unsafe {
+        cmd.pre_exec(move || {
+            let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
+            // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
+            // remains locked after exec.
+            nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
+                .expect("remove FD_CLOEXEC");
+            // Don't run drop(file), it would close the file before we actually exec.
+            std::mem::forget(file);
+            Ok(())
+        });
+    }
+    cmd
+}
+
 fn process_started<F>(
    pid: Pid,
    pid_file_to_check: Option<&Path>,
@@ -262,14 +330,11 @@ where
 {
    match status_check() {
        Ok(true) => match pid_file_to_check {
-            Some(pid_file_path) => {
-                if pid_file_path.exists() {
-                    let pid_in_file = read_pidfile(pid_file_path)?;
-                    Ok(pid_in_file == pid)
-                } else {
-                    Ok(false)
-                }
-            }
+            Some(pid_file_path) => match pid_file::read(pid_file_path)? {
+                PidFileRead::NotExist => Ok(false),
+                PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
+                PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
+            },
            None => Ok(true),
        },
        Ok(false) => Ok(false),
@@ -277,21 +342,6 @@ where
    }
 }

-/// Read a PID file
-///
-/// We expect a file that contains a single integer.
-fn read_pidfile(pidfile: &Path) -> Result<Pid> {
-    let pid_str = fs::read_to_string(pidfile)
-        .with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
-    let pid: i32 = pid_str
-        .parse()
-        .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
-    if pid < 1 {
-        bail!("pidfile {pidfile:?} contained bad value '{pid}'");
-    }
-    Ok(Pid::from_raw(pid))
-}
-
 fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
    match kill(pid, None) {
        // Process exists, keep waiting
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -324,7 +324,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
            pg_version,
        )
        .unwrap_or_else(|e| {
-            eprintln!("pageserver init failed: {e}");
+            eprintln!("pageserver init failed: {e:?}");
            exit(1);
        });

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -34,6 +34,7 @@ pub mod sock_split;
 pub mod logging;

 pub mod lock_file;
+pub mod pid_file;

 // Misc
 pub mod accum;
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,81 +1,133 @@
-//! A module to create and read lock files. A lock file ensures that only one
-//! process is running at a time, in a particular directory.
+//! A module to create and read lock files.
 //!
-//! File locking is done using [`fcntl::flock`], which means that holding the
-//! lock on file only prevents acquiring another lock on it; all other
-//! operations are still possible on files. Other process can still open, read,
-//! write, or remove the file, for example.
-//! If the file is removed while a process is holding a lock on it,
-//! the process that holds the lock does not get any error or notification.
-//! Furthermore, you can create a new file with the same name and lock the new file,
-//! while the old process is still running.
-//! Deleting the lock file while the locking process is still running is a bad idea!
+//! File locking is done using [`fcntl::flock`] exclusive locks.
+//! The only consumer of this module is currently [`pid_file`].
+//! See the module-level comment there for potential pitfalls
+//! with lock files that are used to store PIDs (pidfiles).

-use std::{fs, os::unix::prelude::AsRawFd, path::Path};
+use std::{
+    fs,
+    io::{Read, Write},
+    ops::Deref,
+    os::unix::prelude::AsRawFd,
+    path::{Path, PathBuf},
+};

 use anyhow::Context;
-use nix::fcntl;
+use nix::{errno::Errno::EAGAIN, fcntl};

 use crate::crashsafe;

-pub enum LockCreationResult {
-    Created {
-        new_lock_contents: String,
-        file: fs::File,
-    },
-    AlreadyLocked {
-        existing_lock_contents: String,
-    },
-    CreationFailed(anyhow::Error),
+/// A handle to an open and unlocked, but not-yet-written lock file.
+/// Returned by [`create_exclusive`].
+#[must_use]
+pub struct UnwrittenLockFile {
+    path: PathBuf,
+    file: fs::File,
 }

-/// Creates a lock file in the path given and writes the given contents into the file.
-/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
-pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
-    let lock_file = match fs::OpenOptions::new()
+/// Returned by [`UnwrittenLockFile::write_content`].
+#[must_use]
+pub struct LockFileGuard(fs::File);
+
+impl Deref for LockFileGuard {
+    type Target = fs::File;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl UnwrittenLockFile {
+    /// Replace the content of this lock file with the byte representation of `contents`.
+    pub fn write_content(mut self, contents: String) -> anyhow::Result<LockFileGuard> {
+        self.file
+            .set_len(0)
+            .context("Failed to truncate lockfile")?;
+        self.file
+            .write_all(contents.as_bytes())
+            .with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
+        crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
+        Ok(LockFileGuard(self.file))
+    }
+}
+
+/// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns
+/// a handle that allows overwriting the locked file's content.
+///
+/// The exclusive lock is released when dropping the returned handle.
+///
+/// It is not an error if the file already exists.
+/// It is an error if the file is already locked.
+pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFile> {
+    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
        .write(true)
        .open(lock_file_path)
-        .context("Failed to open lock file")
-    {
-        Ok(file) => file,
-        Err(e) => return LockCreationResult::CreationFailed(e),
-    };
+        .context("open lock file")?;

-    match fcntl::flock(
+    let res = fcntl::flock(
        lock_file.as_raw_fd(),
        fcntl::FlockArg::LockExclusiveNonblock,
-    ) {
-        Ok(()) => {
-            match lock_file
-                .set_len(0)
-                .context("Failed to truncate lockfile")
-                .and_then(|()| {
-                    fs::write(lock_file_path, &contents).with_context(|| {
-                        format!("Failed to write '{contents}' contents into lockfile")
-                    })
-                })
-                .and_then(|()| {
-                    crashsafe::fsync_file_and_parent(lock_file_path)
-                        .context("Failed to fsync lockfile")
-                }) {
-                Ok(()) => LockCreationResult::Created {
-                    new_lock_contents: contents,
-                    file: lock_file,
-                },
-                Err(e) => LockCreationResult::CreationFailed(e),
-            }
-        }
-        Err(nix::errno::Errno::EAGAIN) => {
-            match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
-                Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
-                    existing_lock_contents,
-                },
-                Err(e) => LockCreationResult::CreationFailed(e),
-            }
-        }
-        Err(e) => {
-            LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
-        }
+    );
+    match res {
+        Ok(()) => Ok(UnwrittenLockFile {
+            path: lock_file_path.to_owned(),
+            file: lock_file,
+        }),
+        Err(EAGAIN) => anyhow::bail!("file is already locked"),
+        Err(e) => Err(e).context("flock error"),
+    }
+}
+
+/// Returned by [`read_and_hold_lock_file`].
+/// Check out the [`pid_file`] module for what the variants mean
+/// and potential caveats if the lock files that are used to store PIDs.
+pub enum LockFileRead {
+    /// No file exists at the given path.
+    NotExist,
+    /// No other process held the lock file, so we grabbed an flock
+    /// on it and read its contents.
+    /// Release the flock by dropping the [`LockFileGuard`].
+    NotHeldByAnyProcess(LockFileGuard, String),
+    /// The file exists but another process was holding an flock on it.
+    LockedByOtherProcess {
+        not_locked_file: fs::File,
+        content: String,
+    },
+}
+
+/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
+/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
+/// Check the [`LockFileRead`] variants for details.
+pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result<LockFileRead> {
+    let res = fs::OpenOptions::new().read(true).open(path);
+    let mut lock_file = match res {
+        Ok(f) => f,
+        Err(e) => match e.kind() {
+            std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
+            _ => return Err(e).context("open lock file"),
+        },
+    };
+    let res = fcntl::flock(
+        lock_file.as_raw_fd(),
+        fcntl::FlockArg::LockExclusiveNonblock,
+    );
+    // We need the content regardless of lock success / failure.
+    // But, read it after flock so that, if it succeeded, the content is consistent.
+    let mut content = String::new();
+    lock_file
+        .read_to_string(&mut content)
+        .context("read lock file")?;
+    match res {
+        Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
+            LockFileGuard(lock_file),
+            content,
+        )),
+        Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
+            not_locked_file: lock_file,
+            content,
+        }),
+        Err(e) => Err(e).context("flock error"),
    }
 }
--- a/libs/utils/src/pid_file.rs
+++ b/libs/utils/src/pid_file.rs
@@ -0,0 +1,165 @@
+//! Abstraction to create & read pidfiles.
+//!
+//! A pidfile is a file in the filesystem that stores a process's PID.
+//! Its purpose is to implement a singleton behavior where only
+//! one process of some "kind" is supposed to be running at a given time.
+//! The "kind" is identified by the pidfile.
+//!
+//! During process startup, the process that is supposed to be a singleton
+//! must [claim][`claim_for_current_process`] the pidfile first.
+//! If that is unsuccessful, the process must not act as the singleton, i.e.,
+//! it must not access any of the resources that only the singleton may access.
+//!
+//! A common need is to signal a running singleton process, e.g., to make
+//! it shut down and exit.
+//! For that, we have to [`read`] the pidfile. The result of the `read` operation
+//! tells us if there is any singleton process, and if so, what PID it has.
+//! We can then proceed to signal it, although some caveats still apply.
+//! Read the function-level documentation of [`read`] for that.
+//!
+//! ## Never Remove Pidfiles
+//!
+//! It would be natural to assume that the process who claimed the pidfile
+//! should remove it upon exit to avoid leaving a stale pidfile in place.
+//! However, we already have a reliable way to detect staleness of the pidfile,
+//! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
+//!
+//! And further, removing pidfiles would introduce a **catastrophic race condition**
+//! where two processes are running that are supposed to be singletons.
+//! Suppose we were to remove our pidfile during process shutdown.
+//! Here is how the race plays out:
+//! - Suppose we have a service called `myservice` with pidfile `myservice.pidfile`.
+//! - Process `A` starts to shut down.
+//! - Process `B` is just starting up
+//!     - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file
+//!     - It blocks on `flock`
+//! - Process `A` removes the pidfile as the last step of its shutdown procedure
+//!     - `unlink("myservice.pid")
+//! - Process `A` exits
+//!     - This releases its `flock` and unblocks `B`
+//! - Process `B` still has the file descriptor for `myservice.pid` open
+//! - Process `B` writes its PID into `myservice.pid`.
+//! - But the `myservice.pid` file has been unlinked, so, there is `myservice.pid`
+//!   in the directory.
+//! - Process `C` starts
+//!     - It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode)
+//!     - It `flock`s the file, which, since it's a different file, does not block
+//!     - It writes its PID into the file
+//!
+//! At this point, `B` and `C` are running, which is hazardous.
+//! Morale of the story: don't unlink pidfiles, ever.
+
+use std::{ops::Deref, path::Path};
+
+use anyhow::Context;
+use nix::unistd::Pid;
+
+use crate::lock_file::{self, LockFileRead};
+
+/// Keeps a claim on a pidfile alive until it is dropped.
+/// Returned by [`claim_for_current_process`].
+#[must_use]
+pub struct PidFileGuard(lock_file::LockFileGuard);
+
+impl Deref for PidFileGuard {
+    type Target = lock_file::LockFileGuard;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+/// Try to claim `path` as a pidfile for the current process.
+///
+/// If another process has already claimed the pidfile, and it is still running,
+/// this function returns ane error.
+/// Otherwise, the function `flock`s the file and updates its contents to the
+/// current process's PID.
+/// If the update fails, the flock is released and an error returned.
+/// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
+///
+/// ### Maintaining A Claim
+///
+/// It is the caller's responsibility to maintain the claim.
+/// The claim ends as soon as the returned guard object is dropped.
+/// To maintain the claim for the remaining lifetime of the current process,
+/// use [`std::mem::forget`] or similar.
+pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
+    let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
+    // if any of the next steps fail, we drop the file descriptor and thereby release the lock
+    let guard = unwritten_lock_file
+        .write_content(Pid::this().to_string())
+        .context("write pid to lock file")?;
+    Ok(PidFileGuard(guard))
+}
+
+/// Returned by [`read`].
+pub enum PidFileRead {
+    /// No file exists at the given path.
+    NotExist,
+    /// The given pidfile is currently not claimed by any process.
+    /// To determine this, the [`read`] operation acquired
+    /// an exclusive flock on the file. The lock is still held and responsibility
+    /// to release it is returned through the guard object.
+    /// Before releasing it, other [`claim_for_current_process`] or [`read`] calls
+    /// will fail.
+    ///
+    /// ### Caveats
+    ///
+    /// Do not unlink the pidfile from the filesystem. See module-comment for why.
+    NotHeldByAnyProcess(PidFileGuard),
+    /// The given pidfile is still claimed by another process whose PID is given
+    /// as part of this variant.
+    ///
+    /// ### Caveats
+    ///
+    /// 1. The other process might exit at any time, turning the given PID stale.
+    /// 2. There is a small window in which `claim_for_current_process` has already
+    ///    locked the file but not yet updates its contents. [`read`] will return
+    ///    this variant here, but with the old file contents, i.e., a stale PID.
+    ///
+    /// The kernel is free to recycle PID once it has been `wait(2)`ed upon by
+    /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
+    /// system call on it, bears the risk of killing an unrelated process.
+    /// This is an inherent limitation of using pidfiles.
+    /// The only race-free solution is to have a supervisor-process with a lifetime
+    /// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
+    LockedByOtherProcess(Pid),
+}
+
+/// Try to read the file at the given path as a pidfile that was previously created
+/// through [`claim_for_current_process`].
+///
+/// On success, this function returns a [`PidFileRead`].
+/// Check its docs for a description of the meaning of its different variants.
+pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
+    let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
+    let ret = match res {
+        LockFileRead::NotExist => PidFileRead::NotExist,
+        LockFileRead::NotHeldByAnyProcess(guard, _) => {
+            PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard))
+        }
+        LockFileRead::LockedByOtherProcess {
+            not_locked_file: _not_locked_file,
+            content,
+        } => {
+            // XXX the read races with the write in claim_pid_file_for_pid().
+            // But pids are smaller than a page, so the kernel page cache will lock for us.
+            // The only problem is that we might get the old contents here.
+            // Can only fix that by implementing some scheme that downgrades the
+            // exclusive lock to shared lock in claim_pid_file_for_pid().
+            PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?)
+        }
+    };
+    Ok(ret)
+}
+
+fn parse_pidfile_content(content: &str) -> anyhow::Result<Pid> {
+    let pid: i32 = content
+        .parse()
+        .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?;
+    if pid < 1 {
+        anyhow::bail!("bad value in pidfile '{pid}'");
+    }
+    Ok(Pid::from_raw(pid))
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -7,7 +7,6 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
 use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
-use nix::unistd::Pid;
 use tracing::*;

 use metrics::set_build_info_metric;
@@ -23,7 +22,7 @@ use pageserver::{
 use remote_storage::GenericRemoteStorage;
 use utils::{
    auth::JwtAuth,
-    lock_file, logging,
+    logging,
    postgres_backend::AuthType,
    project_git_version,
    sentry_init::{init_sentry, release_name},
@@ -220,28 +219,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
    }

    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
-    let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
-        lock_file::LockCreationResult::Created {
-            new_lock_contents,
-            file,
-        } => {
-            info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
-            file
-        }
-        lock_file::LockCreationResult::AlreadyLocked {
-            existing_lock_contents,
-        } => anyhow::bail!(
-            "Could not lock pid file; pageserver is already running in {:?} with PID {}",
-            conf.workdir,
-            existing_lock_contents
-        ),
-        lock_file::LockCreationResult::CreationFailed(e) => {
-            return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
-        }
-    };
+    let lock_file =
+        utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+    info!("Claimed pid file at {lock_file_path:?}");
+
    // ensure that the lock file is held even if the main thread of the process is panics
    // we need to release the lock file only when the current process is gone
-    let _ = Box::leak(Box::new(lock_file));
+    std::mem::forget(lock_file);

    // TODO: Check that it looks like a valid repository before going further

--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -4,7 +4,6 @@
 use anyhow::{bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, Command};
 use const_format::formatcp;
-use nix::unistd::Pid;
 use remote_storage::RemoteStorageConfig;
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
@@ -15,7 +14,7 @@ use tokio::sync::mpsc;
 use toml_edit::Document;
 use tracing::*;
 use url::{ParseError, Url};
-use utils::lock_file;
+use utils::pid_file;

 use metrics::set_build_info_metric;
 use safekeeper::broker;
@@ -147,28 +146,13 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo

    // Prevent running multiple safekeepers on the same directory
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
-    let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
-        lock_file::LockCreationResult::Created {
-            new_lock_contents,
-            file,
-        } => {
-            info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
-            file
-        }
-        lock_file::LockCreationResult::AlreadyLocked {
-            existing_lock_contents,
-        } => anyhow::bail!(
-            "Could not lock pid file; safekeeper is already running in {:?} with PID {}",
-            conf.workdir,
-            existing_lock_contents
-        ),
-        lock_file::LockCreationResult::CreationFailed(e) => {
-            return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
-        }
-    };
+    let lock_file =
+        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+    info!("Claimed pid file at {lock_file_path:?}");
+
    // ensure that the lock file is held even if the main thread of the process is panics
    // we need to release the lock file only when the current process is gone
-    let _ = Box::leak(Box::new(lock_file));
+    std::mem::forget(lock_file);

    // Set or read our ID.
    set_id(&mut conf, given_id)?;