mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 03:52:56 +00:00
Don't remove PID file in neon_local, and wait after "pageserver init". (#2983)
Our shutdown procedure for "pageserver init" was buggy. Firstly, it merely sent the process a SIGKILL, but did not wait for it to actually exit. Normally, it should exit quickly as SIGKILL cannot be caught or ignored by the target process, but it's still asynchronous and the process can still be alive when the kill(2) call returns. Secondly, "neon_local" removed the PID file after sending SIGKILL, even though the process was still running. That hid the first problem: if we didn't remove the PID file, and you start a new pageserver process while the old one is still running, you would get an error when the new process tries to lock the PID file. We've been seeing a lot of "Cannot assign requested address" failures in the CI lately. Our theory is that when we run "pageserver init" immediately followed by "pageserver start", the first process is still running and listening on the port when the second invocation starts up. This commit hopefully fixes the problem. It is generally a bad idea for the "neon_local" to remove the PID file on the child process's behalf. The correct way would be for the server process to remove the PID file, after it has fully shutdown everything else. We don't currently have a robust way to ensure that everything has truly shut down and closed, however. A simpler way is to simply never remove the PID file. It's not necessary to remove the PID file for correctness: we cannot rely on the cleanup to happen anyway, if the server process crashes for example. Because of that, we already have all the logic in place to deal with a stale PID file that belonged to a process that already exited. Let's rely on that on normal shutdown too.
This commit is contained in:
committed by
GitHub
parent
d9ab42013f
commit
faf1d20e6a
@@ -147,6 +147,22 @@ where
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
/// Send SIGTERM to child process
|
||||
pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
|
||||
let pid = child.id();
|
||||
match kill(
|
||||
nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
|
||||
Signal::SIGTERM,
|
||||
) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!("child process with pid {pid} does not exist");
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
|
||||
if !pid_file.exists() {
|
||||
@@ -179,11 +195,6 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
if let Err(e) = fs::remove_file(pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self, File};
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
@@ -168,29 +168,21 @@ impl PageServerNode {
|
||||
}
|
||||
Err(e) => eprintln!("{e:#}"),
|
||||
}
|
||||
match pageserver_process.kill() {
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
"Failed to stop pageserver {} process with pid {}: {e:#}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
)
|
||||
}
|
||||
Ok(()) => {
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
|
||||
let pid_file = self.pid_file();
|
||||
if let Err(e) = fs::remove_file(&pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
background_process::send_stop_child_process(&pageserver_process)?;
|
||||
|
||||
let exit_code = pageserver_process.wait()?;
|
||||
ensure!(
|
||||
exit_code.success(),
|
||||
format!(
|
||||
"pageserver init failed with exit code {:?}",
|
||||
exit_code.code()
|
||||
)
|
||||
);
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
init_result
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user