From 83069f6ca10de1bc66bfb71fe870811138adfb22 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 17 Jun 2025 15:56:05 -0700 Subject: [PATCH] fix: terminate pgbouncer on compute suspend (#12153) ## Problem PgBouncer does not terminate connections on a suspend: https://github.com/neondatabase/cloud/issues/16282 ## Summary of changes 1. Adds a pid file to store the pid of PgBouncer 2. Terminates connections on a compute suspend --------- Co-authored-by: Alexey Kondratov --- compute/etc/pgbouncer.ini | 2 + compute_tools/src/bin/compute_ctl.rs | 14 +++-- compute_tools/src/compute.rs | 60 ++++++++++++++++++- compute_tools/src/http/routes/terminate.rs | 2 +- compute_tools/src/lib.rs | 1 + compute_tools/src/pgbouncer.rs | 1 + control_plane/src/bin/neon_local.rs | 8 +++ control_plane/src/endpoint.rs | 5 ++ .../compute_wrapper/shell/compute.sh | 1 + test_runner/fixtures/neon_cli.py | 3 + 10 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 compute_tools/src/pgbouncer.rs diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index 9d68cbb8d5..fbcdfd4a87 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -21,6 +21,8 @@ unix_socket_dir=/tmp/ unix_socket_mode=0777 ; required for pgbouncer_exporter ignore_startup_parameters=extra_float_digits +; pidfile for graceful termination +pidfile=/tmp/pgbouncer.pid ;; Disable connection logging. It produces a lot of logs that no one looks at, ;; and we can get similar log entries from the proxy too. We had incidents in diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 8b502a058e..d7ff381f1b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -124,6 +124,10 @@ struct Cli { /// Interval in seconds for collecting installed extensions statistics #[arg(long, default_value = "3600")] pub installed_extensions_collection_interval: u64, + + /// Run in development mode, skipping VM-specific operations like process termination + #[arg(long, action = clap::ArgAction::SetTrue)] + pub dev: bool, } impl Cli { @@ -159,7 +163,7 @@ fn main() -> Result<()> { .build()?; let _rt_guard = runtime.enter(); - runtime.block_on(init())?; + runtime.block_on(init(cli.dev))?; // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -198,13 +202,13 @@ fn main() -> Result<()> { deinit_and_exit(exit_code); } -async fn init() -> Result<()> { +async fn init(dev_mode: bool) -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { - handle_exit_signal(sig); + handle_exit_signal(sig, dev_mode); } }); @@ -263,9 +267,9 @@ fn deinit_and_exit(exit_code: Option) -> ! { /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. -fn handle_exit_signal(sig: i32) { +fn handle_exit_signal(sig: i32, dev_mode: bool) { info!("received {sig} termination signal"); - forward_termination_signal(); + forward_termination_signal(dev_mode); exit(1); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 50c254224a..c591d9711a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -35,6 +35,7 @@ use url::Url; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; +use utils::pid_file; use crate::configurator::launch_configurator; use crate::disk_quota::set_disk_quota; @@ -44,6 +45,7 @@ use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::metrics::COMPUTE_CTL_UP; use crate::monitor::launch_monitor; use crate::pg_helpers::*; +use crate::pgbouncer::*; use crate::rsyslog::{ PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export, launch_pgaudit_gc, @@ -2246,12 +2248,68 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> { Ok(()) } -pub fn forward_termination_signal() { +pub fn forward_termination_signal(dev_mode: bool) { let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); if ss_pid != 0 { let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); kill(ss_pid, Signal::SIGTERM).ok(); } + + if !dev_mode { + info!("not in dev mode, terminating pgbouncer"); + + // Terminate pgbouncer with SIGKILL + match pid_file::read(PGBOUNCER_PIDFILE.into()) { + Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { + info!("sending SIGKILL to pgbouncer process pid: {}", pid); + if let Err(e) = kill(pid, Signal::SIGKILL) { + error!("failed to terminate pgbouncer: {}", e); + } + } + // pgbouncer does not lock the pid file, so we read and kill the process directly + Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { + if let Ok(pid_str) = std::fs::read_to_string(PGBOUNCER_PIDFILE) { + if let Ok(pid) = pid_str.trim().parse::() { + info!( + "sending SIGKILL to pgbouncer process pid: {} (from unlocked pid file)", + pid + ); + if let Err(e) = kill(Pid::from_raw(pid), Signal::SIGKILL) { + error!("failed to terminate pgbouncer: {}", e); + } + } + } else { + info!("pgbouncer pid file exists but process not running"); + } + } + Ok(pid_file::PidFileRead::NotExist) => { + info!("pgbouncer pid file not found, process may not be running"); + } + Err(e) => { + error!("error reading pgbouncer pid file: {}", e); + } + } + } + + // Terminate local_proxy + match pid_file::read("/etc/local_proxy/pid".into()) { + Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { + info!("sending SIGTERM to local_proxy process pid: {}", pid); + if let Err(e) = kill(pid, Signal::SIGTERM) { + error!("failed to terminate local_proxy: {}", e); + } + } + Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { + info!("local_proxy PID file exists but process not running"); + } + Ok(pid_file::PidFileRead::NotExist) => { + info!("local_proxy PID file not found, process may not be running"); + } + Err(e) => { + error!("error reading local_proxy PID file: {}", e); + } + } + let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 2c24d4ad6b..92a89c0ee7 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -26,7 +26,7 @@ pub(in crate::http) async fn terminate(State(compute): State>) drop(state); } - forward_termination_signal(); + forward_termination_signal(false); info!("sent signal and notified waiters"); // Spawn a blocking thread to wait for compute to become Terminated. diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 7218067a8a..3899a1ca76 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -22,6 +22,7 @@ mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; +pub mod pgbouncer; pub mod rsyslog; pub mod spec; mod spec_apply; diff --git a/compute_tools/src/pgbouncer.rs b/compute_tools/src/pgbouncer.rs new file mode 100644 index 0000000000..189dfabac9 --- /dev/null +++ b/compute_tools/src/pgbouncer.rs @@ -0,0 +1 @@ +pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid"; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 01ca28fce0..aeabf4a519 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -672,6 +672,13 @@ struct EndpointStartCmdArgs { #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] #[arg(default_value = "90s")] start_timeout: Duration, + + #[clap( + long, + help = "Run in development mode, skipping VM-specific operations like process termination", + action = clap::ArgAction::SetTrue + )] + dev: bool, } #[derive(clap::Args)] @@ -1590,6 +1597,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res stripe_size.0 as usize, args.create_test_user, args.start_timeout, + args.dev, ) .await?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 03156c1809..2df71df57d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -691,6 +691,7 @@ impl Endpoint { shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, + dev: bool, ) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); @@ -861,6 +862,10 @@ impl Endpoint { cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } + if dev { + cmd.arg("--dev"); + } + let child = cmd.spawn()?; // set up a scopeguard to kill & wait for the child in case we panic or bail below let child = scopeguard::guard(child, |mut child| { diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index c8ca812bf9..1e62e91fd0 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -95,3 +95,4 @@ echo "Start compute node" -b /usr/local/bin/postgres \ --compute-id "compute-${RANDOM}" \ --config "${CONFIG_FILE}" + --dev diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index bb07e2b6d1..48a1a36e66 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -564,6 +564,7 @@ class NeonLocalCli(AbstractNeonCli): basebackup_request_tries: int | None = None, timeout: str | None = None, env: dict[str, str] | None = None, + dev: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -589,6 +590,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--create-test-user"]) if timeout is not None: args.extend(["--start-timeout", str(timeout)]) + if dev: + args.extend(["--dev"]) res = self.raw_cli(args, extra_env_vars) res.check_returncode()