From 86b9703f067a773e4d2bab4d52663ffc0dbaffeb Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 26 Feb 2025 15:36:05 +0100 Subject: [PATCH] pageserver: set `SO_KEEPALIVE` on the page service socket (#10992) ## Problem If the client connection goes dead without an explicit close (e.g. due to network infrastructure dropping the connection) then we currently won't detect it for a long time, which may e.g. block GetPage flushes and keep the task running. Touches https://github.com/neondatabase/cloud/issues/23515. ## Summary of changes Enable `SO_KEEPALIVE` on the page service socket, to enable periodic TCP keepalive probes. These are configured via Linux sysctls, which will be deployed separately. By default, the first probe is sent after 2 hours, so this doesn't have a practical effect until we change the sysctls. --- pageserver/src/bin/pageserver.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ab8d37df2e..703629aed5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -14,6 +14,7 @@ use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; +use nix::sys::socket::{setsockopt, sockopt}; use pageserver::config::{PageServerConf, PageserverIdentity}; use pageserver::controller_upcall_client::ControllerUpcallClient; use pageserver::deletion_queue::DeletionQueue; @@ -347,6 +348,13 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Enable SO_KEEPALIVE on the socket, to detect dead connections faster. + // These are configured via net.ipv4.tcp_keepalive_* sysctls. + // + // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't + // support enabling keepalives while using the default OS sysctls. + setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?; + // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME