pageserver: set SO_KEEPALIVE on the page service socket (#10992)

## Problem

If a client connection goes dead without an explicit close (e.g. because network infrastructure drops the connection), we currently won't detect it for a long time. This can, for example, block GetPage flushes and keep the task running.

Touches https://github.com/neondatabase/cloud/issues/23515.

## Summary of changes

Set `SO_KEEPALIVE` on the page service socket to turn on periodic TCP keepalive probes. The probe timing is configured via Linux sysctls, which will be deployed separately. By default, the first probe is sent after 2 hours, so this change has no practical effect until we change the sysctls.
2026-01-10 23:12:54 +00:00 · 2025-02-26 15:36:05 +01:00
parent 01581f3af5
commit 86b9703f06
1 changed files with 8 additions and 0 deletions
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,6 +14,7 @@ use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};
 use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
 use metrics::set_build_info_metric;
+use nix::sys::socket::{setsockopt, sockopt};
 use pageserver::config::{PageServerConf, PageserverIdentity};
 use pageserver::controller_upcall_client::ControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
@@ -347,6 +348,13 @@ fn start_pageserver(
    info!("Starting pageserver pg protocol handler on {pg_addr}");
    let pageserver_listener = tcp_listener::bind(pg_addr)?;

+    // Enable SO_KEEPALIVE on the socket, to detect dead connections faster.
+    // These are configured via net.ipv4.tcp_keepalive_* sysctls.
+    //
+    // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't
+    // support enabling keepalives while using the default OS sysctls.
+    setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?;
+
    // Launch broker client
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME