From 62c0152e6bbb00b6fdd1061516317383a2e0ad82 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 17 Jul 2025 22:03:55 +0100 Subject: [PATCH] pageserver: shut down compute connections at libpq level (#12642) ## Problem Previously, if a get page failure was caused by timeline shutdown, the pageserver would attempt to tear down the connection gracefully: `shutdown(SHUT_WR)` followed by `close()`. This triggers a code path on the compute where it has to distinguish between an idle connection and a closed one. That code is bug-prone, so we can just sidestep the issue by shutting down the connection via a libpq error message. This surfaced as instability in test_shard_resolve_during_split_abort. It's a new test, but the issue has existed for ages. ## Summary of Changes Send a libpq error message instead of doing a graceful TCP connection shutdown. Closes LKB-648 --- libs/postgres_backend/src/lib.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 851d824291..20afa8bb46 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -749,7 +749,18 @@ impl PostgresBackend { trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { match e { - QueryError::Shutdown => return Ok(ProcessMsgResult::Break), + err @ QueryError::Shutdown => { + // Notify postgres of the connection shutdown at the libpq + // protocol level. This avoids postgres having to tell apart + // from an idle connection and a stale one, which is bug prone. + let shutdown_error = short_error(&err); + self.write_message_noflush(&BeMessage::ErrorResponse( + &shutdown_error, + Some(err.pg_error_code()), + ))?; + + return Ok(ProcessMsgResult::Break); + } QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError); }