From 3b7cc4234c8675b777a3f85798734c0b41748d11 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 9 May 2025 19:02:24 +0200 Subject: [PATCH] Fix PS connect attempt timeouts when facing interrupts (#11880) With the 50ms timeouts of pumping state in connector.c, we need to correctly handle these timeouts, which also wake up pg_usleep. This new approach makes connection attempts restart the wait whenever the sleep is woken up early, and calls CHECK_FOR_INTERRUPTS() to make sure we don't miss query cancellations. ## Problem https://neondb.slack.com/archives/C04DGM6SMTM/p1746794528680269 ## Summary of changes Make sure we resume sleeping if pg_usleep is woken up ahead of time. --- pgxn/neon/libpagestore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ee4e6ccc5b..3b6c4247c3 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -433,7 +433,6 @@ pageserver_connect(shardno_t shard_no, int elevel) now = GetCurrentTimestamp(); us_since_last_attempt = (int64) (now - shard->last_reconnect_time); - shard->last_reconnect_time = now; /* * Make sure we don't do exponential backoff with a constant multiplier @@ -447,14 +446,23 @@ pageserver_connect(shardno_t shard_no, int elevel) /* * If we did other tasks between reconnect attempts, then we won't * need to wait as long as a full delay. + * + * This is a loop to protect against interrupted sleeps. 
*/ - if (us_since_last_attempt < shard->delay_us) + while (us_since_last_attempt < shard->delay_us) { pg_usleep(shard->delay_us - us_since_last_attempt); + + /* At least we should handle cancellations here */ + CHECK_FOR_INTERRUPTS(); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); } /* update the delay metric */ shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + shard->last_reconnect_time = now; /* * Connect using the connection string we got from the