From fd0b22f5cd11d5df2013bd0d9c79cb70086b3fa8 Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Wed, 19 Jun 2024 15:05:31 +0200
Subject: [PATCH] Make sure we can handle temporarily offline PS when we first
 connect (#8094)

Fixes https://github.com/neondatabase/neon/issues/7897

## Problem

`shard->delay_us` was potentially uninitialized when we first connected
to the pageserver (PS), as it wasn't set to a non-zero value until after
we had connected to the shard's pageserver for the first time.

That caused the exponential backoff to use an initial value (multiplier)
of 0 for the first connection attempt to that pageserver, thus causing a
hot retry loop with connection attempts to the pageserver without
significant delay. That in turn caused attempts to reconnect to quickly
fail, rather than showing the expected 'wait until pageserver is
available' behaviour.

## Summary of changes

Before attempting to connect, we now set shard->delay_us to
MIN_RECONNECT_INTERVAL_USEC if we notice it is still 0 (not initialized).
---
 pgxn/neon/libpagestore.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 5eae2d8204..a665cafafe 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -381,6 +381,15 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
 		shard->last_reconnect_time = now;
 
+		/*
+		 * Make sure we don't do exponential backoff with a constant multiplier
+		 * of 0 us, as that doesn't really do much for timeouts...
+		 *
+		 * cf. https://github.com/neondatabase/neon/issues/7897
+		 */
+		if (shard->delay_us == 0)
+			shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
+
 		/*
 		 * If we did other tasks between reconnect attempts, then we won't
 		 * need to wait as long as a full delay.