From 164863987489b683ca70071fe4c1d9b32007d8d2 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 Jan 2024 15:00:19 +0000
Subject: [PATCH] fix(neon_local): long init_tenant_mgr causes pageserver
 startup failure

Before this PR, if neon_local's `start_process()` ran out of retries
before pageserver started listening for requests, it would give up.
As of PR #6474 we at least kill the starting pageserver process in that
case, before that, we would leak it.

Pageserver `bind()s` the mgmt API early, but only starts `accept()`ing
HTTP requests after it has finished `init_tenant_mgr()` (plus some other
stuff).

init_tenant_mgr can take a long time with many tenants, i.e., longer
than the number of retries that neon_local permits.

Changes
=======

This PR changes the status check that neon_local performs when starting
pageserver to ignore connect & timeout errors, as those are expected
(see explanation above).

I verified that this allows for arbitrarily long `init_tenant_mgr()`
by adding a timeout at the top of that function.
---
 control_plane/src/pageserver.rs | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 1db21c9a37..35aefa5675 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -236,11 +236,18 @@ impl PageServerNode {
             self.pageserver_env_variables()?,
             background_process::InitialPidFile::Expect(self.pid_file()),
             || async {
-                let st = self.check_status().await;
-                match st {
-                    Ok(()) => Ok(true),
-                    Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
-                    Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+                match self.http_client.list_tenants().await {
+                    Ok(_) => Ok(true),
+                    Err(e) => match e {
+                        mgmt_api::Error::ReceiveBody(e) => {
+                            if e.is_connect() || e.is_timeout() {
+                                Ok(false)
+                            } else {
+                                Ok(true)
+                            }
+                        }
+                        e => anyhow::bail!(e),
+                    },
                 }
             },
         )