From 164863987489b683ca70071fe4c1d9b32007d8d2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 Jan 2024 15:00:19 +0000 Subject: [PATCH] fix(neon_local): long init_tenant_mgr causes pageserver startup failure Before this PR, if neon_local's `start_process()` ran out of retries before pageserver started listening for requests, it would give up. As of PR #6474 we at least kill the starting pageserver process in that case, before that, we would leak it. Pageserver `bind()s` the mgmt API early, but only starts `accept()`ing HTTP requests after it has finished `init_tenant_mgr()` (plus some other stuff). init_tenant_mgr can take a long time with many tenants, i.e., longer than the number of retries that neon_local permits. Changes ======= This PR changes the status check that neon_local performs when starting pageserver to ignore connect & timeout errors, as those are expected (see explanation above). I verified that this allows for arbitrarily long `init_tenant_mgr()` by adding a timeout at the top of that function. --- control_plane/src/pageserver.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 1db21c9a37..35aefa5675 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -236,11 +236,18 @@ impl PageServerNode { self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), || async { - let st = self.check_status().await; - match st { - Ok(()) => Ok(true), - Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false), - Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + match self.http_client.list_tenants().await { + Ok(_) => Ok(true), + Err(e) => match e { + mgmt_api::Error::ReceiveBody(e) => { + if e.is_connect() || e.is_timeout() { + Ok(false) + } else { + Ok(true) + } + } + e => anyhow::bail!(e), + }, } }, )