From be3038890136922f9d51f2befcf620804f5f19cf Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 1 Feb 2024 11:50:04 -0900 Subject: [PATCH] Add retry to fetching basebackup (#6537) ## Problem Currently we have no retry mechanism for fetching basebackup. If there's an unstable connection, starting compute will just fail. ## Summary of changes Adds an exponential backoff with 7 retries to get the basebackup. --- compute_tools/src/compute.rs | 30 +++++++++++++++++++++- test_runner/regress/test_bad_connection.py | 8 +++--- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 07e0abe6ff..1976299e93 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -319,7 +319,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. #[instrument(skip_all, fields(%lsn))] - fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Instant::now(); @@ -390,6 +390,34 @@ impl ComputeNode { Ok(()) } + // Gets the basebackup in a retry loop + #[instrument(skip_all, fields(%lsn))] + pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + let mut retry_period_ms = 500; + let mut attempts = 0; + let max_attempts = 5; + loop { + let result = self.try_get_basebackup(compute_state, lsn); + match result { + Ok(_) => { + return result; + } + Err(ref e) if attempts < max_attempts => { + warn!( + "Failed to get basebackup: {} (attempt {}/{})", + e, attempts, max_attempts + ); + std::thread::sleep(std::time::Duration::from_millis(retry_period_ms)); + retry_period_ms *= 2; + } + Err(_) => { + return result; + } + } + attempts += 1; + } + } + pub async fn check_safekeepers_synced_async( &self, compute_state: &ComputeState, diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index ba0624c730..c808fa0f54 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -9,14 +9,14 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(".*simulated connection error.*") + # Enable failpoint before starting everything else up so that we exercise the retry + # on fetching basebackup pageserver_http = env.pageserver.http_client() + pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) + env.neon_cli.create_branch("test_compute_pageserver_connection_stress") endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress") - # Enable failpoint after starting everything else up so that loading initial - # basebackup doesn't fail - pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) - pg_conn = endpoint.connect() cur = pg_conn.cursor()