From 4ac4b2159838f9b98d766d53a5f876fedb94c2e4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 9 Apr 2024 07:18:26 +0300 Subject: [PATCH] Add retries to cloud_admin client. --- Cargo.lock | 1 + s3_scrubber/Cargo.toml | 1 + s3_scrubber/src/cloud_admin_api.rs | 189 ++++++++++++++++------------- 3 files changed, 108 insertions(+), 83 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f2f06210cf..2b100560dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5127,6 +5127,7 @@ dependencies = [ "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", + "tokio-util", "tracing", "tracing-appender", "tracing-subscriber", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 37124e6caf..dd5d453a2b 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -27,6 +27,7 @@ postgres-native-tls.workspace = true postgres_ffi.workspace = true tokio-stream.workspace = true tokio-postgres.workspace = true +tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true camino.workspace = true diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index d35dc7e3ca..70b108cf23 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,13 @@ -use std::time::Duration; - use chrono::{DateTime, Utc}; +use futures::Future; use hex::FromHex; + use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -210,30 +212,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("tenant_id", tenant_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_tenant_project", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::BodyRead(e), - ) - })?; match response.data.len() { 0 => Ok(None), 1 => Ok(Some( @@ -261,42 +272,34 @@ impl CloudAdminApiClient { const PAGINATION_LIMIT: usize = 512; let mut result: Vec = Vec::with_capacity(PAGINATION_LIMIT); loop { - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("show_deleted", "false".to_string()), - ("limit", format!("{PAGINATION_LIMIT}")), - ("offset", format!("{pagination_offset}")), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "List active projects".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response_bytes = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("show_deleted", "false".to_string()), + ("limit", format!("{PAGINATION_LIMIT}")), + ("offset", format!("{pagination_offset}")), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "List active projects".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - match response.status() { - StatusCode::OK => {} - StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => { - tokio::time::sleep(Duration::from_millis(500)).await; - continue; - } - _status => { - return Err(Error::new( - "List active projects".to_string(), - ErrorKind::ResponseStatus(response.status()), - )) - } - } - - let response_bytes = response.bytes().await.map_err(|e| { - Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) - })?; + response.bytes().await.map_err(|e| { + Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) + }) + }, + "list_projects", + ) + .await?; let decode_result = serde_json::from_slice::>>(&response_bytes); @@ -336,30 +339,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/branches")) - .query(&[ - ("timeline_id", timeline_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_timeline_branch", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::BodyRead(e), - ) - })?; let mut branches: Vec = response.data.into_iter().collect(); // Normally timeline_id is unique. However, we do have at least one case // of the same timeline_id in two different projects, apparently after @@ -542,4 +554,15 @@ impl CloudAdminApiClient { .parse() .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) } + + async fn with_retries(op: O, description: &str) -> Result + where + O: FnMut() -> F, + F: Future>, + { + let cancel = CancellationToken::new(); // not really used + backoff::retry(op, |_| false, 1, 20, description, &cancel) + .await + .expect("cancellations are disabled") + } }