From 375e8e559232081b31099bf0f6db0c8d1a1a6e6e Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Thu, 3 Jul 2025 14:02:43 +0200
Subject: [PATCH] Improve retries and logging

---
 pageserver/client_grpc/src/client.rs | 147 +++++++++++++++++++--------
 1 file changed, 106 insertions(+), 41 deletions(-)

diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 3f7084fe43..cf067aec8f 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -3,8 +3,9 @@ use std::sync::Arc;
 
 use anyhow::{anyhow, ensure};
 use pageserver_page_api as page_api;
-use tokio_util::sync::CancellationToken;
-use utils::backoff;
+use tokio::time::Instant;
+use tracing::{error, info, warn};
+use utils::backoff::exponential_backoff_duration;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 
@@ -43,7 +44,7 @@ impl PageserverClient {
         &self,
         req: page_api::CheckRelExistsRequest,
     ) -> tonic::Result<page_api::CheckRelExistsResponse> {
-        self.with_retries("check_rel_exists", async || {
+        self.with_retries(async || {
             // Relation metadata is only available on shard 0.
             let mut client = self.shards.get_zero().client().await?;
             client.check_rel_exists(req).await
@@ -56,7 +57,7 @@ impl PageserverClient {
         &self,
         req: page_api::GetDbSizeRequest,
     ) -> tonic::Result<page_api::GetDbSizeResponse> {
-        self.with_retries("get_db_size", async || {
+        self.with_retries(async || {
             // Relation metadata is only available on shard 0.
             let mut client = self.shards.get_zero().client().await?;
             client.get_db_size(req).await
@@ -75,7 +76,7 @@
         // TODO: support multiple shards.
         let shard_id = ShardIndex::unsharded();
 
-        self.with_retries("get_page", async || {
+        self.with_retries(async || {
             let stream = self.shards.get(shard_id)?.stream().await;
             let resp = stream.send(req.clone()).await?;
 
@@ -96,7 +97,7 @@ impl PageserverClient {
         &self,
         req: page_api::GetRelSizeRequest,
     ) -> tonic::Result<page_api::GetRelSizeResponse> {
-        self.with_retries("get_rel_size", async || {
+        self.with_retries(async || {
             // Relation metadata is only available on shard 0.
             let mut client = self.shards.get_zero().client().await?;
             client.get_rel_size(req).await
@@ -109,7 +110,7 @@ impl PageserverClient {
         &self,
         req: page_api::GetSlruSegmentRequest,
     ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
-        self.with_retries("get_slru_segment", async || {
+        self.with_retries(async || {
             // SLRU segments are only available on shard 0.
             let mut client = self.shards.get_zero().client().await?;
             client.get_slru_segment(req).await
@@ -117,50 +118,114 @@ impl PageserverClient {
         .await
     }
 
-    /// Runs the given closure with exponential backoff retries.
-    async fn with_retries<T, F, O>(&self, name: &str, f: F) -> tonic::Result<T>
+    /// Runs the given closure with retries (exponential backoff). Logs errors.
+    async fn with_retries<T, F, O>(&self, mut f: F) -> tonic::Result<T>
     where
         F: FnMut() -> O,
         O: Future<Output = tonic::Result<T>>,
     {
-        /// TODO: tune retry parameters (retry forever?).
-        /// TODO: add timeouts?
-        const WARN_THRESHOLD: u32 = 1;
-        const MAX_RETRIES: u32 = 10;
-
-        fn is_permanent(err: &tonic::Status) -> bool {
-            match err.code() {
-                // Not really an error, but whatever. Don't retry.
-                tonic::Code::Ok => true,
+        // TODO: tune these, and/or make them configurable. Should we retry forever?
+        const REQUEST_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);
+        const TOTAL_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
+        const BASE_BACKOFF: f64 = 0.1;
+        const MAX_BACKOFF: f64 = 10.0;
+
+        fn should_retry(code: tonic::Code) -> bool {
+            match code {
+                tonic::Code::Ok => panic!("unexpected Ok status code"),
 
                 // These codes are transient, so retry them.
-                tonic::Code::Aborted => false,
-                tonic::Code::Cancelled => false,
-                tonic::Code::DeadlineExceeded => false, // maybe transient slowness
-                tonic::Code::Internal => false, // maybe transient failure
-                tonic::Code::ResourceExhausted => false,
-                tonic::Code::Unavailable => false,
-                tonic::Code::Unknown => false, // may as well retry?
-
+                tonic::Code::Aborted => true,
+                tonic::Code::Cancelled => true,
+                tonic::Code::DeadlineExceeded => true, // maybe transient slowness
+                tonic::Code::Internal => true, // maybe transient failure?
+                tonic::Code::ResourceExhausted => true,
+                tonic::Code::Unavailable => true,
+
                 // The following codes will likely continue to fail, so don't retry.
-                tonic::Code::AlreadyExists => true,
-                tonic::Code::DataLoss => true,
-                tonic::Code::FailedPrecondition => true,
-                tonic::Code::InvalidArgument => true,
-                tonic::Code::NotFound => true,
-                tonic::Code::OutOfRange => true,
-                tonic::Code::PermissionDenied => true,
-                tonic::Code::Unimplemented => true,
-                tonic::Code::Unauthenticated => true,
+                tonic::Code::AlreadyExists => false,
+                tonic::Code::DataLoss => false,
+                tonic::Code::FailedPrecondition => false,
+                tonic::Code::InvalidArgument => false,
+                tonic::Code::NotFound => false,
+                tonic::Code::OutOfRange => false,
+                tonic::Code::PermissionDenied => false,
+                tonic::Code::Unauthenticated => false,
+                tonic::Code::Unimplemented => false,
+                tonic::Code::Unknown => false,
             }
         }
 
-        // TODO: consider custom logic and logging here, using the caller's span for name.
-        // TODO: cancellation? Could just drop the future.
-        let cancel = CancellationToken::new();
-        backoff::retry(f, is_permanent, WARN_THRESHOLD, MAX_RETRIES, name, &cancel)
-            .await
-            .expect("never cancelled (for now)")
+        let started = Instant::now();
+        let deadline = started + TOTAL_TIMEOUT;
+        let mut last_error = None;
+        let mut retries = 0;
+        loop {
+            // Set up a future to wait for the backoff (if any) and run the request with a timeout.
+            let backoff = exponential_backoff_duration(retries, BASE_BACKOFF, MAX_BACKOFF);
+            let backoff_and_try = async {
+                tokio::time::sleep(backoff).await;
+                let request_started = Instant::now();
+                tokio::time::timeout(REQUEST_TIMEOUT, f())
+                    .await
+                    .map_err(|_| {
+                        tonic::Status::deadline_exceeded(format!(
+                            "request timed out after {:.3}s",
+                            request_started.elapsed().as_secs_f64()
+                        ))
+                    })?
+            };
+
+            // Wait for the backoff and request, or bail out if the total timeout is exceeded.
+            let result = tokio::select! {
+                result = backoff_and_try => result,
+
+                _ = tokio::time::sleep_until(deadline) => {
+                    let last_error = last_error.unwrap_or_else(|| {
+                        tonic::Status::deadline_exceeded(format!(
+                            "request timed out after {:.3}s",
+                            started.elapsed().as_secs_f64()
+                        ))
+                    });
+                    error!(
+                        "giving up after {:.3}s and {retries} retries, last error {:?}: {}",
+                        started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
+                    );
+                    return Err(last_error);
+                }
+            };
+
+            match result {
+                Ok(result) => {
+                    if retries > 0 {
+                        info!(
+                            "request succeeded after {retries} retries in {:.3}s",
+                            started.elapsed().as_secs_f64(),
+                        );
+                    }
+
+                    return Ok(result);
+                }
+
+                Err(status) => {
+                    let (code, message) = (status.code(), status.message());
+                    let should_retry = should_retry(code);
+                    let attempt = retries + 1;
+
+                    if !should_retry {
+                        // NB: include the attempt here too. This isn't necessarily the first
+                        // attempt, because the error may change between attempts.
+                        error!(
+                            "request failed with {code:?}: {message}, not retrying (attempt {attempt})"
+                        );
+                        return Err(status);
+                    }
+
+                    warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
+
+                    retries += 1;
+                    last_error = Some(status);
+                }
+            }
+        }
     }
 }
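
For reference (not part of the patch itself): a minimal standalone sketch of the same retry shape used in with_retries above, with a per-attempt timeout, an overall deadline raced via tokio::select!, and a backoff sleep between attempts. Here fetch_once and the inline doubling backoff are hypothetical stand-ins for the patched client call and utils::backoff::exponential_backoff_duration.

use std::time::Duration;
use tokio::time::{sleep, sleep_until, timeout, Instant};

// Hypothetical fallible operation standing in for a pageserver request.
async fn fetch_once(attempt: u32) -> Result<String, String> {
    if attempt == 0 {
        Err("transient error".into())
    } else {
        Ok("payload".into())
    }
}

async fn fetch_with_retries() -> Result<String, String> {
    const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
    const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);

    let deadline = Instant::now() + TOTAL_TIMEOUT;
    let mut retries: u32 = 0;
    loop {
        // Assumed backoff: 100ms doubling per retry, capped at 10s. The patch delegates
        // this to utils::backoff::exponential_backoff_duration instead.
        let backoff = Duration::from_millis(100)
            .saturating_mul(1u32 << retries.min(7))
            .min(Duration::from_secs(10));

        // Per-attempt timeout, preceded by the backoff sleep (no sleep before the first try).
        let attempt = retries;
        let backoff_and_try = async move {
            if attempt > 0 {
                sleep(backoff).await;
            }
            timeout(REQUEST_TIMEOUT, fetch_once(attempt))
                .await
                .map_err(|_| "request timed out".to_string())?
        };

        // Race the attempt against the overall deadline.
        tokio::select! {
            result = backoff_and_try => match result {
                Ok(value) => return Ok(value),
                Err(err) => {
                    eprintln!("attempt {} failed: {err}, retrying", attempt + 1);
                    retries += 1;
                }
            },
            _ = sleep_until(deadline) => return Err("total timeout exceeded".into()),
        }
    }
}

#[tokio::main]
async fn main() {
    println!("{:?}", fetch_with_retries().await);
}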