Altered retry timing parameters for connect to compute, to get more and quicker retries (#5358)

## Problem Compute start time has improved, but the timing of connection retries from the proxy is rather slow, meaning we could be making clients wait hundreds of milliseconds longer than necessary. ## Summary of changes Previously, retry time in ms was `100 * 1.5**n`, and `n` starts at 1, giving: 150, 225, 337, 506, 759, 1139, 1709, ... This PR changes that to `25 * sqrt(2)**(n - 1)` instead, giving: 25, 35, 50, 71, 100, 141, 200, ...
2026-07-04 20:50:40 +00:00 · 2023-09-25 12:27:41 +01:00
parent 211f882428
commit d8977d5199
2 changed files with 9 additions and 9 deletions
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -28,10 +28,11 @@ use tracing::{error, info, info_span, warn, Instrument};
 use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
-/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
-pub const NUM_RETRIES_CONNECT: u32 = 10;
+/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
+pub const NUM_RETRIES_CONNECT: u32 = 16;
 const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
-const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
+const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
+const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -553,8 +554,7 @@ impl ShouldRetry for compute::ConnectionError {
 }

 pub fn retry_after(num_retries: u32) -> time::Duration {
-    // 1.5 seems to be an ok growth factor heuristic
-    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
+    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -303,7 +303,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 1..10 {
+    for num_retries in 1..NUM_RETRIES_CONNECT {
        total_wait += retry_after(num_retries);
    }
    assert!(total_wait < tokio::time::Duration::from_secs(12));
@@ -494,11 +494,11 @@ async fn connect_to_compute_non_retry_2() {
 /// Retry for at most `NUM_RETRIES_CONNECT` times.
 #[tokio::test]
 async fn connect_to_compute_non_retry_3() {
-    assert_eq!(NUM_RETRIES_CONNECT, 10);
+    assert_eq!(NUM_RETRIES_CONNECT, 16);
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![
-        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
-        /* the 11th time */ Retry,
+        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
+        Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
    ]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
    connect_to_compute(&mechanism, cache, &extra, &creds)