Altered retry timing parameters for connect to compute, to get more and quicker retries (#5358)

## Problem

Compute start time has improved, but the timing of connection retries
from the proxy is rather slow, meaning we could be making clients wait
hundreds of milliseconds longer than necessary.

## Summary of changes

Previously, retry time in ms was `100 * 1.5**n`, and `n` starts at 1,
giving: 150, 225, 337, 506, 759, 1139, 1709, ...

This PR changes that to `25 * sqrt(2)**(n - 1)` instead, giving: 25, 35,
50, 71, 100, 141, 200, ...
This commit is contained in:
George MacKerron
2023-09-25 12:27:41 +01:00
committed by GitHub
parent 211f882428
commit d8977d5199
2 changed files with 9 additions and 9 deletions

View File

@@ -28,10 +28,11 @@ use tracing::{error, info, info_span, warn, Instrument};
use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
pub const NUM_RETRIES_CONNECT: u32 = 10;
/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
pub const NUM_RETRIES_CONNECT: u32 = 16;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -553,8 +554,7 @@ impl ShouldRetry for compute::ConnectionError {
}
pub fn retry_after(num_retries: u32) -> time::Duration {
// 1.5 seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
}
/// Finish client connection initialization: confirm auth success, send params, etc.

View File

@@ -303,7 +303,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
#[test]
fn connect_compute_total_wait() {
let mut total_wait = tokio::time::Duration::ZERO;
for num_retries in 1..10 {
for num_retries in 1..NUM_RETRIES_CONNECT {
total_wait += retry_after(num_retries);
}
assert!(total_wait < tokio::time::Duration::from_secs(12));
@@ -494,11 +494,11 @@ async fn connect_to_compute_non_retry_2() {
/// Retry for at most `NUM_RETRIES_CONNECT` times.
#[tokio::test]
async fn connect_to_compute_non_retry_3() {
assert_eq!(NUM_RETRIES_CONNECT, 10);
assert_eq!(NUM_RETRIES_CONNECT, 16);
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
/* the 11th time */ Retry,
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)