From e074ccf1700ea4d71abfc57c381de43354dd036a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 17 Jul 2023 20:05:26 +0100 Subject: [PATCH] reduce proxy timeouts (#4708) ## Problem 10 retries * 10 second timeouts makes for a very long retry window. ## Summary of changes Adds a 2s timeout to sql_over_http connections, and also reduces the 10s timeout in TCP. --- proxy/src/proxy.rs | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index d4a3f2641e..a43192c11e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -31,7 +31,8 @@ use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n -const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; +const NUM_RETRIES_CONNECT: u32 = 10; +const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; @@ -393,25 +394,6 @@ where let mut state = ConnectionState::::Cached(node_info); loop { - // Set a shorter timeout for the initial connection attempt. - // - // In case we try to connect to an outdated address that is no longer valid, the - // default behavior of Kubernetes is to drop the packets, causing us to wait for - // the entire timeout period. We want to fail fast in such cases. - // - // A specific case to consider is when we have cached compute node information - // with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API - // call, resulting in the nonexistence of the compute node. - // - // We only use caching in case of scram proxy backed by the console, so reduce - // the timeout only in that case. - let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _)); - let timeout = if is_scram_proxy && num_retries == 0 { - time::Duration::from_secs(2) - } else { - time::Duration::from_secs(10) - }; - match state { ConnectionState::Invalid(config, err) => { match try_wake(&config, extra, creds).await { @@ -437,7 +419,7 @@ where } } ConnectionState::Cached(node_info) => { - match mechanism.connect_once(&node_info, timeout).await { + match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { Ok(res) => return Ok(res), Err(e) => { error!(error = ?e, "could not connect to compute node"); @@ -497,7 +479,7 @@ pub trait ShouldRetry { match self { // retry all errors at least once _ if num_retries == 0 => true, - _ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false, + _ if num_retries >= NUM_RETRIES_CONNECT => false, err => err.could_retry(), } }