reduce proxy timeouts (#4708)

## Problem

10 retries * 10 second timeouts makes for a very long retry window.

## Summary of changes

Adds a 2s timeout to sql_over_http connections, and also reduces the 10s
timeout in TCP.
This commit is contained in:
Conrad Ludgate
2023-07-17 20:05:26 +01:00
committed by GitHub
parent 196943c78f
commit e074ccf170

View File

@@ -31,7 +31,8 @@ use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
const NUM_RETRIES_WAKE_COMPUTE: u32 = 10;
const NUM_RETRIES_CONNECT: u32 = 10;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
@@ -393,25 +394,6 @@ where
let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);
loop {
// Set a shorter timeout for the initial connection attempt.
//
// In case we try to connect to an outdated address that is no longer valid, the
// default behavior of Kubernetes is to drop the packets, causing us to wait for
// the entire timeout period. We want to fail fast in such cases.
//
// A specific case to consider is when we have cached compute node information
// with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API
// call, resulting in the nonexistence of the compute node.
//
// We only use caching in case of scram proxy backed by the console, so reduce
// the timeout only in that case.
let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _));
let timeout = if is_scram_proxy && num_retries == 0 {
time::Duration::from_secs(2)
} else {
time::Duration::from_secs(10)
};
match state {
ConnectionState::Invalid(config, err) => {
match try_wake(&config, extra, creds).await {
@@ -437,7 +419,7 @@ where
}
}
ConnectionState::Cached(node_info) => {
match mechanism.connect_once(&node_info, timeout).await {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
@@ -497,7 +479,7 @@ pub trait ShouldRetry {
match self {
// retry all errors at least once
_ if num_retries == 0 => true,
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
_ if num_retries >= NUM_RETRIES_CONNECT => false,
err => err.could_retry(),
}
}