proxy: add more error cases to retry connect (#4707)

## Problem

In the logs, I noticed we still weren't retrying in some cases. Seemed
to be timeouts but we explicitly wanted to handle those

## Summary of changes

Retry on io::ErrorKind::TimedOut errors.
Handle IO errors in tokio_postgres::Error.
This commit is contained in:
Conrad Ludgate
2023-07-13 11:47:27 +01:00
committed by GitHub
parent 0626e0bfd3
commit db4d094afa
2 changed files with 40 additions and 24 deletions

View File

@@ -10,7 +10,10 @@ use crate::{auth, console};
use super::sql_over_http::MAX_RESPONSE_SIZE;
use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE};
use crate::proxy::{
can_retry_tokio_postgres_error, invalidate_cache, retry_after, try_wake,
NUM_RETRIES_WAKE_COMPUTE,
};
use tracing::error;
use tracing::info;
@@ -272,19 +275,11 @@ async fn connect_to_compute(
}
fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
use tokio_postgres::error::SqlState;
match err.code() {
match err {
// retry all errors at least once
_ if num_retries == 0 => true,
// keep retrying connection errors
Some(
&SqlState::CONNECTION_FAILURE
| &SqlState::CONNECTION_EXCEPTION
| &SqlState::CONNECTION_DOES_NOT_EXIST
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true,
// otherwise, don't retry
_ => false,
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
err => can_retry_tokio_postgres_error(err),
}
}

View File

@@ -20,7 +20,7 @@ use hyper::StatusCode;
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use std::{ops::ControlFlow, sync::Arc};
use std::{error::Error, ops::ControlFlow, sync::Arc};
use tokio::{
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
time,
@@ -445,24 +445,45 @@ pub async fn try_wake(
}
fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool {
use std::io::ErrorKind;
match err {
// retry all errors at least once
_ if num_retries == 0 => true,
// keep retrying connection errors
compute::ConnectionError::CouldNotConnect(io_err)
if num_retries < NUM_RETRIES_WAKE_COMPUTE =>
{
matches!(
io_err.kind(),
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable
)
}
// otherwise, don't retry
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
compute::ConnectionError::Postgres(err) => can_retry_tokio_postgres_error(err),
compute::ConnectionError::CouldNotConnect(err) => is_io_connection_err(err),
_ => false,
}
}
pub fn can_retry_tokio_postgres_error(err: &tokio_postgres::Error) -> bool {
if let Some(io_err) = err.source().and_then(|x| x.downcast_ref()) {
is_io_connection_err(io_err)
} else if let Some(db_err) = err.source().and_then(|x| x.downcast_ref()) {
is_sql_connection_err(db_err)
} else {
false
}
}
fn is_sql_connection_err(err: &tokio_postgres::error::DbError) -> bool {
use tokio_postgres::error::SqlState;
matches!(
err.code(),
&SqlState::CONNECTION_FAILURE
| &SqlState::CONNECTION_EXCEPTION
| &SqlState::CONNECTION_DOES_NOT_EXIST
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
)
}
fn is_io_connection_err(err: &std::io::Error) -> bool {
use std::io::ErrorKind;
matches!(
err.kind(),
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
)
}
pub fn retry_after(num_retries: u32) -> time::Duration {
match num_retries {
0 => time::Duration::ZERO,