From db4d094afaea07eafdf0cacfa78e549f97c672f3 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Thu, 13 Jul 2023 11:47:27 +0100
Subject: [PATCH] proxy: add more error cases to retry connect (#4707)

## Problem

In the logs, I noticed we still weren't retrying in some cases. These
seemed to be timeouts, which we explicitly wanted to handle.

## Summary of changes

Retry on io::ErrorKind::TimedOut errors. Handle IO errors in
tokio_postgres::Error.
---
 proxy/src/http/conn_pool.rs | 19 ++++++----------
 proxy/src/proxy.rs          | 45 +++++++++++++++++++++++++++----------
 2 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs
index fb53c663c8..49c94a830f 100644
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -10,7 +10,10 @@ use crate::{auth, console};
 
 use super::sql_over_http::MAX_RESPONSE_SIZE;
 
-use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE};
+use crate::proxy::{
+    can_retry_tokio_postgres_error, invalidate_cache, retry_after, try_wake,
+    NUM_RETRIES_WAKE_COMPUTE,
+};
 
 use tracing::error;
 use tracing::info;
@@ -272,19 +275,11 @@ async fn connect_to_compute(
 }
 
 fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
-    use tokio_postgres::error::SqlState;
-    match err.code() {
+    match err {
         // retry all errors at least once
         _ if num_retries == 0 => true,
-        // keep retrying connection errors
-        Some(
-            &SqlState::CONNECTION_FAILURE
-                | &SqlState::CONNECTION_EXCEPTION
-                | &SqlState::CONNECTION_DOES_NOT_EXIST
-                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
-        ) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true,
-        // otherwise, don't retry
-        _ => false,
+        _ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
+        err => can_retry_tokio_postgres_error(err),
     }
 }
 
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 2204fc62c6..8722109b80 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -20,7 +20,7 @@ use hyper::StatusCode;
 use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
-use std::{ops::ControlFlow, sync::Arc};
+use std::{error::Error, ops::ControlFlow, sync::Arc};
 use tokio::{
     io::{AsyncRead, AsyncWrite, AsyncWriteExt},
     time,
@@ -445,24 +445,45 @@ pub async fn try_wake(
 }
 
 fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool {
-    use std::io::ErrorKind;
     match err {
         // retry all errors at least once
         _ if num_retries == 0 => true,
-        // keep retrying connection errors
-        compute::ConnectionError::CouldNotConnect(io_err)
-            if num_retries < NUM_RETRIES_WAKE_COMPUTE =>
-        {
-            matches!(
-                io_err.kind(),
-                ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable
-            )
-        }
-        // otherwise, don't retry
+        _ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
+        compute::ConnectionError::Postgres(err) => can_retry_tokio_postgres_error(err),
+        compute::ConnectionError::CouldNotConnect(err) => is_io_connection_err(err),
         _ => false,
     }
 }
 
+pub fn can_retry_tokio_postgres_error(err: &tokio_postgres::Error) -> bool {
+    if let Some(io_err) = err.source().and_then(|x| x.downcast_ref()) {
+        is_io_connection_err(io_err)
+    } else if let Some(db_err) = err.source().and_then(|x| x.downcast_ref()) {
+        is_sql_connection_err(db_err)
+    } else {
+        false
+    }
+}
+
+fn is_sql_connection_err(err: &tokio_postgres::error::DbError) -> bool {
+    use tokio_postgres::error::SqlState;
+    matches!(
+        err.code(),
+        &SqlState::CONNECTION_FAILURE
+            | &SqlState::CONNECTION_EXCEPTION
+            | &SqlState::CONNECTION_DOES_NOT_EXIST
+            | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+    )
+}
+
+fn is_io_connection_err(err: &std::io::Error) -> bool {
+    use std::io::ErrorKind;
+    matches!(
+        err.kind(),
+        ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
+    )
+}
+
 pub fn retry_after(num_retries: u32) -> time::Duration {
     match num_retries {
         0 => time::Duration::ZERO,