mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 05:22:56 +00:00
proxy: add more error cases to retry connect (#4707)
## Problem In the logs, I noticed we still weren't retrying in some cases. Seemed to be timeouts but we explicitly wanted to handle those ## Summary of changes Retry on io::ErrorKind::TimedOut errors. Handle IO errors in tokio_postgres::Error.
This commit is contained in:
@@ -10,7 +10,10 @@ use crate::{auth, console};
|
|||||||
|
|
||||||
use super::sql_over_http::MAX_RESPONSE_SIZE;
|
use super::sql_over_http::MAX_RESPONSE_SIZE;
|
||||||
|
|
||||||
use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE};
|
use crate::proxy::{
|
||||||
|
can_retry_tokio_postgres_error, invalidate_cache, retry_after, try_wake,
|
||||||
|
NUM_RETRIES_WAKE_COMPUTE,
|
||||||
|
};
|
||||||
|
|
||||||
use tracing::error;
|
use tracing::error;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
@@ -272,19 +275,11 @@ async fn connect_to_compute(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
|
fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
|
||||||
use tokio_postgres::error::SqlState;
|
match err {
|
||||||
match err.code() {
|
|
||||||
// retry all errors at least once
|
// retry all errors at least once
|
||||||
_ if num_retries == 0 => true,
|
_ if num_retries == 0 => true,
|
||||||
// keep retrying connection errors
|
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
|
||||||
Some(
|
err => can_retry_tokio_postgres_error(err),
|
||||||
&SqlState::CONNECTION_FAILURE
|
|
||||||
| &SqlState::CONNECTION_EXCEPTION
|
|
||||||
| &SqlState::CONNECTION_DOES_NOT_EXIST
|
|
||||||
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
|
|
||||||
) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true,
|
|
||||||
// otherwise, don't retry
|
|
||||||
_ => false,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ use hyper::StatusCode;
|
|||||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||||
use std::{ops::ControlFlow, sync::Arc};
|
use std::{error::Error, ops::ControlFlow, sync::Arc};
|
||||||
use tokio::{
|
use tokio::{
|
||||||
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
|
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
|
||||||
time,
|
time,
|
||||||
@@ -445,24 +445,45 @@ pub async fn try_wake(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool {
|
fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool {
|
||||||
use std::io::ErrorKind;
|
|
||||||
match err {
|
match err {
|
||||||
// retry all errors at least once
|
// retry all errors at least once
|
||||||
_ if num_retries == 0 => true,
|
_ if num_retries == 0 => true,
|
||||||
// keep retrying connection errors
|
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
|
||||||
compute::ConnectionError::CouldNotConnect(io_err)
|
compute::ConnectionError::Postgres(err) => can_retry_tokio_postgres_error(err),
|
||||||
if num_retries < NUM_RETRIES_WAKE_COMPUTE =>
|
compute::ConnectionError::CouldNotConnect(err) => is_io_connection_err(err),
|
||||||
{
|
|
||||||
matches!(
|
|
||||||
io_err.kind(),
|
|
||||||
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable
|
|
||||||
)
|
|
||||||
}
|
|
||||||
// otherwise, don't retry
|
|
||||||
_ => false,
|
_ => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn can_retry_tokio_postgres_error(err: &tokio_postgres::Error) -> bool {
|
||||||
|
if let Some(io_err) = err.source().and_then(|x| x.downcast_ref()) {
|
||||||
|
is_io_connection_err(io_err)
|
||||||
|
} else if let Some(db_err) = err.source().and_then(|x| x.downcast_ref()) {
|
||||||
|
is_sql_connection_err(db_err)
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_sql_connection_err(err: &tokio_postgres::error::DbError) -> bool {
|
||||||
|
use tokio_postgres::error::SqlState;
|
||||||
|
matches!(
|
||||||
|
err.code(),
|
||||||
|
&SqlState::CONNECTION_FAILURE
|
||||||
|
| &SqlState::CONNECTION_EXCEPTION
|
||||||
|
| &SqlState::CONNECTION_DOES_NOT_EXIST
|
||||||
|
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_io_connection_err(err: &std::io::Error) -> bool {
|
||||||
|
use std::io::ErrorKind;
|
||||||
|
matches!(
|
||||||
|
err.kind(),
|
||||||
|
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn retry_after(num_retries: u32) -> time::Duration {
|
pub fn retry_after(num_retries: u32) -> time::Duration {
|
||||||
match num_retries {
|
match num_retries {
|
||||||
0 => time::Duration::ZERO,
|
0 => time::Duration::ZERO,
|
||||||
|
|||||||
Reference in New Issue
Block a user