proxy: improve error classification (#6841)

## Problem

## Summary of changes

1. Classify further cplane API errors
2. add 'serviceratelimit' and make a few of the timeout errors return
that.
3. a few additional minor changes
This commit is contained in:
Conrad Ludgate
2024-02-21 10:04:09 +00:00
committed by GitHub
parent e7452d3756
commit e0af945f8f
5 changed files with 37 additions and 52 deletions

View File

@@ -171,16 +171,8 @@ async fn task_main(
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
let mut ctx =
RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(
&mut ctx,
dest_suffix,
tls_config,
tls_server_end_point,
socket,
)
.await
let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
@@ -248,7 +240,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
async fn handle_client(
ctx: &mut RequestMonitoring,
mut ctx: RequestMonitoring,
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,

View File

@@ -87,6 +87,22 @@ pub mod errors {
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::Console {
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ApiError::Console {
status: http::StatusCode::LOCKED,
text,
} if text.contains("quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ApiError::Console {
status: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
@@ -222,7 +238,7 @@ pub mod errors {
match self {
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
WakeComputeError::ApiError(e) => e.get_error_kind(),
WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
}
}
}

View File

@@ -147,15 +147,13 @@ impl RequestMonitoring {
self.success = true;
}
pub fn log(&mut self) {
pub fn log(self) {}
}
impl Drop for RequestMonitoring {
fn drop(&mut self) {
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());
}
}
}
impl Drop for RequestMonitoring {
fn drop(&mut self) {
self.log()
}
}

View File

@@ -37,9 +37,12 @@ pub enum ErrorKind {
/// Network error between user and proxy. Not necessarily user error
ClientDisconnect,
/// Proxy self-imposed rate limits
/// Proxy self-imposed user rate limits
RateLimit,
/// Proxy self-imposed service-wise rate limits
ServiceRateLimit,
/// internal errors
Service,
@@ -54,25 +57,12 @@ pub enum ErrorKind {
}
impl ErrorKind {
pub fn to_str(&self) -> &'static str {
match self {
ErrorKind::User => "request failed due to user error",
ErrorKind::ClientDisconnect => "client disconnected",
ErrorKind::RateLimit => "request cancelled due to rate limit",
ErrorKind::Service => "internal service error",
ErrorKind::ControlPlane => "non-retryable control plane error",
ErrorKind::Postgres => "postgres error",
ErrorKind::Compute => {
"non-retryable compute connection error (or exhausted retry capacity)"
}
}
}
pub fn to_metric_label(&self) -> &'static str {
match self {
ErrorKind::User => "user",
ErrorKind::ClientDisconnect => "clientdisconnect",
ErrorKind::RateLimit => "ratelimit",
ErrorKind::ServiceRateLimit => "serviceratelimit",
ErrorKind::Service => "service",
ErrorKind::ControlPlane => "controlplane",
ErrorKind::Postgres => "postgres",
@@ -85,12 +75,6 @@ pub trait ReportableError: fmt::Display + Send + 'static {
fn get_error_kind(&self) -> ErrorKind;
}
impl ReportableError for tokio::time::error::Elapsed {
fn get_error_kind(&self) -> ErrorKind {
ErrorKind::RateLimit
}
}
impl ReportableError for tokio_postgres::error::Error {
fn get_error_kind(&self) -> ErrorKind {
if self.as_db_error().is_some() {

View File

@@ -12,7 +12,7 @@ use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use serde_json::json;
use serde_json::Value;
use tokio::join;
use tokio::try_join;
use tokio_postgres::error::DbError;
use tokio_postgres::error::ErrorPosition;
use tokio_postgres::GenericClient;
@@ -32,11 +32,9 @@ use crate::auth::ComputeUserInfoParseError;
use crate::config::ProxyConfig;
use crate::config::TlsConfig;
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::metrics::HTTP_CONTENT_LENGTH;
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
use crate::proxy::NeonOptions;
use crate::serverless::backend::HttpConnError;
use crate::DbName;
use crate::RoleName;
@@ -287,8 +285,10 @@ pub async fn handle(
)?
}
},
Err(e) => {
ctx.set_error_kind(e.get_error_kind());
Err(_) => {
// TODO: when http error classification is done, distinguish between
// timeout on sql vs timeout in proxy/cplane
// ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
let message = format!(
"HTTP-Connection timed out, execution time exeeded {} seconds",
@@ -402,16 +402,11 @@ async fn handle_inner(
// not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else
ctx.latency_timer.success();
Ok::<_, HttpConnError>(client)
Ok::<_, anyhow::Error>(client)
};
// Run both operations in parallel
let (payload_result, auth_and_connect_result) =
join!(fetch_and_process_request, authenticate_and_connect,);
// Handle the results
let payload = payload_result?; // Handle errors appropriately
let mut client = auth_and_connect_result?; // Handle errors appropriately
let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;
let mut response = Response::builder()
.status(StatusCode::OK)