diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 9869b95768..3b7d681a41 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,16 +1,183 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; -use std::fmt; +use std::fmt::{self, Display}; use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::proxy::retry::ShouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. #[derive(Debug, Deserialize)] pub struct ConsoleError { pub error: Box, + #[serde(skip)] + pub http_status_code: http::StatusCode, + pub status: Option, +} + +impl ConsoleError { + pub fn get_reason(&self) -> Reason { + self.status + .as_ref() + .and_then(|s| s.details.error_info.as_ref()) + .map(|e| e.reason) + .unwrap_or(Reason::Unknown) + } + pub fn get_user_facing_message(&self) -> String { + use super::provider::errors::REQUEST_FAILED; + self.status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map(|m| m.message.clone().into()) + .unwrap_or_else(|| { + // Ask @neondatabase/control-plane for review before adding more. + match self.http_status_code { + http::StatusCode::NOT_FOUND => { + // Status 404: failed to get a project-related resource. + format!("{REQUEST_FAILED}: endpoint cannot be found") + } + http::StatusCode::NOT_ACCEPTABLE => { + // Status 406: endpoint is disabled (we don't allow connections). + format!("{REQUEST_FAILED}: endpoint is disabled") + } + http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { + // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. + format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") + } + _ => REQUEST_FAILED.to_owned(), + } + }) + } +} + +impl Display for ConsoleError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let msg = self + .status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map(|m| m.message.as_ref()) + .unwrap_or_else(|| &self.error); + write!(f, "{}", msg) + } +} + +impl ShouldRetry for ConsoleError { + fn could_retry(&self) -> bool { + if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() { + // retry some temporary failures because the compute was in a bad state + // (bad request can be returned when the endpoint was in transition) + return match &self { + ConsoleError { + http_status_code: http::StatusCode::BAD_REQUEST, + .. + } => true, + // don't retry when quotas are exceeded + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + ref error, + .. + } => !error.contains("compute time quota of non-primary branches is exceeded"), + // locked can be returned when the endpoint was in transition + // or when quotas are exceeded. don't retry when quotas are exceeded + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + ref error, + .. + } => { + !error.contains("quota exceeded") + && !error.contains("the limit for current plan reached") + } + _ => false, + }; + } + + // retry if the response has a retry delay + if let Some(retry_info) = self + .status + .as_ref() + .and_then(|s| s.details.retry_info.as_ref()) + { + retry_info.retry_delay_ms > 0 + } else { + false + } + } +} + +#[derive(Debug, Deserialize)] +pub struct Status { + pub code: Box, + pub message: Box, + pub details: Details, +} + +#[derive(Debug, Deserialize)] +pub struct Details { + pub error_info: Option, + pub retry_info: Option, + pub user_facing_message: Option, +} + +#[derive(Debug, Deserialize)] +pub struct ErrorInfo { + pub reason: Reason, + // Schema could also have `metadata` field, but it's not structured. Skip it for now. +} + +#[derive(Clone, Copy, Debug, Deserialize, Default)] +pub enum Reason { + #[serde(rename = "ROLE_PROTECTED")] + RoleProtected, + #[serde(rename = "RESOURCE_NOT_FOUND")] + ResourceNotFound, + #[serde(rename = "PROJECT_NOT_FOUND")] + ProjectNotFound, + #[serde(rename = "ENDPOINT_NOT_FOUND")] + EndpointNotFound, + #[serde(rename = "BRANCH_NOT_FOUND")] + BranchNotFound, + #[serde(rename = "RATE_LIMIT_EXCEEDED")] + RateLimitExceeded, + #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")] + NonPrimaryBranchComputeTimeExceeded, + #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")] + ActiveTimeQuotaExceeded, + #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")] + ComputeTimeQuotaExceeded, + #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")] + WrittenDataQuotaExceeded, + #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")] + DataTransferQuotaExceeded, + #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] + LogicalSizeQuotaExceeded, + #[default] + #[serde(other)] + Unknown, +} + +impl Reason { + pub fn is_not_found(&self) -> bool { + matches!( + self, + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound + ) + } +} + +#[derive(Debug, Deserialize)] +pub struct RetryInfo { + pub retry_delay_ms: u64, +} + +#[derive(Debug, Deserialize)] +pub struct UserFacingMessage { + pub message: Box, } /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 634ec9042c..915c2ee7a6 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -25,8 +25,8 @@ use tracing::info; pub mod errors { use crate::{ + console::messages::{self, ConsoleError}, error::{io_error, ReportableError, UserFacingError}, - http, proxy::retry::ShouldRetry, }; use thiserror::Error; @@ -34,17 +34,14 @@ pub mod errors { use super::ApiLockError; /// A go-to error message which doesn't leak any detail. - const REQUEST_FAILED: &str = "Console request failed"; + pub const REQUEST_FAILED: &str = "Console request failed"; /// Common console API error. #[derive(Debug, Error)] pub enum ApiError { /// Error returned by the console itself. - #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] - Console { - status: http::StatusCode, - text: Box, - }, + #[error("{REQUEST_FAILED} with {0}")] + Console(ConsoleError), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -53,11 +50,11 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { + pub fn get_reason(&self) -> messages::Reason { use ApiError::*; match self { - Console { status, .. } => Some(*status), - _ => None, + Console(e) => e.get_reason(), + _ => messages::Reason::Unknown, } } } @@ -67,22 +64,7 @@ pub mod errors { use ApiError::*; match self { // To minimize risks, only select errors are forwarded to users. - // Ask @neondatabase/control-plane for review before adding more. - Console { status, .. } => match *status { - http::StatusCode::NOT_FOUND => { - // Status 404: failed to get a project-related resource. - format!("{REQUEST_FAILED}: endpoint cannot be found") - } - http::StatusCode::NOT_ACCEPTABLE => { - // Status 406: endpoint is disabled (we don't allow connections). - format!("{REQUEST_FAILED}: endpoint is disabled") - } - http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { - // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. - format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") - } - _ => REQUEST_FAILED.to_owned(), - }, + Console(c) => c.get_user_facing_message(), _ => REQUEST_FAILED.to_owned(), } } @@ -91,29 +73,56 @@ pub mod errors { impl ReportableError for ApiError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ApiError::Console { - status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, - .. - } => crate::error::ErrorKind::User, - ApiError::Console { - status: http::StatusCode::UNPROCESSABLE_ENTITY, - text, - } if text.contains("compute time quota of non-primary branches is exceeded") => { - crate::error::ErrorKind::User + ApiError::Console(e) => { + use crate::error::ErrorKind::*; + match e.get_reason() { + crate::console::messages::Reason::RoleProtected => User, + crate::console::messages::Reason::ResourceNotFound => User, + crate::console::messages::Reason::ProjectNotFound => User, + crate::console::messages::Reason::EndpointNotFound => User, + crate::console::messages::Reason::BranchNotFound => User, + crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit, + crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { + User + } + crate::console::messages::Reason::ActiveTimeQuotaExceeded => User, + crate::console::messages::Reason::ComputeTimeQuotaExceeded => User, + crate::console::messages::Reason::WrittenDataQuotaExceeded => User, + crate::console::messages::Reason::DataTransferQuotaExceeded => User, + crate::console::messages::Reason::LogicalSizeQuotaExceeded => User, + crate::console::messages::Reason::Unknown => match &e { + ConsoleError { + http_status_code: + http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. + } => crate::error::ErrorKind::User, + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + error, + .. + } if error.contains( + "compute time quota of non-primary branches is exceeded", + ) => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + error, + .. + } if error.contains("quota exceeded") + || error.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, + ConsoleError { .. } => crate::error::ErrorKind::ControlPlane, + }, + } } - ApiError::Console { - status: http::StatusCode::LOCKED, - text, - } if text.contains("quota exceeded") - || text.contains("the limit for current plan reached") => - { - crate::error::ErrorKind::User - } - ApiError::Console { - status: http::StatusCode::TOO_MANY_REQUESTS, - .. - } => crate::error::ErrorKind::ServiceRateLimit, - ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane, ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, } } @@ -124,31 +133,7 @@ pub mod errors { match self { // retry some transport errors Self::Transport(io) => io.could_retry(), - // retry some temporary failures because the compute was in a bad state - // (bad request can be returned when the endpoint was in transition) - Self::Console { - status: http::StatusCode::BAD_REQUEST, - .. - } => true, - // don't retry when quotas are exceeded - Self::Console { - status: http::StatusCode::UNPROCESSABLE_ENTITY, - ref text, - } => !text.contains("compute time quota of non-primary branches is exceeded"), - // locked can be returned when the endpoint was in transition - // or when quotas are exceeded. don't retry when quotas are exceeded - Self::Console { - status: http::StatusCode::LOCKED, - ref text, - } => { - // written data quota exceeded - // data transfer quota exceeded - // compute time quota exceeded - // logical size quota exceeded - !text.contains("quota exceeded") - && !text.contains("the limit for current plan reached") - } - _ => false, + Self::Console(e) => e.could_retry(), } } } @@ -509,7 +494,7 @@ impl ApiLocks { self.metrics .semaphore_acquire_seconds .observe(now.elapsed().as_secs_f64()); - + info!("acquired permit {:?}", now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: permit? }) } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index d72229b029..41bd2f4956 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -94,12 +94,14 @@ impl Api { let body = match parse_body::(response).await { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. - Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => { + // TODO(anna): retry + Err(e) => { + if e.get_reason().is_not_found() { return Ok(AuthInfo::default()); + } else { + return Err(e.into()); } - _otherwise => return Err(e.into()), - }, + } }; let secret = if body.role_secret.is_empty() { @@ -328,19 +330,24 @@ async fn parse_body serde::Deserialize<'a>>( info!("request succeeded, processing the body"); return Ok(response.json().await?); } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); // Don't throw an error here because it's not as important // as the fact that the request itself has failed. - let body = response.json().await.unwrap_or_else(|e| { + let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { warn!("failed to parse error body: {e}"); ConsoleError { error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, } }); + body.http_status_code = status; - let text = body.error; - error!("console responded with an error ({status}): {text}"); - Err(ApiError::Console { status, text }) + error!("console responded with an error ({status}): {body:?}"); + Err(ApiError::Console(body)) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index ad48af0093..96683511fe 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -12,7 +12,7 @@ use crate::auth::backend::{ }; use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; -use crate::console::messages::MetricsAuxInfo; +use crate::console::messages::{ConsoleError, MetricsAuxInfo}; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; @@ -484,18 +484,20 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = console::errors::ApiError::Console { - status: http::StatusCode::FORBIDDEN, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: http::StatusCode::FORBIDDEN, + error: "TEST".into(), + status: None, + }); assert!(!err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = console::errors::ApiError::Console { - status: http::StatusCode::BAD_REQUEST, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: http::StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + }); assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 94b03e1ccc..c166cf4389 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,4 +1,5 @@ use crate::config::RetryConfig; +use crate::console::messages::ConsoleError; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; use crate::metrics::{ @@ -88,36 +89,76 @@ fn report_error(e: &WakeComputeError, retry: bool) { let kind = match e { WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - WakeupFailureKind::QuotaExceeded - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::UNPROCESSABLE_ENTITY, - ref text, - }) if text.contains("compute time quota of non-primary branches is exceeded") => { - WakeupFailureKind::QuotaExceeded - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => WakeupFailureKind::ApiConsoleLocked, - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. - }) => WakeupFailureKind::ApiConsoleBadRequest, - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - WakeupFailureKind::ApiConsoleOtherServerError - } - WakeComputeError::ApiError(ApiError::Console { .. }) => { - WakeupFailureKind::ApiConsoleOtherError - } + WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() { + crate::console::messages::Reason::RoleProtected => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::ResourceNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::ProjectNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::EndpointNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::BranchNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::RateLimitExceeded => { + WakeupFailureKind::ApiConsoleLocked + } + crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::ActiveTimeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::ComputeTimeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::WrittenDataQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::DataTransferQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::LogicalSizeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::Unknown => match e { + ConsoleError { + http_status_code: StatusCode::LOCKED, + ref error, + .. + } if error.contains("written data quota exceeded") + || error.contains("the limit for current plan reached") => + { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::UNPROCESSABLE_ENTITY, + ref error, + .. + } if error.contains("compute time quota of non-primary branches is exceeded") => { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::LOCKED, + .. + } => WakeupFailureKind::ApiConsoleLocked, + ConsoleError { + http_status_code: StatusCode::BAD_REQUEST, + .. + } => WakeupFailureKind::ApiConsoleBadRequest, + ConsoleError { + http_status_code, .. + } if http_status_code.is_server_error() => { + WakeupFailureKind::ApiConsoleOtherServerError + } + ConsoleError { .. } => WakeupFailureKind::ApiConsoleOtherError, + }, + }, WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, };