mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-15 04:00:38 +00:00
## Problem
If there's a quota error, it makes sense to cache it for a short window
of time. Many clients do not handle database connection errors
gracefully, so just spam retry 🤡
## Summary of changes
Updates the node_info cache to support storing console errors. Store
console errors if they cannot be retried (using our own heuristic.
should only trigger for quota exceeded errors).
448 lines
15 KiB
Rust
448 lines
15 KiB
Rust
use measured::FixedCardinalityLabel;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::fmt::{self, Display};
|
|
|
|
use crate::auth::IpPattern;
|
|
|
|
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
|
|
use crate::proxy::retry::CouldRetry;
|
|
|
|
/// Generic error response with human-readable description.
|
|
/// Note that we can't always present it to user as is.
|
|
#[derive(Debug, Deserialize, Clone)]
|
|
pub struct ConsoleError {
|
|
pub error: Box<str>,
|
|
#[serde(skip)]
|
|
pub http_status_code: http::StatusCode,
|
|
pub status: Option<Status>,
|
|
}
|
|
|
|
impl ConsoleError {
|
|
pub fn get_reason(&self) -> Reason {
|
|
self.status
|
|
.as_ref()
|
|
.and_then(|s| s.details.error_info.as_ref())
|
|
.map(|e| e.reason)
|
|
.unwrap_or(Reason::Unknown)
|
|
}
|
|
pub fn get_user_facing_message(&self) -> String {
|
|
use super::provider::errors::REQUEST_FAILED;
|
|
self.status
|
|
.as_ref()
|
|
.and_then(|s| s.details.user_facing_message.as_ref())
|
|
.map(|m| m.message.clone().into())
|
|
.unwrap_or_else(|| {
|
|
// Ask @neondatabase/control-plane for review before adding more.
|
|
match self.http_status_code {
|
|
http::StatusCode::NOT_FOUND => {
|
|
// Status 404: failed to get a project-related resource.
|
|
format!("{REQUEST_FAILED}: endpoint cannot be found")
|
|
}
|
|
http::StatusCode::NOT_ACCEPTABLE => {
|
|
// Status 406: endpoint is disabled (we don't allow connections).
|
|
format!("{REQUEST_FAILED}: endpoint is disabled")
|
|
}
|
|
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
|
|
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
|
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
|
|
}
|
|
_ => REQUEST_FAILED.to_owned(),
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
impl Display for ConsoleError {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
let msg = self
|
|
.status
|
|
.as_ref()
|
|
.and_then(|s| s.details.user_facing_message.as_ref())
|
|
.map(|m| m.message.as_ref())
|
|
.unwrap_or_else(|| &self.error);
|
|
write!(f, "{}", msg)
|
|
}
|
|
}
|
|
|
|
impl CouldRetry for ConsoleError {
|
|
fn could_retry(&self) -> bool {
|
|
// If the error message does not have a status,
|
|
// the error is unknown and probably should not retry automatically
|
|
let Some(status) = &self.status else {
|
|
return false;
|
|
};
|
|
|
|
// retry if the retry info is set.
|
|
if status.details.retry_info.is_some() {
|
|
return true;
|
|
}
|
|
|
|
// if no retry info set, attempt to use the error code to guess the retry state.
|
|
let reason = status
|
|
.details
|
|
.error_info
|
|
.map_or(Reason::Unknown, |e| e.reason);
|
|
|
|
reason.can_retry()
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Clone)]
|
|
pub struct Status {
|
|
pub code: Box<str>,
|
|
pub message: Box<str>,
|
|
pub details: Details,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Clone)]
|
|
pub struct Details {
|
|
pub error_info: Option<ErrorInfo>,
|
|
pub retry_info: Option<RetryInfo>,
|
|
pub user_facing_message: Option<UserFacingMessage>,
|
|
}
|
|
|
|
#[derive(Copy, Clone, Debug, Deserialize)]
|
|
pub struct ErrorInfo {
|
|
pub reason: Reason,
|
|
// Schema could also have `metadata` field, but it's not structured. Skip it for now.
|
|
}
|
|
|
|
#[derive(Clone, Copy, Debug, Deserialize, Default)]
|
|
pub enum Reason {
|
|
/// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles.
|
|
#[serde(rename = "ROLE_PROTECTED")]
|
|
RoleProtected,
|
|
/// ResourceNotFound indicates that a resource (project, endpoint, branch, etc.) wasn't found,
|
|
/// usually due to the provided ID not being correct or because the subject doesn't have enough permissions to
|
|
/// access the requested resource.
|
|
/// Prefer a more specific reason if possible, e.g., ProjectNotFound, EndpointNotFound, etc.
|
|
#[serde(rename = "RESOURCE_NOT_FOUND")]
|
|
ResourceNotFound,
|
|
/// ProjectNotFound indicates that the project wasn't found, usually due to the provided ID not being correct,
|
|
/// or that the subject doesn't have enough permissions to access the requested project.
|
|
#[serde(rename = "PROJECT_NOT_FOUND")]
|
|
ProjectNotFound,
|
|
/// EndpointNotFound indicates that the endpoint wasn't found, usually due to the provided ID not being correct,
|
|
/// or that the subject doesn't have enough permissions to access the requested endpoint.
|
|
#[serde(rename = "ENDPOINT_NOT_FOUND")]
|
|
EndpointNotFound,
|
|
/// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct,
|
|
/// or that the subject doesn't have enough permissions to access the requested branch.
|
|
#[serde(rename = "BRANCH_NOT_FOUND")]
|
|
BranchNotFound,
|
|
/// RateLimitExceeded indicates that the rate limit for the operation has been exceeded.
|
|
#[serde(rename = "RATE_LIMIT_EXCEEDED")]
|
|
RateLimitExceeded,
|
|
/// NonDefaultBranchComputeTimeExceeded indicates that the compute time quota of non-default branches has been
|
|
/// exceeded.
|
|
#[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")]
|
|
NonDefaultBranchComputeTimeExceeded,
|
|
/// ActiveTimeQuotaExceeded indicates that the active time quota was exceeded.
|
|
#[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")]
|
|
ActiveTimeQuotaExceeded,
|
|
/// ComputeTimeQuotaExceeded indicates that the compute time quota was exceeded.
|
|
#[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")]
|
|
ComputeTimeQuotaExceeded,
|
|
/// WrittenDataQuotaExceeded indicates that the written data quota was exceeded.
|
|
#[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")]
|
|
WrittenDataQuotaExceeded,
|
|
/// DataTransferQuotaExceeded indicates that the data transfer quota was exceeded.
|
|
#[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")]
|
|
DataTransferQuotaExceeded,
|
|
/// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded.
|
|
#[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
|
|
LogicalSizeQuotaExceeded,
|
|
/// RunningOperations indicates that the project already has some running operations
|
|
/// and scheduling of new ones is prohibited.
|
|
#[serde(rename = "RUNNING_OPERATIONS")]
|
|
RunningOperations,
|
|
/// ConcurrencyLimitReached indicates that the concurrency limit for an action was reached.
|
|
#[serde(rename = "CONCURRENCY_LIMIT_REACHED")]
|
|
ConcurrencyLimitReached,
|
|
/// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken.
|
|
#[serde(rename = "LOCK_ALREADY_TAKEN")]
|
|
LockAlreadyTaken,
|
|
#[default]
|
|
#[serde(other)]
|
|
Unknown,
|
|
}
|
|
|
|
impl Reason {
|
|
pub fn is_not_found(&self) -> bool {
|
|
matches!(
|
|
self,
|
|
Reason::ResourceNotFound
|
|
| Reason::ProjectNotFound
|
|
| Reason::EndpointNotFound
|
|
| Reason::BranchNotFound
|
|
)
|
|
}
|
|
|
|
pub fn can_retry(&self) -> bool {
|
|
match self {
|
|
// do not retry role protected errors
|
|
// not a transitive error
|
|
Reason::RoleProtected => false,
|
|
// on retry, it will still not be found
|
|
Reason::ResourceNotFound
|
|
| Reason::ProjectNotFound
|
|
| Reason::EndpointNotFound
|
|
| Reason::BranchNotFound => false,
|
|
// we were asked to go away
|
|
Reason::RateLimitExceeded
|
|
| Reason::NonDefaultBranchComputeTimeExceeded
|
|
| Reason::ActiveTimeQuotaExceeded
|
|
| Reason::ComputeTimeQuotaExceeded
|
|
| Reason::WrittenDataQuotaExceeded
|
|
| Reason::DataTransferQuotaExceeded
|
|
| Reason::LogicalSizeQuotaExceeded => false,
|
|
// transitive error. control plane is currently busy
|
|
// but might be ready soon
|
|
Reason::RunningOperations
|
|
| Reason::ConcurrencyLimitReached
|
|
| Reason::LockAlreadyTaken => true,
|
|
// unknown error. better not retry it.
|
|
Reason::Unknown => false,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Copy, Clone, Debug, Deserialize)]
|
|
pub struct RetryInfo {
|
|
pub retry_delay_ms: u64,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Clone)]
|
|
pub struct UserFacingMessage {
|
|
pub message: Box<str>,
|
|
}
|
|
|
|
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
|
|
/// Returned by the `/proxy_get_role_secret` API method.
|
|
#[derive(Deserialize)]
|
|
pub struct GetRoleSecret {
|
|
pub role_secret: Box<str>,
|
|
pub allowed_ips: Option<Vec<IpPattern>>,
|
|
pub project_id: Option<ProjectIdInt>,
|
|
}
|
|
|
|
// Manually implement debug to omit sensitive info.
|
|
impl fmt::Debug for GetRoleSecret {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
f.debug_struct("GetRoleSecret").finish_non_exhaustive()
|
|
}
|
|
}
|
|
|
|
/// Response which holds compute node's `host:port` pair.
|
|
/// Returned by the `/proxy_wake_compute` API method.
|
|
#[derive(Debug, Deserialize)]
|
|
pub struct WakeCompute {
|
|
pub address: Box<str>,
|
|
pub aux: MetricsAuxInfo,
|
|
}
|
|
|
|
/// Async response which concludes the link auth flow.
|
|
/// Also known as `kickResponse` in the console.
|
|
#[derive(Debug, Deserialize)]
|
|
pub struct KickSession<'a> {
|
|
/// Session ID is assigned by the proxy.
|
|
pub session_id: &'a str,
|
|
|
|
/// Compute node connection params.
|
|
#[serde(deserialize_with = "KickSession::parse_db_info")]
|
|
pub result: DatabaseInfo,
|
|
}
|
|
|
|
impl KickSession<'_> {
|
|
fn parse_db_info<'de, D>(des: D) -> Result<DatabaseInfo, D::Error>
|
|
where
|
|
D: serde::Deserializer<'de>,
|
|
{
|
|
#[derive(Deserialize)]
|
|
enum Wrapper {
|
|
// Currently, console only reports `Success`.
|
|
// `Failure(String)` used to be here... RIP.
|
|
Success(DatabaseInfo),
|
|
}
|
|
|
|
Wrapper::deserialize(des).map(|x| match x {
|
|
Wrapper::Success(info) => info,
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Compute node connection params.
|
|
#[derive(Deserialize)]
|
|
pub struct DatabaseInfo {
|
|
pub host: Box<str>,
|
|
pub port: u16,
|
|
pub dbname: Box<str>,
|
|
pub user: Box<str>,
|
|
/// Console always provides a password, but it might
|
|
/// be inconvenient for debug with local PG instance.
|
|
pub password: Option<Box<str>>,
|
|
pub aux: MetricsAuxInfo,
|
|
}
|
|
|
|
// Manually implement debug to omit sensitive info.
|
|
impl fmt::Debug for DatabaseInfo {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
f.debug_struct("DatabaseInfo")
|
|
.field("host", &self.host)
|
|
.field("port", &self.port)
|
|
.field("dbname", &self.dbname)
|
|
.field("user", &self.user)
|
|
.finish_non_exhaustive()
|
|
}
|
|
}
|
|
|
|
/// Various labels for prometheus metrics.
|
|
/// Also known as `ProxyMetricsAuxInfo` in the console.
|
|
#[derive(Debug, Deserialize, Clone)]
|
|
pub struct MetricsAuxInfo {
|
|
pub endpoint_id: EndpointIdInt,
|
|
pub project_id: ProjectIdInt,
|
|
pub branch_id: BranchIdInt,
|
|
#[serde(default)]
|
|
pub cold_start_info: ColdStartInfo,
|
|
}
|
|
|
|
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum ColdStartInfo {
|
|
#[default]
|
|
Unknown,
|
|
/// Compute was already running
|
|
Warm,
|
|
#[serde(rename = "pool_hit")]
|
|
#[label(rename = "pool_hit")]
|
|
/// Compute was not running but there was an available VM
|
|
VmPoolHit,
|
|
#[serde(rename = "pool_miss")]
|
|
#[label(rename = "pool_miss")]
|
|
/// Compute was not running and there were no VMs available
|
|
VmPoolMiss,
|
|
|
|
// not provided by control plane
|
|
/// Connection available from HTTP pool
|
|
HttpPoolHit,
|
|
/// Cached connection info
|
|
WarmCached,
|
|
}
|
|
|
|
impl ColdStartInfo {
|
|
pub fn as_str(&self) -> &'static str {
|
|
match self {
|
|
ColdStartInfo::Unknown => "unknown",
|
|
ColdStartInfo::Warm => "warm",
|
|
ColdStartInfo::VmPoolHit => "pool_hit",
|
|
ColdStartInfo::VmPoolMiss => "pool_miss",
|
|
ColdStartInfo::HttpPoolHit => "http_pool_hit",
|
|
ColdStartInfo::WarmCached => "warm_cached",
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use serde_json::json;
|
|
|
|
fn dummy_aux() -> serde_json::Value {
|
|
json!({
|
|
"endpoint_id": "endpoint",
|
|
"project_id": "project",
|
|
"branch_id": "branch",
|
|
"cold_start_info": "unknown",
|
|
})
|
|
}
|
|
|
|
#[test]
|
|
fn parse_kick_session() -> anyhow::Result<()> {
|
|
// This is what the console's kickResponse looks like.
|
|
let json = json!({
|
|
"session_id": "deadbeef",
|
|
"result": {
|
|
"Success": {
|
|
"host": "localhost",
|
|
"port": 5432,
|
|
"dbname": "postgres",
|
|
"user": "john_doe",
|
|
"password": "password",
|
|
"aux": dummy_aux(),
|
|
}
|
|
}
|
|
});
|
|
let _: KickSession = serde_json::from_str(&json.to_string())?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn parse_db_info() -> anyhow::Result<()> {
|
|
// with password
|
|
let _: DatabaseInfo = serde_json::from_value(json!({
|
|
"host": "localhost",
|
|
"port": 5432,
|
|
"dbname": "postgres",
|
|
"user": "john_doe",
|
|
"password": "password",
|
|
"aux": dummy_aux(),
|
|
}))?;
|
|
|
|
// without password
|
|
let _: DatabaseInfo = serde_json::from_value(json!({
|
|
"host": "localhost",
|
|
"port": 5432,
|
|
"dbname": "postgres",
|
|
"user": "john_doe",
|
|
"aux": dummy_aux(),
|
|
}))?;
|
|
|
|
// new field (forward compatibility)
|
|
let _: DatabaseInfo = serde_json::from_value(json!({
|
|
"host": "localhost",
|
|
"port": 5432,
|
|
"dbname": "postgres",
|
|
"user": "john_doe",
|
|
"project": "hello_world",
|
|
"N.E.W": "forward compatibility check",
|
|
"aux": dummy_aux(),
|
|
}))?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn parse_wake_compute() -> anyhow::Result<()> {
|
|
let json = json!({
|
|
"address": "0.0.0.0",
|
|
"aux": dummy_aux(),
|
|
});
|
|
let _: WakeCompute = serde_json::from_str(&json.to_string())?;
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn parse_get_role_secret() -> anyhow::Result<()> {
|
|
// Empty `allowed_ips` field.
|
|
let json = json!({
|
|
"role_secret": "secret",
|
|
});
|
|
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
|
let json = json!({
|
|
"role_secret": "secret",
|
|
"allowed_ips": ["8.8.8.8"],
|
|
});
|
|
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
|
let json = json!({
|
|
"role_secret": "secret",
|
|
"allowed_ips": ["8.8.8.8"],
|
|
"project_id": "project",
|
|
});
|
|
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
|
|
|
Ok(())
|
|
}
|
|
}
|