mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 10:22:56 +00:00
pageserver: prefer 503 to 500 for transient unavailability (#5439)
## Problem The 500 status code should only be used for bugs or unrecoverable failures: situations we did not expect. Currently, the pageserver is misusing this response code for some situations that are totally normal, like requests targeting tenants that are in the process of activating. The 503 response is a convenient catch-all for "I can't right now, but I will be able to". ## Summary of changes - Change some transient availability error conditions to return 503 instead of 500 - Update the HTTP client configuration in integration tests to retry on 503 After these changes, things like creating a tenant and then trying to create a timeline within it will no longer require carefully checking its status first, or retrying on 500s. Instead, a client which is properly configured to retry on 503 can quietly handle such situations.
This commit is contained in:
@@ -24,6 +24,9 @@ pub enum ApiError {
|
||||
#[error("Precondition failed: {0}")]
|
||||
PreconditionFailed(Box<str>),
|
||||
|
||||
#[error("Resource temporarily unavailable: {0}")]
|
||||
ResourceUnavailable(String),
|
||||
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
|
||||
@@ -59,6 +62,10 @@ impl ApiError {
|
||||
"Shutting down".to_string(),
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
),
|
||||
ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -132,7 +132,7 @@ impl From<PageReconstructError> for ApiError {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::AncestorStopping(_) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(pre))
|
||||
ApiError::ResourceUnavailable(format!("{pre}"))
|
||||
}
|
||||
PageReconstructError::WalRedo(pre) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(pre))
|
||||
@@ -145,7 +145,7 @@ impl From<TenantMapInsertError> for ApiError {
|
||||
fn from(tmie: TenantMapInsertError) -> ApiError {
|
||||
match tmie {
|
||||
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(tmie))
|
||||
ApiError::ResourceUnavailable(format!("{tmie}"))
|
||||
}
|
||||
TenantMapInsertError::TenantAlreadyExists(id, state) => {
|
||||
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
|
||||
@@ -159,6 +159,12 @@ impl From<TenantStateError> for ApiError {
|
||||
fn from(tse: TenantStateError) -> ApiError {
|
||||
match tse {
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
TenantStateError::NotActive(_) => {
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
TenantStateError::IsStopping(_) => {
|
||||
ApiError::ResourceUnavailable("Tenant is stopping".into())
|
||||
}
|
||||
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
|
||||
}
|
||||
}
|
||||
@@ -168,14 +174,17 @@ impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
e @ GetTenantError::NotActive(_) => {
|
||||
GetTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
// in fact exist locally. If we did, the caller could draw the conclusion
|
||||
// that it can attach the tenant to another PS and we'd be in split-brain.
|
||||
//
|
||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -622,8 +631,9 @@ async fn tenant_list_handler(
|
||||
let response_data = mgr::list_tenants()
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
.map_err(|_| {
|
||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string())
|
||||
})?
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
|
||||
@@ -1265,7 +1265,10 @@ async fn get_active_tenant_with_timeout(
|
||||
Ok(tenant) => tenant,
|
||||
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
|
||||
Err(GetTenantError::NotActive(_)) => {
|
||||
unreachable!("we're calling get_tenant with active=false")
|
||||
unreachable!("we're calling get_tenant with active_only=false")
|
||||
}
|
||||
Err(GetTenantError::Broken(_)) => {
|
||||
unreachable!("we're calling get_tenant with active_only=false")
|
||||
}
|
||||
};
|
||||
let wait_time = Duration::from_secs(30);
|
||||
|
||||
@@ -510,6 +510,11 @@ pub enum GetTenantError {
|
||||
NotFound(TenantId),
|
||||
#[error("Tenant {0} is not active")]
|
||||
NotActive(TenantId),
|
||||
/// Broken is logically a subset of NotActive, but a distinct error is useful as
|
||||
/// NotActive is usually a retryable state for API purposes, whereas Broken
|
||||
/// is a stuck error state
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
@@ -524,10 +529,20 @@ pub async fn get_tenant(
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.ok_or(GetTenantError::NotFound(tenant_id))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
Err(GetTenantError::NotActive(tenant_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} if active_only => Err(GetTenantError::Broken(reason)),
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ from psycopg2.extensions import connection as PgConnection
|
||||
from psycopg2.extensions import cursor as PgCursor
|
||||
from psycopg2.extensions import make_dsn, parse_dsn
|
||||
from typing_extensions import Literal
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures.broker import NeonBroker
|
||||
from fixtures.log_helper import log
|
||||
@@ -1651,11 +1652,14 @@ class NeonPageserver(PgProtocol):
|
||||
if '"testing"' not in self.version:
|
||||
pytest.skip("pageserver was built without 'testing' feature")
|
||||
|
||||
def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient:
|
||||
def http_client(
|
||||
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
|
||||
) -> PageserverHttpClient:
|
||||
return PageserverHttpClient(
|
||||
port=self.service_port.http,
|
||||
auth_token=auth_token,
|
||||
is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
|
||||
retries=retries,
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
@@ -7,6 +7,8 @@ from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, parse_metrics
|
||||
@@ -113,12 +115,40 @@ class TenantConfig:
|
||||
|
||||
|
||||
class PageserverHttpClient(requests.Session):
|
||||
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
|
||||
def __init__(
|
||||
self,
|
||||
port: int,
|
||||
is_testing_enabled_or_skip: Fn,
|
||||
auth_token: Optional[str] = None,
|
||||
retries: Optional[Retry] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.port = port
|
||||
self.auth_token = auth_token
|
||||
self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
|
||||
|
||||
if retries is None:
|
||||
# We apply a retry policy that is different to the default `requests` behavior,
|
||||
# because the pageserver has various transiently unavailable states that benefit
|
||||
# from a client retrying on 503
|
||||
|
||||
retries = Retry(
|
||||
# Status retries are for retrying on 503 while e.g. waiting for tenants to activate
|
||||
status=5,
|
||||
# Connection retries are for waiting for the pageserver to come up and listen
|
||||
connect=5,
|
||||
# No read retries: if a request hangs that is not expected behavior
|
||||
# (this may change in future if we do fault injection of a kind that causes
|
||||
# requests TCP flows to stick)
|
||||
read=False,
|
||||
backoff_factor=0,
|
||||
status_forcelist=[503],
|
||||
allowed_methods=None,
|
||||
remove_headers_on_redirect=[],
|
||||
)
|
||||
|
||||
self.mount("http://", HTTPAdapter(max_retries=retries))
|
||||
|
||||
if auth_token is not None:
|
||||
self.headers["Authorization"] = f"Bearer {auth_token}"
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ from fixtures.remote_storage import (
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar, run_pg_bench_small, wait_until
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
|
||||
def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
@@ -614,7 +615,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
child_timeline_id = env.neon_cli.create_branch("child", "main")
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
ps_http = env.pageserver.http_client(retries=Retry(0, read=False))
|
||||
|
||||
failpoint_name = "persist_deleted_index_part"
|
||||
ps_http.configure_failpoints((failpoint_name, "pause"))
|
||||
@@ -854,7 +855,7 @@ def test_timeline_delete_resumed_on_attach(
|
||||
# error from http response is also logged
|
||||
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
|
||||
# Polling after attach may fail with this
|
||||
f".*InternalServerError\\(Tenant {tenant_id} is not active.*",
|
||||
".*Resource temporarily unavailable.*Tenant not yet active",
|
||||
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user