From ace0c775fc42a1ce6d3d2644ae1739a978d2fbaa Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Oct 2023 17:00:55 +0100 Subject: [PATCH] pageserver: prefer 503 to 500 for transient unavailability (#5439) ## Problem The 500 status code should only be used for bugs or unrecoverable failures: situations we did not expect. Currently, the pageserver is misusing this response code for some situations that are totally normal, like requests targeting tenants that are in the process of activating. The 503 response is a convenient catch-all for "I can't right now, but I will be able to". ## Summary of changes - Change some transient availability error conditions to return 503 instead of 500 - Update the HTTP client configuration in integration tests to retry on 503 After these changes, things like creating a tenant and then trying to create a timeline within it will no longer require carefully checking its status first, or retrying on 500s. Instead, a client which is properly configured to retry on 503 can quietly handle such situations. --- libs/utils/src/http/error.rs | 7 +++++ pageserver/src/http/routes.rs | 22 ++++++++++---- pageserver/src/page_service.rs | 5 +++- pageserver/src/tenant/mgr.rs | 23 ++++++++++++--- test_runner/fixtures/neon_fixtures.py | 6 +++- test_runner/fixtures/pageserver/http.py | 32 ++++++++++++++++++++- test_runner/regress/test_timeline_delete.py | 5 ++-- 7 files changed, 85 insertions(+), 15 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index dd54cd6ecd..08182aea07 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -24,6 +24,9 @@ pub enum ApiError { #[error("Precondition failed: {0}")] PreconditionFailed(Box), + #[error("Resource temporarily unavailable: {0}")] + ResourceUnavailable(String), + #[error("Shutting down")] ShuttingDown, @@ -59,6 +62,10 @@ impl ApiError { "Shutting down".to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e61a9dcf3f..4144eef018 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -132,7 +132,7 @@ impl From for ApiError { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } PageReconstructError::AncestorStopping(_) => { - ApiError::InternalServerError(anyhow::Error::new(pre)) + ApiError::ResourceUnavailable(format!("{pre}")) } PageReconstructError::WalRedo(pre) => { ApiError::InternalServerError(anyhow::Error::new(pre)) @@ -145,7 +145,7 @@ impl From for ApiError { fn from(tmie: TenantMapInsertError) -> ApiError { match tmie { TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { - ApiError::InternalServerError(anyhow::Error::new(tmie)) + ApiError::ResourceUnavailable(format!("{tmie}")) } TenantMapInsertError::TenantAlreadyExists(id, state) => { ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) @@ -159,6 +159,12 @@ impl From for ApiError { fn from(tse: TenantStateError) -> ApiError { match tse { TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), + TenantStateError::NotActive(_) => { + ApiError::ResourceUnavailable("Tenant not yet active".into()) + } + TenantStateError::IsStopping(_) => { + ApiError::ResourceUnavailable("Tenant is stopping".into()) + } _ => ApiError::InternalServerError(anyhow::Error::new(tse)), } } @@ -168,14 +174,17 @@ impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), - e @ GetTenantError::NotActive(_) => { + GetTenantError::Broken(reason) => { + ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) + } + GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does // in fact exist locally. If we did, the caller could draw the conclusion // that it can attach the tenant to another PS and we'd be in split-brain. // // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). - ApiError::InternalServerError(anyhow::Error::new(e)) + ApiError::ResourceUnavailable("Tenant not yet active".into()) } } } @@ -622,8 +631,9 @@ async fn tenant_list_handler( let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await - .map_err(anyhow::Error::new) - .map_err(ApiError::InternalServerError)? + .map_err(|_| { + ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string()) + })? .iter() .map(|(id, state)| TenantInfo { id: *id, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2a87ee0381..8c179a9add 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1265,7 +1265,10 @@ async fn get_active_tenant_with_timeout( Ok(tenant) => tenant, Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)), Err(GetTenantError::NotActive(_)) => { - unreachable!("we're calling get_tenant with active=false") + unreachable!("we're calling get_tenant with active_only=false") + } + Err(GetTenantError::Broken(_)) => { + unreachable!("we're calling get_tenant with active_only=false") } }; let wait_time = Duration::from_secs(30); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 17bcc9eb5f..ba86aefe44 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -510,6 +510,11 @@ pub enum GetTenantError { NotFound(TenantId), #[error("Tenant {0} is not active")] NotActive(TenantId), + /// Broken is logically a subset of NotActive, but a distinct error is useful as + /// NotActive is usually a retryable state for API purposes, whereas Broken + /// is a stuck error state + #[error("Tenant is broken: {0}")] + Broken(String), } /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. @@ -524,10 +529,20 @@ pub async fn get_tenant( let tenant = m .get(&tenant_id) .ok_or(GetTenantError::NotFound(tenant_id))?; - if active_only && !tenant.is_active() { - Err(GetTenantError::NotActive(tenant_id)) - } else { - Ok(Arc::clone(tenant)) + + match tenant.current_state() { + TenantState::Broken { + reason, + backtrace: _, + } if active_only => Err(GetTenantError::Broken(reason)), + TenantState::Active => Ok(Arc::clone(tenant)), + _ => { + if active_only { + Err(GetTenantError::NotActive(tenant_id)) + } else { + Ok(Arc::clone(tenant)) + } + } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 92e7cd06cd..c8b970100b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -37,6 +37,7 @@ from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal +from urllib3.util.retry import Retry from fixtures.broker import NeonBroker from fixtures.log_helper import log @@ -1651,11 +1652,14 @@ class NeonPageserver(PgProtocol): if '"testing"' not in self.version: pytest.skip("pageserver was built without 'testing' feature") - def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient: + def http_client( + self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + ) -> PageserverHttpClient: return PageserverHttpClient( port=self.service_port.http, auth_token=auth_token, is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, + retries=retries, ) @property diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 9fdcd22bc2..460a30ad56 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -7,6 +7,8 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry from fixtures.log_helper import log from fixtures.metrics import Metrics, parse_metrics @@ -113,12 +115,40 @@ class TenantConfig: class PageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): + def __init__( + self, + port: int, + is_testing_enabled_or_skip: Fn, + auth_token: Optional[str] = None, + retries: Optional[Retry] = None, + ): super().__init__() self.port = port self.auth_token = auth_token self.is_testing_enabled_or_skip = is_testing_enabled_or_skip + if retries is None: + # We apply a retry policy that is different to the default `requests` behavior, + # because the pageserver has various transiently unavailable states that benefit + # from a client retrying on 503 + + retries = Retry( + # Status retries are for retrying on 503 while e.g. waiting for tenants to activate + status=5, + # Connection retries are for waiting for the pageserver to come up and listen + connect=5, + # No read retries: if a request hangs that is not expected behavior + # (this may change in future if we do fault injection of a kind that causes + # requests TCP flows to stick) + read=False, + backoff_factor=0, + status_forcelist=[503], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + self.mount("http://", HTTPAdapter(max_retries=retries)) + if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 3af144c31c..c412809a3a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -34,6 +34,7 @@ from fixtures.remote_storage import ( ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until +from urllib3.util.retry import Retry def test_timeline_delete(neon_simple_env: NeonEnv): @@ -614,7 +615,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): child_timeline_id = env.neon_cli.create_branch("child", "main") - ps_http = env.pageserver.http_client() + ps_http = env.pageserver.http_client(retries=Retry(0, read=False)) failpoint_name = "persist_deleted_index_part" ps_http.configure_failpoints((failpoint_name, "pause")) @@ -854,7 +855,7 @@ def test_timeline_delete_resumed_on_attach( # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this - f".*InternalServerError\\(Tenant {tenant_id} is not active.*", + ".*Resource temporarily unavailable.*Tenant not yet active", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', ) )