diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index dd54cd6ecd..08182aea07 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -24,6 +24,9 @@ pub enum ApiError { #[error("Precondition failed: {0}")] PreconditionFailed(Box), + #[error("Resource temporarily unavailable: {0}")] + ResourceUnavailable(String), + #[error("Shutting down")] ShuttingDown, @@ -59,6 +62,10 @@ impl ApiError { "Shutting down".to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e61a9dcf3f..4144eef018 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -132,7 +132,7 @@ impl From for ApiError { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } PageReconstructError::AncestorStopping(_) => { - ApiError::InternalServerError(anyhow::Error::new(pre)) + ApiError::ResourceUnavailable(format!("{pre}")) } PageReconstructError::WalRedo(pre) => { ApiError::InternalServerError(anyhow::Error::new(pre)) @@ -145,7 +145,7 @@ impl From for ApiError { fn from(tmie: TenantMapInsertError) -> ApiError { match tmie { TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { - ApiError::InternalServerError(anyhow::Error::new(tmie)) + ApiError::ResourceUnavailable(format!("{tmie}")) } TenantMapInsertError::TenantAlreadyExists(id, state) => { ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) @@ -159,6 +159,12 @@ impl From for ApiError { fn from(tse: TenantStateError) -> ApiError { match tse { TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), + TenantStateError::NotActive(_) => { + ApiError::ResourceUnavailable("Tenant not yet active".into()) + } + TenantStateError::IsStopping(_) => { + ApiError::ResourceUnavailable("Tenant is stopping".into()) + } _ => ApiError::InternalServerError(anyhow::Error::new(tse)), } } @@ -168,14 +174,17 @@ impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), - e @ GetTenantError::NotActive(_) => { + GetTenantError::Broken(reason) => { + ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) + } + GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does // in fact exist locally. If we did, the caller could draw the conclusion // that it can attach the tenant to another PS and we'd be in split-brain. // // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). - ApiError::InternalServerError(anyhow::Error::new(e)) + ApiError::ResourceUnavailable("Tenant not yet active".into()) } } } @@ -622,8 +631,9 @@ async fn tenant_list_handler( let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await - .map_err(anyhow::Error::new) - .map_err(ApiError::InternalServerError)? + .map_err(|_| { + ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string()) + })? .iter() .map(|(id, state)| TenantInfo { id: *id, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2a87ee0381..8c179a9add 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1265,7 +1265,10 @@ async fn get_active_tenant_with_timeout( Ok(tenant) => tenant, Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)), Err(GetTenantError::NotActive(_)) => { - unreachable!("we're calling get_tenant with active=false") + unreachable!("we're calling get_tenant with active_only=false") + } + Err(GetTenantError::Broken(_)) => { + unreachable!("we're calling get_tenant with active_only=false") } }; let wait_time = Duration::from_secs(30); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 17bcc9eb5f..ba86aefe44 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -510,6 +510,11 @@ pub enum GetTenantError { NotFound(TenantId), #[error("Tenant {0} is not active")] NotActive(TenantId), + /// Broken is logically a subset of NotActive, but a distinct error is useful as + /// NotActive is usually a retryable state for API purposes, whereas Broken + /// is a stuck error state + #[error("Tenant is broken: {0}")] + Broken(String), } /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. @@ -524,10 +529,20 @@ pub async fn get_tenant( let tenant = m .get(&tenant_id) .ok_or(GetTenantError::NotFound(tenant_id))?; - if active_only && !tenant.is_active() { - Err(GetTenantError::NotActive(tenant_id)) - } else { - Ok(Arc::clone(tenant)) + + match tenant.current_state() { + TenantState::Broken { + reason, + backtrace: _, + } if active_only => Err(GetTenantError::Broken(reason)), + TenantState::Active => Ok(Arc::clone(tenant)), + _ => { + if active_only { + Err(GetTenantError::NotActive(tenant_id)) + } else { + Ok(Arc::clone(tenant)) + } + } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 92e7cd06cd..c8b970100b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -37,6 +37,7 @@ from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal +from urllib3.util.retry import Retry from fixtures.broker import NeonBroker from fixtures.log_helper import log @@ -1651,11 +1652,14 @@ class NeonPageserver(PgProtocol): if '"testing"' not in self.version: pytest.skip("pageserver was built without 'testing' feature") - def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient: + def http_client( + self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + ) -> PageserverHttpClient: return PageserverHttpClient( port=self.service_port.http, auth_token=auth_token, is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, + retries=retries, ) @property diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 9fdcd22bc2..460a30ad56 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -7,6 +7,8 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry from fixtures.log_helper import log from fixtures.metrics import Metrics, parse_metrics @@ -113,12 +115,40 @@ class TenantConfig: class PageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): + def __init__( + self, + port: int, + is_testing_enabled_or_skip: Fn, + auth_token: Optional[str] = None, + retries: Optional[Retry] = None, + ): super().__init__() self.port = port self.auth_token = auth_token self.is_testing_enabled_or_skip = is_testing_enabled_or_skip + if retries is None: + # We apply a retry policy that is different to the default `requests` behavior, + # because the pageserver has various transiently unavailable states that benefit + # from a client retrying on 503 + + retries = Retry( + # Status retries are for retrying on 503 while e.g. waiting for tenants to activate + status=5, + # Connection retries are for waiting for the pageserver to come up and listen + connect=5, + # No read retries: if a request hangs that is not expected behavior + # (this may change in future if we do fault injection of a kind that causes + # requests TCP flows to stick) + read=False, + backoff_factor=0, + status_forcelist=[503], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + self.mount("http://", HTTPAdapter(max_retries=retries)) + if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 3af144c31c..c412809a3a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -34,6 +34,7 @@ from fixtures.remote_storage import ( ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until +from urllib3.util.retry import Retry def test_timeline_delete(neon_simple_env: NeonEnv): @@ -614,7 +615,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): child_timeline_id = env.neon_cli.create_branch("child", "main") - ps_http = env.pageserver.http_client() + ps_http = env.pageserver.http_client(retries=Retry(0, read=False)) failpoint_name = "persist_deleted_index_part" ps_http.configure_failpoints((failpoint_name, "pause")) @@ -854,7 +855,7 @@ def test_timeline_delete_resumed_on_attach( # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this - f".*InternalServerError\\(Tenant {tenant_id} is not active.*", + ".*Resource temporarily unavailable.*Tenant not yet active", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', ) )