From 37ecebe45bfc0572c66b4e9f1fa27b2699f28812 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 May 2023 11:37:12 +0200 Subject: [PATCH] mgr::get_tenant: distinguished error type (#4300) Before this patch, it would use error type `TenantStateError` which has many more error variants than can actually happen with `mgr::get_tenant`. Along the way, I also introduced `SetNewTenantConfigError` because it uses `mgr::get_tenant` and also can only fail in much fewer ways than `TenantStateError` suggests. The new `page_service.rs`'s `GetActiveTimelineError` and `GetActiveTenantError` types were necessary to avoid an `Other` variant on the `GetTenantError`. This patch is a by-product of reading code that subscribes to `Tenant::state` changes. Can't really connect it to any given project. --- pageserver/src/http/routes.rs | 36 ++++++++++++++++++++++++-- pageserver/src/page_service.rs | 43 ++++++++++++++++++++++++++----- pageserver/src/tenant.rs | 47 +++++++++++++++++++++++++++++----- pageserver/src/tenant/mgr.rs | 29 ++++++++++++++++----- 4 files changed, 133 insertions(+), 22 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 83d478ac3d..c530952aaf 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -24,7 +24,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; -use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; +use crate::tenant::mgr::{ + GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError, +}; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; @@ -140,6 +142,36 @@ impl From for ApiError { } } +impl From<GetTenantError> for ApiError { + fn from(tse: GetTenantError) -> ApiError { + match tse { +
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + e @ GetTenantError::NotActive(_) => { + // Why is this not `ApiError::NotFound`? + // Because we must be careful to never return 404 for a tenant if it does + // in fact exist locally. If we did, the caller could draw the conclusion + // that it can attach the tenant to another PS and we'd be in split-brain. + // + // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). + ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + +impl From<SetNewTenantConfigError> for ApiError { + fn from(e: SetNewTenantConfigError) -> ApiError { + match e { + SetNewTenantConfigError::GetTenant(tid) => { + ApiError::NotFound(anyhow!("tenant {}", tid)) + } + e @ SetNewTenantConfigError::Persist(_) => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + impl From<crate::tenant::DeleteTimelineError> for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; @@ -159,7 +191,7 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError { match value { // Report Precondition failed so client can distinguish between // "tenant is missing" case from "timeline is missing" - Tenant(TenantStateError::NotFound(..)) => { + Tenant(GetTenantError::NotFound(..)) => { ApiError::PreconditionFailed("Requested tenant is missing") } Tenant(t) => ApiError::from(t), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bd3ece2dfc..fd442783f9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant; use crate::tenant::mgr; +use crate::tenant::mgr::GetTenantError; use crate::tenant::{Tenant, Timeline}; use crate::trace::Tracer; @@ -1131,7 +1133,9 @@ enum GetActiveTenantError { wait_time: Duration, }, #[error(transparent)] - Other(#[from] anyhow::Error),
+ NotFound(GetTenantError), + #[error(transparent)] + WaitTenantActive(tenant::WaitToBecomeActiveError), } impl From<GetActiveTenantError> for QueryError { @@ -1140,7 +1144,8 @@ impl From<GetActiveTenantError> for QueryError { GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), - GetActiveTenantError::Other(e) => QueryError::Other(e), + GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)), + GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)), } } } @@ -1156,13 +1161,16 @@ async fn get_active_tenant_with_timeout( ) -> Result<Arc<Tenant>, GetActiveTenantError> { let tenant = match mgr::get_tenant(tenant_id, false).await { Ok(tenant) => tenant, - Err(e) => return Err(GetActiveTenantError::Other(e.into())), + Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)), + Err(GetTenantError::NotActive(_)) => { + unreachable!("we're calling get_tenant with active=false") + } }; let wait_time = Duration::from_secs(30); match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { Ok(Ok(())) => Ok(tenant), // no .context(), the error message is good enough and some tests depend on it - Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)), Err(_) => { let latest_state = tenant.current_state(); if latest_state == TenantState::Active { @@ -1177,13 +1185,34 @@ async fn get_active_tenant_with_timeout( } } +#[derive(Debug, thiserror::Error)] +enum GetActiveTimelineError { + #[error(transparent)] + Tenant(GetActiveTenantError), + #[error(transparent)] + Timeline(anyhow::Error), +} + +impl From<GetActiveTimelineError> for QueryError { + fn from(e: GetActiveTimelineError) -> Self { + match e { + GetActiveTimelineError::Tenant(e) => e.into(), + GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + } + } +} + /// Shorthand for getting a reference to a Timeline of an
Active tenant. async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> Result<Arc<Timeline>, GetActiveTenantError> { - let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; - let timeline = tenant.get_timeline(timeline_id, true)?; +) -> Result<Arc<Timeline>, GetActiveTimelineError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx) + .await + .map_err(GetActiveTimelineError::Tenant)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; Ok(timeline) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e75d9f0d26..6806b2c99d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -450,6 +450,34 @@ struct RemoteStartupData { remote_metadata: TimelineMetadata, } +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitToBecomeActiveError { + WillNotBecomeActive { + tenant_id: TenantId, + state: TenantState, + }, + TenantDropped { + tenant_id: TenantId, + }, +} + +impl std::fmt::Display for WaitToBecomeActiveError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => { + write!( + f, + "Tenant {} will not become active. Current state: {:?}", + tenant_id, state + ) + } + WaitToBecomeActiveError::TenantDropped { tenant_id } => { + write!(f, "Tenant {tenant_id} will not become active (dropped)") + } + } + } +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -1753,25 +1781,30 @@ impl Tenant { self.state.subscribe() } - pub async fn wait_to_become_active(&self) -> anyhow::Result<()> { + pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Loading | TenantState::Attaching => { // in these states, there's a chance that we can reach ::Active - receiver.changed().await?; + receiver.changed().await.map_err( + |_e: tokio::sync::watch::error::RecvError| { + WaitToBecomeActiveError::TenantDropped { + tenant_id: self.tenant_id, + } + }, + )?; } TenantState::Active { .. } => { return Ok(()); } TenantState::Broken { .. } | TenantState::Stopping => { // There's no chance the tenant can transition back into ::Active - anyhow::bail!( - "Tenant {} will not become active. Current state: {:?}", - self.tenant_id, - &current_state, - ); + return Err(WaitToBecomeActiveError::WillNotBecomeActive { + tenant_id: self.tenant_id, + state: current_state, + }); } } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 53d69a15dc..fa9769b0f8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -300,11 +300,19 @@ pub async fn create_tenant( }).await } +#[derive(Debug, thiserror::Error)] +pub enum SetNewTenantConfigError { + #[error(transparent)] + GetTenant(#[from] GetTenantError), + #[error(transparent)] + Persist(anyhow::Error), +} + pub async fn set_new_tenant_config( conf: &'static PageServerConf, new_tenant_conf: TenantConfOpt, tenant_id: TenantId, -) -> Result<(), TenantStateError> { +) -> Result<(), SetNewTenantConfigError> { info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; @@ -314,23 +322,32 @@ pub async fn set_new_tenant_config( &tenant_config_path, new_tenant_conf, false, - )?; + ) + .map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf); Ok(()) } +#[derive(Debug, thiserror::Error)] +pub enum GetTenantError { + #[error("Tenant {0} not found")] + NotFound(TenantId), + #[error("Tenant {0} is not active")] + NotActive(TenantId), +} + /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. pub async fn get_tenant( tenant_id: TenantId, active_only: bool, -) -> Result<Arc<Tenant>, TenantStateError> { +) -> Result<Arc<Tenant>, GetTenantError> { let m = TENANTS.read().await; let tenant = m .get(&tenant_id) - .ok_or(TenantStateError::NotFound(tenant_id))?; + .ok_or(GetTenantError::NotFound(tenant_id))?; if active_only && !tenant.is_active() { - Err(TenantStateError::NotActive(tenant_id)) + Err(GetTenantError::NotActive(tenant_id)) } else { Ok(Arc::clone(tenant)) } @@ -339,7 +356,7 @@ pub async fn get_tenant( #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("Tenant {0}")] - Tenant(#[from] TenantStateError), + Tenant(#[from] GetTenantError), #[error("Timeline {0}")] Timeline(#[from] crate::tenant::DeleteTimelineError),