mgr::get_tenant: distinguished error type (#4300)

Before this patch, it would use error type `TenantStateError` which has
many more error variants than can actually happen with
`mgr::get_tenant`.

Along the way, I also introduced `SetNewTenantConfigError` because it
uses `mgr::get_tenant` and also can only fail in much fewer ways than
`TenantStateError` suggests.

The new `page_service.rs`'s `GetActiveTimelineError` and
`GetActiveTenantError` types were necessary to avoid an `Other` variant
on the `GetTenantError`.

This patch is a by-product of reading code that subscribes to
`Tenant::state` changes.
Can't really connect it to any given project.
This commit is contained in:
Christian Schwarz
2023-05-25 11:37:12 +02:00
committed by GitHub
parent 6052ecee07
commit 37ecebe45b
4 changed files with 133 additions and 22 deletions

View File

@@ -24,7 +24,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
@@ -140,6 +142,36 @@ impl From<TenantStateError> for ApiError {
}
}
impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
e @ GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
// in fact exist locally. If we did, the caller could draw the conclusion
// that it can attach the tenant to another PS and we'd be in split-brain.
//
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
}
impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid))
}
e @ SetNewTenantConfigError::Persist(_) => {
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
}
impl From<crate::tenant::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
use crate::tenant::DeleteTimelineError::*;
@@ -159,7 +191,7 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
match value {
// Report Precondition failed so client can distinguish between
// "tenant is missing" case from "timeline is missing"
Tenant(TenantStateError::NotFound(..)) => {
Tenant(GetTenantError::NotFound(..)) => {
ApiError::PreconditionFailed("Requested tenant is missing")
}
Tenant(t) => ApiError::from(t),

View File

@@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant;
use crate::tenant::mgr;
use crate::tenant::mgr::GetTenantError;
use crate::tenant::{Tenant, Timeline};
use crate::trace::Tracer;
@@ -1131,7 +1133,9 @@ enum GetActiveTenantError {
wait_time: Duration,
},
#[error(transparent)]
Other(#[from] anyhow::Error),
NotFound(GetTenantError),
#[error(transparent)]
WaitTenantActive(tenant::WaitToBecomeActiveError),
}
impl From<GetActiveTenantError> for QueryError {
@@ -1140,7 +1144,8 @@ impl From<GetActiveTenantError> for QueryError {
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
),
GetActiveTenantError::Other(e) => QueryError::Other(e),
GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
}
}
}
@@ -1156,13 +1161,16 @@ async fn get_active_tenant_with_timeout(
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let tenant = match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => tenant,
Err(e) => return Err(GetActiveTenantError::Other(e.into())),
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
Err(GetTenantError::NotActive(_)) => {
unreachable!("we're calling get_tenant with active=false")
}
};
let wait_time = Duration::from_secs(30);
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
Ok(Ok(())) => Ok(tenant),
// no .context(), the error message is good enough and some tests depend on it
Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
Err(_) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
@@ -1177,13 +1185,34 @@ async fn get_active_tenant_with_timeout(
}
}
#[derive(Debug, thiserror::Error)]
enum GetActiveTimelineError {
#[error(transparent)]
Tenant(GetActiveTenantError),
#[error(transparent)]
Timeline(anyhow::Error),
}
impl From<GetActiveTimelineError> for QueryError {
fn from(e: GetActiveTimelineError) -> Self {
match e {
GetActiveTimelineError::Tenant(e) => e.into(),
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
}
}
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetActiveTenantError> {
let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
let timeline = tenant.get_timeline(timeline_id, true)?;
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(GetActiveTimelineError::Timeline)?;
Ok(timeline)
}

View File

@@ -450,6 +450,34 @@ struct RemoteStartupData {
remote_metadata: TimelineMetadata,
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum WaitToBecomeActiveError {
WillNotBecomeActive {
tenant_id: TenantId,
state: TenantState,
},
TenantDropped {
tenant_id: TenantId,
},
}
impl std::fmt::Display for WaitToBecomeActiveError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
write!(
f,
"Tenant {} will not become active. Current state: {:?}",
tenant_id, state
)
}
WaitToBecomeActiveError::TenantDropped { tenant_id } => {
write!(f, "Tenant {tenant_id} will not become active (dropped)")
}
}
}
}
impl Tenant {
/// Yet another helper for timeline initialization.
/// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -1753,25 +1781,30 @@ impl Tenant {
self.state.subscribe()
}
pub async fn wait_to_become_active(&self) -> anyhow::Result<()> {
pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
let mut receiver = self.state.subscribe();
loop {
let current_state = receiver.borrow_and_update().clone();
match current_state {
TenantState::Loading | TenantState::Attaching => {
// in these states, there's a chance that we can reach ::Active
receiver.changed().await?;
receiver.changed().await.map_err(
|_e: tokio::sync::watch::error::RecvError| {
WaitToBecomeActiveError::TenantDropped {
tenant_id: self.tenant_id,
}
},
)?;
}
TenantState::Active { .. } => {
return Ok(());
}
TenantState::Broken { .. } | TenantState::Stopping => {
// There's no chance the tenant can transition back into ::Active
anyhow::bail!(
"Tenant {} will not become active. Current state: {:?}",
self.tenant_id,
&current_state,
);
return Err(WaitToBecomeActiveError::WillNotBecomeActive {
tenant_id: self.tenant_id,
state: current_state,
});
}
}
}

View File

@@ -300,11 +300,19 @@ pub async fn create_tenant(
}).await
}
#[derive(Debug, thiserror::Error)]
pub enum SetNewTenantConfigError {
#[error(transparent)]
GetTenant(#[from] GetTenantError),
#[error(transparent)]
Persist(anyhow::Error),
}
pub async fn set_new_tenant_config(
conf: &'static PageServerConf,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), TenantStateError> {
) -> Result<(), SetNewTenantConfigError> {
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_id, true).await?;
@@ -314,23 +322,32 @@ pub async fn set_new_tenant_config(
&tenant_config_path,
new_tenant_conf,
false,
)?;
)
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
}
#[derive(Debug, thiserror::Error)]
pub enum GetTenantError {
#[error("Tenant {0} not found")]
NotFound(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantId),
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub async fn get_tenant(
tenant_id: TenantId,
active_only: bool,
) -> Result<Arc<Tenant>, TenantStateError> {
) -> Result<Arc<Tenant>, GetTenantError> {
let m = TENANTS.read().await;
let tenant = m
.get(&tenant_id)
.ok_or(TenantStateError::NotFound(tenant_id))?;
.ok_or(GetTenantError::NotFound(tenant_id))?;
if active_only && !tenant.is_active() {
Err(TenantStateError::NotActive(tenant_id))
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
@@ -339,7 +356,7 @@ pub async fn get_tenant(
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("Tenant {0}")]
Tenant(#[from] TenantStateError),
Tenant(#[from] GetTenantError),
#[error("Timeline {0}")]
Timeline(#[from] crate::tenant::DeleteTimelineError),