pageserver: return 503 for GET on InProgress tenant

This commit is contained in:
John Spray
2023-10-27 16:54:53 +01:00
parent b2176043fe
commit 1874f9427a
3 changed files with 39 additions and 18 deletions

View File

@@ -222,6 +222,7 @@ impl From<GetTenantError> for ApiError {
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into())
}
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
}
}
}

View File

@@ -1323,6 +1323,9 @@ async fn get_active_tenant_with_timeout(
Err(GetTenantError::Broken(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
Err(GetTenantError::MapState(_)) => {
unreachable!("TenantManager is initialized before page service starts")
}
};
let wait_time = Duration::from_secs(30);
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {

View File

@@ -748,7 +748,7 @@ pub(crate) async fn upsert_location(
// existng tenant.
{
let locked = TENANTS.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_id)?;
let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, false)?;
match (&new_location_config.mode, peek_slot) {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
if attach_conf.generation == tenant.generation {
@@ -874,6 +874,10 @@ pub(crate) enum GetTenantError {
/// is a stuck error state
#[error("Tenant is broken: {0}")]
Broken(String),
// Initializing or shutting down: cannot authoritatively say whether we have this tenant
#[error("Tenant map is not available: {0}")]
MapState(#[from] TenantMapError),
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
@@ -884,24 +888,26 @@ pub(crate) fn get_tenant(
tenant_id: TenantId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
let m = TENANTS.read().unwrap();
let tenant = m
.get(&tenant_id)
.ok_or(GetTenantError::NotFound(tenant_id))?;
let locked = TENANTS.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, true)?;
match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
match peek_slot {
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
}
},
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
}
}
@@ -1372,13 +1378,24 @@ impl Drop for SlotGuard {
}
}
/// `allow_shutdown=true` is appropriate for read-only APIs that should stay working while
/// the pageserver is shutting down. Otherwise, set it to false to refuse attempts to
/// operate on a slot while we are shutting down.
///
fn tenant_map_peek_slot<'a>(
tenants: &'a std::sync::RwLockReadGuard<'a, TenantsMap>,
tenant_id: &TenantId,
allow_shutdown: bool,
) -> Result<Option<&'a TenantSlot>, TenantMapError> {
let m = match tenants.deref() {
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
TenantsMap::ShuttingDown(_) => return Err(TenantMapError::ShuttingDown),
TenantsMap::ShuttingDown(m) => {
if allow_shutdown {
return Err(TenantMapError::ShuttingDown);
} else {
m
}
}
TenantsMap::Open(m) => m,
};