Return NotFound in mgmt API requests when tenant is not present in the pageserver (#3818)

## Describe your changes
Add Error enum for tenant state response to allow better error handling
in mgmt api
## Issue ticket number and link
#2238
## Checklist before requesting a review
- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
This commit is contained in:
Shany Pozin
2023-03-19 10:44:42 +02:00
committed by GitHub
parent f6e2e0042d
commit 93f3f4ab5f
7 changed files with 104 additions and 105 deletions

View File

@@ -21,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::mgr::TenantMapInsertError;
use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::{PageReconstructError, Timeline};
@@ -89,32 +89,45 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
})
}
fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
match err {
PageReconstructError::Other(err) => ApiError::InternalServerError(err),
PageReconstructError::NeedsDownload(_, _) => {
// This shouldn't happen, because we use a RequestContext that requests to
// download any missing layer files on-demand.
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
}
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::WalRedo(err) => {
ApiError::InternalServerError(anyhow::Error::new(err))
impl From<PageReconstructError> for ApiError {
fn from(pre: PageReconstructError) -> ApiError {
match pre {
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
PageReconstructError::NeedsDownload(_, _) => {
// This shouldn't happen, because we use a RequestContext that requests to
// download any missing layer files on-demand.
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
}
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::WalRedo(pre) => {
ApiError::InternalServerError(anyhow::Error::new(pre))
}
}
}
}
fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError {
match e {
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
ApiError::InternalServerError(anyhow::Error::new(e))
impl From<TenantMapInsertError> for ApiError {
fn from(tmie: TenantMapInsertError) -> ApiError {
match tmie {
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
ApiError::InternalServerError(anyhow::Error::new(tmie))
}
TenantMapInsertError::TenantAlreadyExists(id, state) => {
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
}
TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
}
TenantMapInsertError::TenantAlreadyExists(id, state) => {
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
}
}
impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError {
match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
}
TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
}
}
@@ -218,9 +231,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, true).await?;
match tenant.create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -250,9 +261,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async {
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, true).await?;
let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len());
@@ -268,7 +277,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
response_data.push(timeline_info);
}
Ok(response_data)
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
}
.instrument(info_span!("timeline_list", tenant = %tenant_id))
.await?;
@@ -287,9 +296,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline_info = async {
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, true).await?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -325,10 +332,7 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &ctx)
.await
.map_err(apierror_from_prerror)?;
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
let result = match result {
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
@@ -353,8 +357,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
if let Some(remote_storage) = &state.remote_storage {
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
.await
.map_err(apierror_from_tenant_map_insert_error)?;
.await?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
@@ -373,11 +376,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
.await
// FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
// user and internal errors. Replace this with better handling once the error type permits
// it.
.map_err(ApiError::InternalServerError)?;
.await?;
json_response(StatusCode::OK, ())
}
@@ -390,10 +389,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
let conf = state.conf;
mgr::detach_tenant(conf, tenant_id)
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
.await
// FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
// Replace this with better handling once the error type permits it.
.map_err(ApiError::InternalServerError)?;
.await?;
json_response(StatusCode::OK, ())
}
@@ -407,8 +403,7 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
let state = get_state(&request);
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
.instrument(info_span!("load", tenant = %tenant_id))
.await
.map_err(apierror_from_tenant_map_insert_error)?;
.await?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -421,10 +416,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
let conf = state.conf;
mgr::ignore_tenant(conf, tenant_id)
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
.await
// FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
// Replace this with better handling once the error type permits it.
.map_err(ApiError::InternalServerError)?;
.await?;
json_response(StatusCode::OK, ())
}
@@ -498,9 +490,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
let headers = request.headers();
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::InternalServerError)?;
let tenant = mgr::get_tenant(tenant_id, true).await?;
// this can be long operation
let inputs = tenant
@@ -763,8 +753,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
&ctx,
)
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
.await
.map_err(apierror_from_tenant_map_insert_error)?;
.await?;
// We created the tenant. Existing API semantics are that the tenant
// is Active when this function returns.
@@ -788,9 +777,7 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, false)
.await
.map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, false).await?;
let response = HashMap::from([
(
@@ -885,10 +872,7 @@ async fn update_tenant_config_handler(
let state = get_state(&request);
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
.await
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
// Replace this `map_err` with better error handling once the type permits it
.map_err(ApiError::InternalServerError)?;
.await?;
json_response(StatusCode::OK, ())
}
@@ -1025,9 +1009,7 @@ async fn active_timeline_of_active_tenant(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> {
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, true).await?;
tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)

View File

@@ -1107,7 +1107,10 @@ async fn get_active_tenant_with_timeout(
tenant_id: TenantId,
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let tenant = mgr::get_tenant(tenant_id, false).await?;
let tenant = match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => tenant,
Err(e) => return Err(GetActiveTenantError::Other(e.into())),
};
let wait_time = Duration::from_secs(30);
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
Ok(Ok(())) => Ok(tenant),

View File

@@ -289,7 +289,7 @@ pub async fn set_new_tenant_config(
conf: &'static PageServerConf,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> anyhow::Result<()> {
) -> Result<(), TenantStateError> {
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_id, true).await?;
@@ -306,16 +306,20 @@ pub async fn set_new_tenant_config(
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
pub async fn get_tenant(
tenant_id: TenantId,
active_only: bool,
) -> Result<Arc<Tenant>, TenantStateError> {
let m = TENANTS.read().await;
let tenant = m
.get(&tenant_id)
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
.ok_or(TenantStateError::NotFound(tenant_id))?;
if active_only && !tenant.is_active() {
anyhow::bail!(
tracing::warn!(
"Tenant {tenant_id} is not active. Current state: {:?}",
tenant.current_state()
)
);
Err(TenantStateError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
@@ -325,21 +329,28 @@ pub async fn delete_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> anyhow::Result<()> {
match get_tenant(tenant_id, true).await {
Ok(tenant) => {
tenant.delete_timeline(timeline_id, ctx).await?;
}
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
}
) -> Result<(), TenantStateError> {
let tenant = get_tenant(tenant_id, true).await?;
tenant.delete_timeline(timeline_id, ctx).await?;
Ok(())
}
#[derive(Debug, thiserror::Error)]
pub enum TenantStateError {
#[error("Tenant {0} not found")]
NotFound(TenantId),
#[error("Tenant {0} is stopping")]
IsStopping(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantId),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
pub async fn detach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
) -> Result<(), TenantStateError> {
remove_tenant_from_memory(tenant_id, async {
let local_tenant_directory = conf.tenant_path(&tenant_id);
fs::remove_dir_all(&local_tenant_directory)
@@ -379,7 +390,7 @@ pub async fn load_tenant(
pub async fn ignore_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
) -> Result<(), TenantStateError> {
remove_tenant_from_memory(tenant_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
fs::File::create(&ignore_mark_file)
@@ -489,7 +500,7 @@ where
async fn remove_tenant_from_memory<V, F>(
tenant_id: TenantId,
tenant_cleanup: F,
) -> anyhow::Result<V>
) -> Result<V, TenantStateError>
where
F: std::future::Future<Output = anyhow::Result<V>>,
{
@@ -505,11 +516,9 @@ where
| TenantState::Loading
| TenantState::Broken
| TenantState::Active => tenant.set_stopping(),
TenantState::Stopping => {
anyhow::bail!("Tenant {tenant_id} is stopping already")
}
TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
},
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
None => return Err(TenantStateError::NotFound(tenant_id)),
}
}
@@ -532,10 +541,15 @@ where
Err(e) => {
let tenants_accessor = TENANTS.read().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => tenant.set_broken(&e.to_string()),
None => warn!("Tenant {tenant_id} got removed from memory"),
Some(tenant) => {
tenant.set_broken(&e.to_string());
}
None => {
warn!("Tenant {tenant_id} got removed from memory");
return Err(TenantStateError::NotFound(tenant_id));
}
}
Err(e)
Err(TenantStateError::Other(e))
}
}
}
@@ -555,7 +569,7 @@ pub async fn immediate_gc(
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("Tenant {tenant_id} not found"))
.with_context(|| format!("tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
@@ -605,7 +619,7 @@ pub async fn immediate_compact(
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("Tenant {tenant_id} not found"))
.with_context(|| format!("tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
let timeline = tenant

View File

@@ -2086,8 +2086,8 @@ class NeonPageserver(PgProtocol):
# https://github.com/neondatabase/neon/issues/2442
".*could not remove ephemeral file.*No such file or directory.*",
# FIXME: These need investigation
".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*",
".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*",
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
".*Removing intermediate uninit mark file.*",

View File

@@ -225,7 +225,7 @@ def test_tenant_reattach_while_busy(
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
env.pageserver.allowed_errors.append(".*Tenant .* not found.*")
env.pageserver.allowed_errors.append(
".*Tenant .* will not become active\\. Current state: Stopping.*"
)
@@ -257,18 +257,18 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found")
env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")
# first check for non existing tenant
tenant_id = TenantId.generate()
with pytest.raises(
expected_exception=PageserverApiException,
match=f"Tenant not found for id {tenant_id}",
match=f"NotFound: tenant {tenant_id}",
):
pageserver_http.tenant_detach(tenant_id)
# the error will be printed to the log too
env.pageserver.allowed_errors.append(".*Tenant not found for id.*")
env.pageserver.allowed_errors.append(".*NotFound: tenant *")
# create new nenant
tenant_id, timeline_id = env.neon_cli.create_tenant()
@@ -294,7 +294,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
# the error will be printed to the log too
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
# Timelines get stopped during detach, ignore the gc calls that error, whitnessing that
# Timelines get stopped during detach, ignore the gc calls that error, witnessing that
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*")
# Detach while running manual GC.
@@ -320,7 +320,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
with pytest.raises(
expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found"
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
):
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)

View File

@@ -10,7 +10,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
env.pageserver.allowed_errors.append(".*timeline not found.*")
env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
ps_http = env.pageserver.http_client()
@@ -24,7 +24,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
invalid_tenant_id = TenantId.generate()
with pytest.raises(
PageserverApiException,
match=f"Tenant {invalid_tenant_id} not found in the local state",
match=f"NotFound: tenant {invalid_tenant_id}",
):
ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id)

View File

@@ -23,7 +23,7 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese
def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
# We intentionally test for a non-existent tenant.
env.pageserver.allowed_errors.append(".*Tenant not found.*")
env.pageserver.allowed_errors.append(".*NotFound: tenant.*")
pageserver_http = env.pageserver.http_client()
pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
@@ -34,7 +34,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
tenant_id = TenantId.generate()
with pytest.raises(
expected_exception=PageserverApiException,
match=f"Tenant not found for id {tenant_id}",
match=f"NotFound: tenant {tenant_id}",
):
pageserver_http.tenant_detach(tenant_id)