From 93f3f4ab5fb589296ea44929f5c113f680777ee6 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Sun, 19 Mar 2023 10:44:42 +0200 Subject: [PATCH] Return NotFound in mgmt API requests when tenant is not present in the pageserver (#3818) ## Describe your changes Add Error enum for tenant state response to allow better error handling in mgmt api ## Issue ticket number and link #2238 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- pageserver/src/http/routes.rs | 116 ++++++++---------- pageserver/src/page_service.rs | 5 +- pageserver/src/tenant/mgr.rs | 64 ++++++---- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_tenant_detach.py | 12 +- test_runner/regress/test_timeline_delete.py | 4 +- .../test_walredo_not_left_behind_on_detach.py | 4 +- 7 files changed, 104 insertions(+), 105 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9faa994f16..2f03e251fd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -21,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; -use crate::tenant::mgr::TenantMapInsertError; +use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{PageReconstructError, Timeline}; @@ -89,32 +89,45 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res }) } -fn apierror_from_prerror(err: PageReconstructError) -> ApiError { - match err { - PageReconstructError::Other(err) => ApiError::InternalServerError(err), - PageReconstructError::NeedsDownload(_, _) => { - // This shouldn't happen, because we use a RequestContext that requests to - // download any missing layer files on-demand. - ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) - } - PageReconstructError::Cancelled => { - ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) - } - PageReconstructError::WalRedo(err) => { - ApiError::InternalServerError(anyhow::Error::new(err)) +impl From for ApiError { + fn from(pre: PageReconstructError) -> ApiError { + match pre { + PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } + PageReconstructError::WalRedo(pre) => { + ApiError::InternalServerError(anyhow::Error::new(pre)) + } } } } -fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { - match e { - TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { - ApiError::InternalServerError(anyhow::Error::new(e)) +impl From for ApiError { + fn from(tmie: TenantMapInsertError) -> ApiError { + match tmie { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(tmie)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), } - TenantMapInsertError::TenantAlreadyExists(id, state) => { - ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } +} + +impl From for ApiError { + fn from(tse: TenantStateError) -> ApiError { + match tse { + TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + _ => ApiError::InternalServerError(anyhow::Error::new(tse)), } - TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), } } @@ -218,9 +231,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::NotFound)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -268,7 +277,7 @@ async fn timeline_list_handler(request: Request) -> Result, response_data.push(timeline_info); } - Ok(response_data) + Ok::, ApiError>(response_data) } .instrument(info_span!("timeline_list", tenant = %tenant_id)) .await?; @@ -287,9 +296,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result format!("{lsn}"), @@ -353,8 +357,7 @@ async fn tenant_attach_handler(request: Request) -> Result, if let Some(remote_storage) = &state.remote_storage { mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) - .await - .map_err(apierror_from_tenant_map_insert_error)?; + .await?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -373,11 +376,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let conf = state.conf; mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) - .await - // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. - // Replace this with better handling once the error type permits it. - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -407,8 +403,7 @@ async fn tenant_load_handler(request: Request) -> Result, A let state = get_state(&request); mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) - .await - .map_err(apierror_from_tenant_map_insert_error)?; + .await?; json_response(StatusCode::ACCEPTED, ()) } @@ -421,10 +416,7 @@ async fn tenant_ignore_handler(request: Request) -> Result, let conf = state.conf; mgr::ignore_tenant(conf, tenant_id) .instrument(info_span!("ignore_tenant", tenant = %tenant_id)) - .await - // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors. - // Replace this with better handling once the error type permits it. - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -498,9 +490,7 @@ async fn tenant_size_handler(request: Request) -> Result, A let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::InternalServerError)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; // this can be long operation let inputs = tenant @@ -763,8 +753,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result Result, ApiError> { - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::NotFound)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; tenant .get_timeline(timeline_id, true) .map_err(ApiError::NotFound) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index aad6099952..b63ee31d5e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1107,7 +1107,10 @@ async fn get_active_tenant_with_timeout( tenant_id: TenantId, _ctx: &RequestContext, /* require get a context to support cancellation in the future */ ) -> Result, GetActiveTenantError> { - let tenant = mgr::get_tenant(tenant_id, false).await?; + let tenant = match mgr::get_tenant(tenant_id, false).await { + Ok(tenant) => tenant, + Err(e) => return Err(GetActiveTenantError::Other(e.into())), + }; let wait_time = Duration::from_secs(30); match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { Ok(Ok(())) => Ok(tenant), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a44cb02b4d..a4212ea8a6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -289,7 +289,7 @@ pub async fn set_new_tenant_config( conf: &'static PageServerConf, new_tenant_conf: TenantConfOpt, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; @@ -306,16 +306,20 @@ pub async fn set_new_tenant_config( /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { +pub async fn get_tenant( + tenant_id: TenantId, + active_only: bool, +) -> Result, TenantStateError> { let m = TENANTS.read().await; let tenant = m .get(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; + .ok_or(TenantStateError::NotFound(tenant_id))?; if active_only && !tenant.is_active() { - anyhow::bail!( + tracing::warn!( "Tenant {tenant_id} is not active. Current state: {:?}", tenant.current_state() - ) + ); + Err(TenantStateError::NotActive(tenant_id)) } else { Ok(Arc::clone(tenant)) } @@ -325,21 +329,28 @@ pub async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> anyhow::Result<()> { - match get_tenant(tenant_id, true).await { - Ok(tenant) => { - tenant.delete_timeline(timeline_id, ctx).await?; - } - Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), - } - +) -> Result<(), TenantStateError> { + let tenant = get_tenant(tenant_id, true).await?; + tenant.delete_timeline(timeline_id, ctx).await?; Ok(()) } +#[derive(Debug, thiserror::Error)] +pub enum TenantStateError { + #[error("Tenant {0} not found")] + NotFound(TenantId), + #[error("Tenant {0} is stopping")] + IsStopping(TenantId), + #[error("Tenant {0} is not active")] + NotActive(TenantId), + #[error(transparent)] + Other(#[from] anyhow::Error), +} + pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { remove_tenant_from_memory(tenant_id, async { let local_tenant_directory = conf.tenant_path(&tenant_id); fs::remove_dir_all(&local_tenant_directory) @@ -379,7 +390,7 @@ pub async fn load_tenant( pub async fn ignore_tenant( conf: &'static PageServerConf, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { remove_tenant_from_memory(tenant_id, async { let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id); fs::File::create(&ignore_mark_file) @@ -489,7 +500,7 @@ where async fn remove_tenant_from_memory( tenant_id: TenantId, tenant_cleanup: F, -) -> anyhow::Result +) -> Result where F: std::future::Future>, { @@ -505,11 +516,9 @@ where | TenantState::Loading | TenantState::Broken | TenantState::Active => tenant.set_stopping(), - TenantState::Stopping => { - anyhow::bail!("Tenant {tenant_id} is stopping already") - } + TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)), }, - None => anyhow::bail!("Tenant not found for id {tenant_id}"), + None => return Err(TenantStateError::NotFound(tenant_id)), } } @@ -532,10 +541,15 @@ where Err(e) => { let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { - Some(tenant) => tenant.set_broken(&e.to_string()), - None => warn!("Tenant {tenant_id} got removed from memory"), + Some(tenant) => { + tenant.set_broken(&e.to_string()); + } + None => { + warn!("Tenant {tenant_id} got removed from memory"); + return Err(TenantStateError::NotFound(tenant_id)); + } } - Err(e) + Err(TenantStateError::Other(e)) } } } @@ -555,7 +569,7 @@ pub async fn immediate_gc( let tenant = guard .get(&tenant_id) .map(Arc::clone) - .with_context(|| format!("Tenant {tenant_id} not found")) + .with_context(|| format!("tenant {tenant_id}")) .map_err(ApiError::NotFound)?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); @@ -605,7 +619,7 @@ pub async fn immediate_compact( let tenant = guard .get(&tenant_id) .map(Arc::clone) - .with_context(|| format!("Tenant {tenant_id} not found")) + .with_context(|| format!("tenant {tenant_id}")) .map_err(ApiError::NotFound)?; let timeline = tenant diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8a64be51f1..8b228ad804 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2086,8 +2086,8 @@ class NeonPageserver(PgProtocol): # https://github.com/neondatabase/neon/issues/2442 ".*could not remove ephemeral file.*No such file or directory.*", # FIXME: These need investigation - ".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*", - ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*", + ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*", + ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*", ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", ".*Removing intermediate uninit mark file.*", diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 27ec38e1be..e061ab92a4 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -225,7 +225,7 @@ def test_tenant_reattach_while_busy( # Attempts to connect from compute to pageserver while the tenant is # temporarily detached produces these errors in the pageserver log. - env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*") + env.pageserver.allowed_errors.append(".*Tenant .* not found.*") env.pageserver.allowed_errors.append( ".*Tenant .* will not become active\\. Current state: Stopping.*" ) @@ -257,18 +257,18 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found") + env.pageserver.allowed_errors.append(".*NotFound: Tenant .*") # first check for non existing tenant tenant_id = TenantId.generate() with pytest.raises( expected_exception=PageserverApiException, - match=f"Tenant not found for id {tenant_id}", + match=f"NotFound: tenant {tenant_id}", ): pageserver_http.tenant_detach(tenant_id) # the error will be printed to the log too - env.pageserver.allowed_errors.append(".*Tenant not found for id.*") + env.pageserver.allowed_errors.append(".*NotFound: tenant *") # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() @@ -294,7 +294,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") - # Timelines get stopped during detach, ignore the gc calls that error, whitnessing that + # Timelines get stopped during detach, ignore the gc calls that error, witnessing that env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") # Detach while running manual GC. @@ -320,7 +320,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found" + expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}" ): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 2226cab8ff..b9c4f5b83f 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -10,7 +10,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*Timeline .* was not found.*") env.pageserver.allowed_errors.append(".*timeline not found.*") env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*") - env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*") + env.pageserver.allowed_errors.append(".*NotFound: tenant .*") ps_http = env.pageserver.http_client() @@ -24,7 +24,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): invalid_tenant_id = TenantId.generate() with pytest.raises( PageserverApiException, - match=f"Tenant {invalid_tenant_id} not found in the local state", + match=f"NotFound: tenant {invalid_tenant_id}", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 24045e2eb7..395d54b8c3 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -23,7 +23,7 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # We intentionally test for a non-existent tenant. - env.pageserver.allowed_errors.append(".*Tenant not found.*") + env.pageserver.allowed_errors.append(".*NotFound: tenant.*") pageserver_http = env.pageserver.http_client() pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text()) @@ -34,7 +34,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() with pytest.raises( expected_exception=PageserverApiException, - match=f"Tenant not found for id {tenant_id}", + match=f"NotFound: tenant {tenant_id}", ): pageserver_http.tenant_detach(tenant_id)