From ce44dfe3532b22689464cbeddbf392d05049b9c4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 May 2024 16:55:34 +0300 Subject: [PATCH] openapi: document timeline ancestor detach (#7650) The openapi description with the error descriptions: - 200 is used for "detached or has been detached previously" - 400 is used for "cannot be detached right now" -- it's an odd thing, but good enough - 404 is used for tenant or timeline not found - 409 is used for "can never be detached" (root timeline) - 500 is used for transient errors (basically ill-defined shutdown errors) - 503 is used for busy (other tenant ancestor detach underway, pageserver shutdown) Cc: #6994 --- pageserver/src/http/openapi_spec.yml | 87 +++++++++++++++++++ pageserver/src/http/routes.rs | 27 +++--- .../src/tenant/timeline/detach_ancestor.rs | 30 ++++++- .../regress/test_timeline_detach_ancestor.py | 36 +++++++- 4 files changed, 161 insertions(+), 19 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 107bcd4a22..e5eafc51f4 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -612,6 +612,80 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + + put: + description: | + Detach a timeline from its ancestor and reparent all ancestor timelines with lower `ancestor_lsn`. + Current implementation might not be retryable across failure cases, but will be enhanced in future. + Detaching should be expected to be an expensive operation. Timeouts should be retried. + responses: + "200": + description: | + The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented.
+ If any timelines were deleted after reparenting, they might not be on this list. + content: + application/json: + schema: + $ref: "#/components/schemas/AncestorDetached" + + "400": + description: | + Number of early checks meaning the timeline cannot be detached now: + - the ancestor of timeline has an ancestor: not supported, see RFC + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "404": + description: Tenant or timeline not found. + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + + "409": + description: | + The timeline can never be detached: + - timeline has no ancestor, implying that the timeline has never had an ancestor + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + + "500": + description: | + Transient error, for example, pageserver shutdown happened while + processing the request but we were unable to distinguish that. Must + be retried. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "503": + description: | + Temporarily unavailable, please retry. 
Possible reasons: + - another timeline detach for the same tenant is underway, please retry later + - detected shutdown error + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/: get: description: Get tenants list @@ -1077,6 +1151,19 @@ components: format: int64 description: How many bytes of layer content were in the latest layer heatmap + AncestorDetached: + type: object + required: + - reparented_timelines + properties: + reparented_timelines: + type: array + description: Set of reparented timeline ids + items: + type: string + format: hex + description: TimelineId + Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c75e4ca5a9..34b9806a26 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -74,6 +74,7 @@ use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; +use crate::tenant::GetTimelineError; use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; @@ -279,6 +280,13 @@ impl From for ApiError { } } +impl From<GetTimelineError> for ApiError { + fn from(gte: GetTimelineError) -> Self { + // Rationale: tenant is activated only after eligible timelines activate + ApiError::NotFound(gte.into()) + } +} + impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { @@ -643,9 +651,7 @@ async fn timeline_preserve_initdb_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; timeline .preserve_initdb_archive() @@ -687,9 +693,7 @@ async fn timeline_detail_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - 
let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; let timeline_info = build_timeline_info( &timeline, @@ -1901,14 +1905,11 @@ async fn timeline_detach_ancestor_handler( let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, true)?; let (_guard, prepared) = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .await?; let res = state .tenant_manager @@ -2042,9 +2043,7 @@ async fn active_timeline_of_active_tenant( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into())) + Ok(tenant.get_timeline(timeline_id, true)?) } async fn always_panic_handler( diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4d8e570181..e6ddabe5b5 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -12,7 +12,7 @@ use crate::{ }; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -41,6 +41,27 @@ pub(crate) enum Error { Unexpected(#[source] anyhow::Error), } +impl From<Error> for ApiError { + fn from(value: Error) -> Self { + match value { + e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), + // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
+ e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::ShuttingDown => ApiError::ShuttingDown, + Error::OtherTimelineDetachOngoing(_) => { + ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + } + // All of these contain shutdown errors, in fact, it's the most common + e @ Error::FlushAncestor(_) + | e @ Error::RewrittenDeltaDownloadFailed(_) + | e @ Error::CopyDeltaPrefix(_) + | e @ Error::UploadRewritten(_) + | e @ Error::CopyFailed(_) + | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + } + } +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -75,6 +96,11 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { + // TODO: check if we have already been detached; for this we need to read the stored data + // on remote client, for that we need a follow-up which makes uploads cheaper and maintains + // a projection of the committed data. + // + // the error is wrong per openapi return Err(NoAncestor); }; @@ -84,7 +110,7 @@ pub(super) async fn prepare( if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose - // not to + // not to, at least initially return Err(TooManyAncestors); } diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 1563c161e0..f0b2f7d733 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -8,9 +8,13 @@ from typing import List, Tuple import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.http import HistoricLayerInfo -from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.neon_fixtures import ( + NeonEnvBuilder, 
PgBin, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage from fixtures.utils import assert_pageserver_backups_equal @@ -555,6 +559,32 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) +def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert info.value.status_code == 409 + + first_branch = env.neon_cli.create_branch("first_branch") + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + client.tenant_delete(env.initial_tenant) + wait_tenant_status_404(client, env.initial_tenant, 10, 1) + + with pytest.raises(PageserverApiException) as e: + client.detach_ancestor(env.initial_tenant, first_branch) + assert e.value.status_code == 404 + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted