From 274b2a611bdbcb33a2d168cd6864712429954731 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 24 Jul 2024 13:48:25 +0000 Subject: [PATCH] test: handle the case where timeline cannot be found at least do not double-panick. --- .../src/tenant/timeline/detach_ancestor.rs | 25 ++++++----- .../regress/test_timeline_detach_ancestor.py | 45 +++++++++++++++++++ 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e44622b262..d0742399a4 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -44,6 +44,9 @@ pub(crate) enum Error { #[error("wait for tenant to activate after restarting")] WaitToActivate(#[source] GetActiveTenantError), + #[error("detached timeline was not found after restart")] + DetachedNotFoundAfterRestart, + #[error("unexpected error")] Unexpected(#[source] anyhow::Error), @@ -73,6 +76,7 @@ impl From for ApiError { | e @ Error::CopyFailed(_) | e @ Error::Unexpected(_) | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), + Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()), } } } @@ -283,20 +287,13 @@ impl SharedState { self.inner.lock().unwrap().validate(&attempt); - let mut attempt = scopeguard::guard(attempt, |attempt| { - // our attempt will no longer be valid, so release it - self.inner.lock().unwrap().cancel(attempt); - }); - tenant .wait_to_become_active(std::time::Duration::from_secs(9999)) .await?; - // TODO: pause failpoint here to catch the situation where detached timeline is deleted...? - // we are not yet holding the gate so it could advance to the point of removing from - // timelines. - // - // pause failpoint which triggers after activating but before completing here + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); let Some(timeline) = tenant .timelines @@ -305,10 +302,14 @@ impl SharedState { .get(&attempt.timeline_id) .cloned() else { - // FIXME: this needs a test case ... basically deletion right after activation? - unreachable!("unsure if there is an ordering, but perhaps this is possible?"); + return Err(Error::DetachedNotFoundAfterRestart); }; + let mut attempt = scopeguard::guard(attempt, |attempt| { + // our attempt will no longer be valid, so release it + self.inner.lock().unwrap().cancel(attempt); + }); + // the gate being entered does not matter much, but lets be strict if attempt.gate_entered.is_none() { let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?; diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 47dfd26bba..b0dfec2e5c 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1221,6 +1221,51 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon assert metric == 0 +def test_deletion_after_timeline_ancestor_detach_before_completion( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + detached = env.neon_cli.create_branch("detached") + + failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" + + http.configure_failpoints((failpoint, "pause")) + + def detach_and_get_stuck(): + return http.detach_ancestor(env.initial_tenant, detached) + + def pausepoint_hit(): + env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + + def delete_detached(): + return http.timeline_delete(env.initial_tenant, detached) + + try: + with ThreadPoolExecutor(max_workers=1) as pool: + detach = pool.submit(detach_and_get_stuck) + + wait_until(10, 1.0, pausepoint_hit) + + delete_detached() + + wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + + http.configure_failpoints((failpoint, "off")) + + with pytest.raises(PageserverApiException) as exc: + detach.result() + + # FIXME: this should be 404 but because there is another Anyhow conversion it is 500 + assert exc.value.status_code == 500 + finally: + http.configure_failpoints((failpoint, "off")) + + # TODO: # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general.