test: handle the case where timeline cannot be found

at least do not double-panick.
This commit is contained in:
Joonas Koivunen
2024-07-24 13:48:25 +00:00
parent a7153bf9b2
commit 274b2a611b
2 changed files with 58 additions and 12 deletions

View File

@@ -44,6 +44,9 @@ pub(crate) enum Error {
#[error("wait for tenant to activate after restarting")]
WaitToActivate(#[source] GetActiveTenantError),
#[error("detached timeline was not found after restart")]
DetachedNotFoundAfterRestart,
#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
@@ -73,6 +76,7 @@ impl From<Error> for ApiError {
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_)
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()),
}
}
}
@@ -283,20 +287,13 @@ impl SharedState {
self.inner.lock().unwrap().validate(&attempt);
let mut attempt = scopeguard::guard(attempt, |attempt| {
// our attempt will no longer be valid, so release it
self.inner.lock().unwrap().cancel(attempt);
});
tenant
.wait_to_become_active(std::time::Duration::from_secs(9999))
.await?;
// TODO: pause failpoint here to catch the situation where detached timeline is deleted...?
// we are not yet holding the gate so it could advance to the point of removing from
// timelines.
//
// pause failpoint which triggers after activating but before completing here
utils::pausable_failpoint!(
"timeline-detach-ancestor::after_activating_before_finding-pausable"
);
let Some(timeline) = tenant
.timelines
@@ -305,10 +302,14 @@ impl SharedState {
.get(&attempt.timeline_id)
.cloned()
else {
// FIXME: this needs a test case ... basically deletion right after activation?
unreachable!("unsure if there is an ordering, but perhaps this is possible?");
return Err(Error::DetachedNotFoundAfterRestart);
};
let mut attempt = scopeguard::guard(attempt, |attempt| {
// our attempt will no longer be valid, so release it
self.inner.lock().unwrap().cancel(attempt);
});
// the gate being entered does not matter much, but lets be strict
if attempt.gate_entered.is_none() {
let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?;

View File

@@ -1221,6 +1221,51 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon
assert metric == 0
def test_deletion_after_timeline_ancestor_detach_before_completion(
neon_env_builder: NeonEnvBuilder,
):
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
http = env.pageserver.http_client()
detached = env.neon_cli.create_branch("detached")
failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable"
http.configure_failpoints((failpoint, "pause"))
def detach_and_get_stuck():
return http.detach_ancestor(env.initial_tenant, detached)
def pausepoint_hit():
env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
def delete_detached():
return http.timeline_delete(env.initial_tenant, detached)
try:
with ThreadPoolExecutor(max_workers=1) as pool:
detach = pool.submit(detach_and_get_stuck)
wait_until(10, 1.0, pausepoint_hit)
delete_detached()
wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0)
http.configure_failpoints((failpoint, "off"))
with pytest.raises(PageserverApiException) as exc:
detach.result()
# FIXME: this should be 404 but because there is another Anyhow conversion it is 500
assert exc.value.status_code == 500
finally:
http.configure_failpoints((failpoint, "off"))
# TODO:
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.