mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 09:22:55 +00:00
test: handle the case where timeline cannot be found
at least do not double-panick.
This commit is contained in:
@@ -44,6 +44,9 @@ pub(crate) enum Error {
|
||||
#[error("wait for tenant to activate after restarting")]
|
||||
WaitToActivate(#[source] GetActiveTenantError),
|
||||
|
||||
#[error("detached timeline was not found after restart")]
|
||||
DetachedNotFoundAfterRestart,
|
||||
|
||||
#[error("unexpected error")]
|
||||
Unexpected(#[source] anyhow::Error),
|
||||
|
||||
@@ -73,6 +76,7 @@ impl From<Error> for ApiError {
|
||||
| e @ Error::CopyFailed(_)
|
||||
| e @ Error::Unexpected(_)
|
||||
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
|
||||
Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -283,20 +287,13 @@ impl SharedState {
|
||||
|
||||
self.inner.lock().unwrap().validate(&attempt);
|
||||
|
||||
let mut attempt = scopeguard::guard(attempt, |attempt| {
|
||||
// our attempt will no longer be valid, so release it
|
||||
self.inner.lock().unwrap().cancel(attempt);
|
||||
});
|
||||
|
||||
tenant
|
||||
.wait_to_become_active(std::time::Duration::from_secs(9999))
|
||||
.await?;
|
||||
|
||||
// TODO: pause failpoint here to catch the situation where detached timeline is deleted...?
|
||||
// we are not yet holding the gate so it could advance to the point of removing from
|
||||
// timelines.
|
||||
//
|
||||
// pause failpoint which triggers after activating but before completing here
|
||||
utils::pausable_failpoint!(
|
||||
"timeline-detach-ancestor::after_activating_before_finding-pausable"
|
||||
);
|
||||
|
||||
let Some(timeline) = tenant
|
||||
.timelines
|
||||
@@ -305,10 +302,14 @@ impl SharedState {
|
||||
.get(&attempt.timeline_id)
|
||||
.cloned()
|
||||
else {
|
||||
// FIXME: this needs a test case ... basically deletion right after activation?
|
||||
unreachable!("unsure if there is an ordering, but perhaps this is possible?");
|
||||
return Err(Error::DetachedNotFoundAfterRestart);
|
||||
};
|
||||
|
||||
let mut attempt = scopeguard::guard(attempt, |attempt| {
|
||||
// our attempt will no longer be valid, so release it
|
||||
self.inner.lock().unwrap().cancel(attempt);
|
||||
});
|
||||
|
||||
// the gate being entered does not matter much, but lets be strict
|
||||
if attempt.gate_entered.is_none() {
|
||||
let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?;
|
||||
|
||||
@@ -1221,6 +1221,51 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon
|
||||
assert metric == 0
|
||||
|
||||
|
||||
def test_deletion_after_timeline_ancestor_detach_before_completion(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
|
||||
|
||||
http = env.pageserver.http_client()
|
||||
|
||||
detached = env.neon_cli.create_branch("detached")
|
||||
|
||||
failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable"
|
||||
|
||||
http.configure_failpoints((failpoint, "pause"))
|
||||
|
||||
def detach_and_get_stuck():
|
||||
return http.detach_ancestor(env.initial_tenant, detached)
|
||||
|
||||
def pausepoint_hit():
|
||||
env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
|
||||
|
||||
def delete_detached():
|
||||
return http.timeline_delete(env.initial_tenant, detached)
|
||||
|
||||
try:
|
||||
with ThreadPoolExecutor(max_workers=1) as pool:
|
||||
detach = pool.submit(detach_and_get_stuck)
|
||||
|
||||
wait_until(10, 1.0, pausepoint_hit)
|
||||
|
||||
delete_detached()
|
||||
|
||||
wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0)
|
||||
|
||||
http.configure_failpoints((failpoint, "off"))
|
||||
|
||||
with pytest.raises(PageserverApiException) as exc:
|
||||
detach.result()
|
||||
|
||||
# FIXME: this should be 404 but because there is another Anyhow conversion it is 500
|
||||
assert exc.value.status_code == 500
|
||||
finally:
|
||||
http.configure_failpoints((failpoint, "off"))
|
||||
|
||||
|
||||
# TODO:
|
||||
# - branch near existing L1 boundary, image layers?
|
||||
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.
|
||||
|
||||
Reference in New Issue
Block a user