mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 18:10:37 +00:00
Fix hang deleting offloaded timelines (#12366)
We don't have cancellation support for timeline deletions. In other words, timeline deletion might still go on in an older generation while we are attaching it in a newer generation already, because the cancellation simply hasn't reached the deletion code. This has caused us to hit a situation with offloaded timelines in which the timeline was in an unrecoverable state: always returning an accepted response, but never a 404 like it should be. The detailed description can be found in [here](https://github.com/neondatabase/cloud/issues/30406#issuecomment-3008667859) (private repo link). TLDR: 1. we ask to delete timeline on old pageserver/generation, starts process in background 2. the storcon migrates the tenant to a different pageserver. - during attach, the pageserver still finds an index part, so it adds it to `offloaded_timelines` 4. the timeline deletion finishes, removing the index part in S3 5. there is a retry of the timeline deletion endpoint, sent to the new pageserver location. it is bound to fail however: - as the index part is gone, we print `Timeline already deleted in remote storage`. - the problem is that we then return an accepted response code, and not a 404. - this confuses the code calling us. it thinks the timeline is not deleted, so keeps retrying. - this state never gets recovered from until a reset/detach, because of the `offloaded_timelines` entry staying there. This is where this PR fixes things: if no index part can be found, we can safely assume that the timeline is gone in S3 (it's the last thing to be deleted), so we can remove it from `offloaded_timelines` and trigger a reupload of the manifest. Subsequent retries will pick that up. Why not improve the cancellation support? It is a more disruptive code change, that might have its own risks. So we don't do it for now. Fixes https://github.com/neondatabase/cloud/issues/30406
This commit is contained in:
@@ -241,8 +241,17 @@ impl DeleteTimelineFlow {
|
||||
{
|
||||
Ok(r) => r,
|
||||
Err(DownloadError::NotFound) => {
|
||||
// Deletion is already complete
|
||||
// Deletion is already complete.
|
||||
// As we came here, we will need to remove the timeline from the tenant though.
|
||||
tracing::info!("Timeline already deleted in remote storage");
|
||||
if let TimelineOrOffloaded::Offloaded(_) = &timeline {
|
||||
// We only supoprt this for offloaded timelines, as we don't know which state non-offloaded timelines are in.
|
||||
tracing::info!(
|
||||
"Timeline with gone index part is offloaded timeline. Removing from tenant."
|
||||
);
|
||||
remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard)
|
||||
.await?;
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -896,6 +896,134 @@ def test_timeline_retain_lsn(
|
||||
assert sum == pre_branch_sum
|
||||
|
||||
|
||||
def test_timeline_offload_delete_race(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Regression test for https://github.com/neondatabase/cloud/issues/30406
|
||||
"""
|
||||
remote_storage_kind = s3_storage()
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.num_pageservers = 2
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Turn off gc and compaction loops: we want to issue them manually for better reliability
|
||||
tenant_id, root_timeline_id = env.create_tenant(
|
||||
conf={
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
"checkpoint_distance": f"{1024**2}",
|
||||
}
|
||||
)
|
||||
|
||||
origin_ps = env.get_tenant_pageserver(tenant_id)
|
||||
assert origin_ps
|
||||
origin_ps.allowed_errors.extend(
|
||||
[
|
||||
".*Timed out waiting for deletion queue flush.*",
|
||||
".*Timed out waiting for flush to remote storage.*",
|
||||
]
|
||||
)
|
||||
origin_ps_http = origin_ps.http_client()
|
||||
|
||||
# We are not sharding this tenant
|
||||
tenant_shard_id = TenantShardId(tenant_id, 0, 0)
|
||||
|
||||
# Create a branch and archive it
|
||||
child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id)
|
||||
|
||||
with env.endpoints.create_start(
|
||||
"test_archived_branch_persisted", tenant_id=tenant_id
|
||||
) as endpoint:
|
||||
endpoint.safe_psql_many(
|
||||
[
|
||||
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
|
||||
"INSERT INTO foo SELECT FROM generate_series(1,512)",
|
||||
]
|
||||
)
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix=f"tenants/{str(tenant_id)}/",
|
||||
)
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
|
||||
)
|
||||
|
||||
origin_ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
child_timeline_id,
|
||||
state=TimelineArchivalState.ARCHIVED,
|
||||
)
|
||||
|
||||
def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
|
||||
return any(
|
||||
timeline["timeline_id"] == str(timeline_id)
|
||||
for timeline in origin_ps_http.timeline_and_offloaded_list(
|
||||
tenant_id=tenant_id
|
||||
).offloaded
|
||||
)
|
||||
|
||||
def child_offloaded():
|
||||
origin_ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
|
||||
assert timeline_offloaded_api(child_timeline_id)
|
||||
|
||||
wait_until(child_offloaded)
|
||||
|
||||
# Delete the timeline from the origin pageserver, holding up the deletion queue so that it doesn't finish
|
||||
failpoint_deletion_queue = "deletion-queue-before-execute-pause"
|
||||
origin_ps_http.configure_failpoints((failpoint_deletion_queue, "pause"))
|
||||
origin_ps_http.timeline_delete(tenant_id, child_timeline_id)
|
||||
|
||||
dest_ps = [ps for ps in env.pageservers if ps.id != origin_ps.id][0]
|
||||
assert dest_ps
|
||||
log.info(f"Migrating {tenant_id} {origin_ps.id}->{dest_ps.id}")
|
||||
env.storage_controller.tenant_shard_migrate(tenant_shard_id, dest_ps_id=dest_ps.id)
|
||||
|
||||
log.info("unstuck the DELETE")
|
||||
origin_ps_http.configure_failpoints((failpoint_deletion_queue, "off"))
|
||||
|
||||
def child_prefix_empty():
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix=f"tenants/{str(tenant_id)}/{str(child_timeline_id)}/",
|
||||
)
|
||||
|
||||
wait_until(child_prefix_empty)
|
||||
|
||||
dest_ps_http = dest_ps.http_client()
|
||||
|
||||
# We can't use timeline_delete_wait_completed here as timeline status will return 404, but we want to return 404 from the deletion endpoint
|
||||
def timeline_is_missing():
|
||||
data = None
|
||||
try:
|
||||
data = dest_ps_http.timeline_delete(tenant_id, child_timeline_id)
|
||||
log.info(f"timeline delete {data}")
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
|
||||
raise RuntimeError(f"Timeline exists {data}")
|
||||
|
||||
wait_until(timeline_is_missing)
|
||||
# (dest_ps_http, tenant_id, child_timeline_id)
|
||||
|
||||
#
|
||||
# Now ensure that scrubber doesn't have anything to clean up.
|
||||
#
|
||||
|
||||
# Sleep some amount larger than min_age_secs
|
||||
time.sleep(3)
|
||||
|
||||
# Ensure that min_age_secs has a deletion impeding effect
|
||||
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full")
|
||||
assert gc_summary["remote_storage_errors"] == 0
|
||||
assert gc_summary["indices_deleted"] == 0
|
||||
assert gc_summary["tenant_manifests_deleted"] == 0
|
||||
|
||||
|
||||
def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test for scrubber deleting old generations of manifests
|
||||
|
||||
Reference in New Issue
Block a user