From 103c866a9080608496bdb5473e605722884ecde9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 27 Jun 2025 17:14:55 +0200 Subject: [PATCH] Fix hang deleting offloaded timelines (#12366) We don't have cancellation support for timeline deletions. In other words, timeline deletion might still go on in an older generation while we are attaching it in a newer generation already, because the cancellation simply hasn't reached the deletion code. This has caused us to hit a situation with offloaded timelines in which the timeline was in an unrecoverable state: always returning an accepted response, but never a 404 like it should be. The detailed description can be found in [here](https://github.com/neondatabase/cloud/issues/30406#issuecomment-3008667859) (private repo link). TLDR: 1. we ask to delete timeline on old pageserver/generation, starts process in background 2. the storcon migrates the tenant to a different pageserver. - during attach, the pageserver still finds an index part, so it adds it to `offloaded_timelines` 4. the timeline deletion finishes, removing the index part in S3 5. there is a retry of the timeline deletion endpoint, sent to the new pageserver location. it is bound to fail however: - as the index part is gone, we print `Timeline already deleted in remote storage`. - the problem is that we then return an accepted response code, and not a 404. - this confuses the code calling us. it thinks the timeline is not deleted, so keeps retrying. - this state never gets recovered from until a reset/detach, because of the `offloaded_timelines` entry staying there. This is where this PR fixes things: if no index part can be found, we can safely assume that the timeline is gone in S3 (it's the last thing to be deleted), so we can remove it from `offloaded_timelines` and trigger a reupload of the manifest. Subsequent retries will pick that up. Why not improve the cancellation support? It is a more disruptive code change, that might have its own risks. So we don't do it for now. Fixes https://github.com/neondatabase/cloud/issues/30406 --- pageserver/src/tenant/timeline/delete.rs | 11 +- test_runner/regress/test_timeline_archive.py | 128 +++++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 51bdd59f4f..f7dc44be90 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -241,8 +241,17 @@ impl DeleteTimelineFlow { { Ok(r) => r, Err(DownloadError::NotFound) => { - // Deletion is already complete + // Deletion is already complete. + // As we came here, we will need to remove the timeline from the tenant though. tracing::info!("Timeline already deleted in remote storage"); + if let TimelineOrOffloaded::Offloaded(_) = &timeline { + // We only supoprt this for offloaded timelines, as we don't know which state non-offloaded timelines are in. + tracing::info!( + "Timeline with gone index part is offloaded timeline. Removing from tenant." + ); + remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard) + .await?; + } return Ok(()); } Err(e) => { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 8d46ef8306..41286a2adc 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -896,6 +896,134 @@ def test_timeline_retain_lsn( assert sum == pre_branch_sum +def test_timeline_offload_delete_race(neon_env_builder: NeonEnvBuilder): + """ + Regression test for https://github.com/neondatabase/cloud/issues/30406 + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024**2}", + } + ) + + origin_ps = env.get_tenant_pageserver(tenant_id) + assert origin_ps + origin_ps.allowed_errors.extend( + [ + ".*Timed out waiting for deletion queue flush.*", + ".*Timed out waiting for flush to remote storage.*", + ] + ) + origin_ps_http = origin_ps.http_client() + + # We are not sharding this tenant + tenant_shard_id = TenantShardId(tenant_id, 0, 0) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,512)", + ] + ) + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + origin_ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + return any( + timeline["timeline_id"] == str(timeline_id) + for timeline in origin_ps_http.timeline_and_offloaded_list( + tenant_id=tenant_id + ).offloaded + ) + + def child_offloaded(): + origin_ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert timeline_offloaded_api(child_timeline_id) + + wait_until(child_offloaded) + + # Delete the timeline from the origin pageserver, holding up the deletion queue so that it doesn't finish + failpoint_deletion_queue = "deletion-queue-before-execute-pause" + origin_ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) + origin_ps_http.timeline_delete(tenant_id, child_timeline_id) + + dest_ps = [ps for ps in env.pageservers if ps.id != origin_ps.id][0] + assert dest_ps + log.info(f"Migrating {tenant_id} {origin_ps.id}->{dest_ps.id}") + env.storage_controller.tenant_shard_migrate(tenant_shard_id, dest_ps_id=dest_ps.id) + + log.info("unstuck the DELETE") + origin_ps_http.configure_failpoints((failpoint_deletion_queue, "off")) + + def child_prefix_empty(): + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/{str(child_timeline_id)}/", + ) + + wait_until(child_prefix_empty) + + dest_ps_http = dest_ps.http_client() + + # We can't use timeline_delete_wait_completed here as timeline status will return 404, but we want to return 404 from the deletion endpoint + def timeline_is_missing(): + data = None + try: + data = dest_ps_http.timeline_delete(tenant_id, child_timeline_id) + log.info(f"timeline delete {data}") + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + + raise RuntimeError(f"Timeline exists {data}") + + wait_until(timeline_is_missing) + # (dest_ps_http, tenant_id, child_timeline_id) + + # + # Now ensure that scrubber doesn't have anything to clean up. + # + + # Sleep some amount larger than min_age_secs + time.sleep(3) + + # Ensure that min_age_secs has a deletion impeding effect + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["tenant_manifests_deleted"] == 0 + + def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): """ Test for scrubber deleting old generations of manifests