Compare commits

...

3 Commits

Author SHA1 Message Date
Alex Chi Z
2025f8763f fix scrubber handle deleted timelines
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-30 16:36:12 -04:00
Alex Chi Z
e635891dbb maybe fix test cases?
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-30 15:39:03 -04:00
Alex Chi Z
59bb30669c fix(pageserver): do not delete index_part.json during timeline deletion
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-30 15:39:03 -04:00
5 changed files with 37 additions and 9 deletions

View File

@@ -1992,11 +1992,25 @@ impl RemoteTimelineClient {
)))?
});
debug!("enqueuing index part deletion");
self.deletion_queue_client
.push_immediate([latest_index].to_vec())
.await
.map_err(|_| DeleteTimelineError::Cancelled)?;
// Skip deleting index_part.json for now. An isolated pageserver would cause attach issues if we
// deleted the index part here.
//
// - Pageserver 1 attaches the tenant with generation N and creates a timeline A.
// - Pageserver 1 gets isolated from the network. Storcon attaches the tenant to pageserver 2 with generation N+1.
// - Pageserver 2 deletes timeline A; now the timeline directory is empty.
// - Pageserver 1 rejoins the network, ingests the new data from safekeeper, and uploads the index_part.json
// with the old generation N.
// - Now we are left with a timeline directory with index_part.json with generation N, but no layers
// except the newly-uploaded one from the isolated pageserver 1.
//
// As a solution, we will keep the tombstone index_part.json (with `deleted_at` set) so that we don't
// run into the issue above.
// debug!("enqueuing index part deletion");
// self.deletion_queue_client
// .push_immediate([latest_index].to_vec())
// .await
// .map_err(|_| DeleteTimelineError::Cancelled)?;
// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.

View File

@@ -142,6 +142,12 @@ impl TenantRefAccumulator {
.or_default()
.insert(this_shard_idx);
// TODO: change this to "is X days ago?"
if index_part.deleted_at.is_some() {
tracing::info!(%ttid, "The timeline is already deleted, skipping");
return;
}
let mut ancestor_refs = Vec::new();
for (layer_name, layer_metadata) in &index_part.layer_metadata {
if layer_metadata.shard != this_shard_idx {

View File

@@ -621,6 +621,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
path
for path in remote_timeline_path.iterdir()
if not (path.name.endswith("initdb.tar.zst"))
and not (path.name.startswith("index_part.json"))
]
assert len(filtered) == 0

View File

@@ -410,6 +410,7 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
# Delete the timeline
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id)
time.sleep(1) # give scrubber some time to wait for min_age_secs
# Subsequently doing physical GC should clean up the ancestor layers
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")

View File

@@ -491,7 +491,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
return (active_ids, offloaded_ids)
def timeline_objects(tenant_shard_id, timeline_id):
def timeline_objects_exclude_index_part(tenant_shard_id, timeline_id):
response = list_prefix(
env.pageserver_remote_storage, # type: ignore
prefix="/".join(
@@ -505,7 +505,11 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
+ "/",
)
return [k["Key"] for k in response.get("Contents", [])]
return [
k["Key"]
for k in response.get("Contents", [])
if not k["Key"].startswith("index_part.json")
]
def worker():
"""
@@ -533,7 +537,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
state.created = True
if (
timeline_objects(
timeline_objects_exclude_index_part(
tenant_shard_id=tenant_shard_id, timeline_id=state.timeline_id
)
== []
@@ -550,7 +554,9 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
violations.append(msg)
raise RuntimeError(msg)
objects = timeline_objects(tenant_shard_id, state.timeline_id)
objects = timeline_objects_exclude_index_part(
tenant_shard_id, state.timeline_id
)
if len(objects) == 0:
log.info(f"Confirmed deletion of timeline {state.timeline_id}")
timelines_deleted.append(state.timeline_id)