From 878c1c7110348ef0352f4c0cd282746cd62f0fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 14 Feb 2025 11:21:50 +0100 Subject: [PATCH] offload_timeline: check if the timeline is archived on HasChildren error (#10776) PR #10305 makes sure that there is no *actual* race, i.e. we will never attempt to offload a timeline that has just been unarchived, or similar. However, if a timeline has been unarchived and has children that are unarchived too, we will get an error log line. Such races can occur as in compaction we check if the timeline can be offloaded way before we attempt to offload it: the result might change in the meantime. This patch checks if the delete guard can't be obtained because the timeline has unarchived children, and if yes, it does another check for whether the timeline has become unarchived or not. If it is unarchived, it just prints an info log msg and integrates itself into the error suppression logic of the compaction calling into it. If you squint at it really closely, there is still a possible race in which we print an error log, but this one is unlikely because the timeline and its children need to be archived right after the check for whether the timeline has any unarchived children, and right before the check whether the timeline is archived. Archival involves a network operation while nothing between these two checks does that, so it's very unlikely to happen in real life. https://github.com/neondatabase/cloud/issues/23979#issuecomment-2651265729 --- pageserver/src/tenant/timeline/offload.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3b5bf8290c..93e5a1100d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -7,7 +7,9 @@ use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; -use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; +use crate::tenant::{ + DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, +}; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { @@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = make_timeline_delete_guard( + let delete_guard_res = make_timeline_delete_guard( tenant, timeline.timeline_id, TimelineDeleteGuardKind::Offload, - ) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + ); + if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); + return Err(OffloadError::NotArchived); + }; + let (timeline, guard) = + delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object");