From 1303cd5d05062c95660ec00f8846b4258fc62b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 3 Jan 2025 13:36:01 +0100 Subject: [PATCH] Fix defusing race between Tenant::shutdown and offload_timeline (#10150) There is a race condition between `Tenant::shutdown`'s `defuse_for_drop` loop and `offload_timeline`, where timeline offloading can insert into a tenant that is in the process of shutting down, in fact so far progressed that the `defuse_for_drop` has already been called. This prevents warn log lines of the form: ``` offloaded timeline was dropped without having cleaned it up at the ancestor ``` The solution piggybacks on the `offloaded_timelines` lock: both the defuse loop and the offloaded timeline insertion need to acquire the lock, and we know that the defuse loop only runs after the tenant has set its `TenantState` to `Stopping`. So if we hold the `offloaded_timelines` lock, and know that the `TenantState` is not `Stopping`, then we know that the defuse loop has not ran yet, and holding the lock ensures that it doesn't start running while we are inserting the offloaded timeline. Fixes #10070 --- pageserver/src/tenant/timeline/offload.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3bfbfb5061..15628a9645 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,5 +1,7 @@ use std::sync::Arc; +use pageserver_api::models::TenantState; + use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -70,6 +72,15 @@ pub(crate) async fn offload_timeline( { let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); + if matches!( + tenant.current_state(), + TenantState::Stopping { .. } | TenantState::Broken { .. } + ) { + // Cancel the operation if the tenant is shutting down. Do this while the + // timelines_offloaded lock is held to prevent a race with Tenant::shutdown + // for defusing the lock + return Err(OffloadError::Cancelled); + } offloaded_timelines.insert( timeline.timeline_id, Arc::new(