diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e3dab2fc1d..8e61d09de7 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -48,6 +48,7 @@ use timeline::compaction::GcCompactJob;
 use timeline::compaction::ScheduledCompactionTask;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
+use timeline::offload::OffloadError;
 use timeline::CompactFlags;
 use timeline::CompactOptions;
 use timeline::CompactionError;
@@ -2039,7 +2040,7 @@ impl Tenant {
     ) -> Result<Arc<Timeline>, TimelineArchivalError> {
         info!("unoffloading timeline");

-        // We activate the timeline below manually, so this must be called on an active timeline.
+        // We activate the timeline below manually, so this must be called on an active tenant.
         // We expect callers of this function to ensure this.
         match self.current_state() {
             TenantState::Activating { .. }
@@ -3100,9 +3101,17 @@ impl Tenant {
                 };
                 has_pending_task |= pending_task_left.unwrap_or(false);
                 if pending_task_left == Some(false) && *can_offload {
-                    offload_timeline(self, timeline)
+                    pausable_failpoint!("before-timeline-auto-offload");
+                    match offload_timeline(self, timeline)
                         .instrument(info_span!("offload_timeline", %timeline_id))
-                        .await?;
+                        .await
+                    {
+                        Err(OffloadError::NotArchived) => {
+                            // Ignore this, we likely raced with unarchival
+                            Ok(())
+                        }
+                        other => other,
+                    }?;
                 }
             }
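The tenant.rs hunk above makes the compaction loop treat a lost race as a no-op: if offloading fails with `OffloadError::NotArchived`, the timeline was unarchived in the meantime and the iteration simply continues. A minimal, self-contained sketch of this map-benign-error-to-`Ok` pattern, with stand-in types rather than the pageserver's:

#[derive(Debug)]
enum OffloadError {
    NotArchived,
    Other(String),
}

fn try_offload(archived: bool) -> Result<(), OffloadError> {
    // Stand-in for offload_timeline(): fails if the timeline was
    // unarchived between the archival check and the offload attempt.
    if archived {
        Ok(())
    } else {
        Err(OffloadError::NotArchived)
    }
}

fn compaction_iteration(archived: bool) -> Result<(), OffloadError> {
    match try_offload(archived) {
        // Losing the race against unarchival is expected and benign:
        // swallow it instead of failing the whole compaction pass.
        Err(OffloadError::NotArchived) => Ok(()),
        other => other,
    }
}

fn main() {
    assert!(compaction_iteration(false).is_ok());
}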
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index b27ac3e933..813111245d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -304,6 +304,15 @@ pub enum WaitCompletionError {
 #[derive(Debug, thiserror::Error)]
 #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
 pub struct UploadQueueNotReadyError;
+
+#[derive(Debug, thiserror::Error)]
+pub enum ShutdownIfArchivedError {
+    #[error(transparent)]
+    NotInitialized(NotInitialized),
+    #[error("timeline is not archived")]
+    NotArchived,
+}
+
 /// Behavioral modes that enable seamless live migration.
 ///
 /// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
@@ -816,6 +825,55 @@ impl RemoteTimelineClient {
         Ok(need_wait)
     }

+    /// Shuts the timeline client down, but only if the timeline is archived.
+    ///
+    /// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the
+    /// same lock to prevent races between unarchival and offloading: unarchival requires the
+    /// upload queue to be initialized, and leaves behind an upload queue where either dirty
+    /// or clean has an `archived_at` of `None`. Offloading leaves behind an uninitialized
+    /// upload queue.
+    pub(crate) async fn shutdown_if_archived(
+        self: &Arc<Self>,
+    ) -> Result<(), ShutdownIfArchivedError> {
+        {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard
+                .initialized_mut()
+                .map_err(ShutdownIfArchivedError::NotInitialized)?;
+
+            match (
+                upload_queue.dirty.archived_at.is_none(),
+                upload_queue.clean.0.archived_at.is_none(),
+            ) {
+                // The expected case: the timeline is archived and we don't want to unarchive
+                (false, false) => {}
+                (true, false) => {
+                    tracing::info!("can't shut down timeline: timeline slated for unarchival");
+                    return Err(ShutdownIfArchivedError::NotArchived);
+                }
+                (dirty_archived, true) => {
+                    tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage");
+                    return Err(ShutdownIfArchivedError::NotArchived);
+                }
+            }
+
+            // Set the shutting_down flag while the guard from the archival check is held.
+            // This prevents a race with unarchival, as initialized_mut will not return
+            // an upload queue from this point on.
+            // Also launch the queued tasks like shutdown() does.
+            if !upload_queue.shutting_down {
+                upload_queue.shutting_down = true;
+                upload_queue.queued_operations.push_back(UploadOp::Shutdown);
+                // this operation is not counted, similar to Barrier
+                self.launch_queued_tasks(upload_queue);
+            }
+        }
+
+        self.shutdown().await;
+
+        Ok(())
+    }
+
     /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
     pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
         self: &Arc<Self>,
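The doc comment above states the core invariant: the archival check and the `shutting_down` flip happen under a single acquisition of the upload queue lock, and unarchival goes through `initialized_mut` on that same lock, so neither side can interleave with the other. A standalone sketch of that check-and-set-under-one-lock idea, with illustrative names rather than the real pageserver types:

use std::sync::Mutex;

struct UploadQueue {
    archived: bool,
    shutting_down: bool,
}

struct Client {
    // `None` models an uninitialized (already stopped) upload queue.
    queue: Mutex<Option<UploadQueue>>,
}

impl Client {
    /// Begin shutdown only if the timeline is still archived. The check and
    /// the `shutting_down` flip happen under one lock acquisition, so an
    /// unarchival request can never slip in between them.
    fn shutdown_if_archived(&self) -> Result<(), &'static str> {
        let mut guard = self.queue.lock().unwrap();
        let q = guard.as_mut().ok_or("not initialized")?;
        if !q.archived {
            return Err("not archived");
        }
        q.shutting_down = true; // set while the same guard is held
        Ok(())
    }

    /// Unarchival path: refuses to proceed once shutdown has begun.
    fn unarchive(&self) -> Result<(), &'static str> {
        let mut guard = self.queue.lock().unwrap();
        let q = guard.as_mut().ok_or("not initialized")?;
        if q.shutting_down {
            return Err("shutting down");
        }
        q.archived = false;
        Ok(())
    }
}

fn main() {
    let client = Client {
        queue: Mutex::new(Some(UploadQueue { archived: true, shutting_down: false })),
    };
    client.shutdown_if_archived().unwrap();
    // Too late: offloading already claimed the queue.
    assert_eq!(client.unarchive(), Err("shutting down"));
}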
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 47a93b19d2..ae44af3fad 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -194,7 +194,9 @@ impl DeleteTimelineFlow {
         super::debug_assert_current_span_has_tenant_and_timeline_id();

         let allow_offloaded_children = false;
-        let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
+        let set_stopping = true;
+        let (timeline, mut guard) =
+            Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;

         guard.mark_in_progress()?;

@@ -334,6 +336,7 @@ impl DeleteTimelineFlow {
         tenant: &Tenant,
         timeline_id: TimelineId,
         allow_offloaded_children: bool,
+        set_stopping: bool,
     ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
         // Note the interaction between this guard and deletion guard.
         // Here we attempt to lock deletion guard when we're holding a lock on timelines.
@@ -389,8 +392,10 @@ impl DeleteTimelineFlow {
             }
         };

-        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
-            timeline.set_state(TimelineState::Stopping);
+        if set_stopping {
+            if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
+                timeline.set_state(TimelineState::Stopping);
+            }
         }

         Ok((timeline, delete_lock_guard))
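The new `set_stopping` flag lets the two callers of `prepare` diverge: deletion stops the timeline up front, while offloading keeps it Active until the archival check has passed, so a timeline that loses the offload race is left untouched. A toy sketch of that control flow (hypothetical free functions, not the real signatures):

#[derive(Debug, PartialEq)]
enum TimelineState {
    Active,
    Stopping,
}

/// Shared preparation step for deletion and offloading.
fn prepare(state: &mut TimelineState, set_stopping: bool) {
    // ... guard acquisition and child checks would live here ...
    if set_stopping {
        *state = TimelineState::Stopping;
    }
}

fn delete(state: &mut TimelineState) {
    // Deletion is irreversible, so stopping the timeline up front is safe.
    prepare(state, true);
}

fn offload(state: &mut TimelineState, archived: bool) -> Result<(), &'static str> {
    // Offloading may still be aborted by a concurrent unarchival, so the
    // timeline stays Active until the archival check has passed.
    prepare(state, false);
    if !archived {
        return Err("not archived");
    }
    *state = TimelineState::Stopping;
    Ok(())
}

fn main() {
    let mut s = TimelineState::Active;
    assert_eq!(offload(&mut s, false), Err("not archived"));
    assert_eq!(s, TimelineState::Active); // unarchived timeline left untouched
    delete(&mut s);
    assert_eq!(s, TimelineState::Stopping);
}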
diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs
index 15628a9645..6c6b19e8b1 100644
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -1,10 +1,11 @@
 use std::sync::Arc;

-use pageserver_api::models::TenantState;
+use pageserver_api::models::{TenantState, TimelineState};

 use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
 use super::Timeline;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
 use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};

 #[derive(thiserror::Error, Debug)]
@@ -36,28 +37,29 @@ pub(crate) async fn offload_timeline(
     tracing::info!("offloading archived timeline");

     let allow_offloaded_children = true;
-    let (timeline, guard) =
-        DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
-            .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
+    let set_stopping = false;
+    let (timeline, guard) = DeleteTimelineFlow::prepare(
+        tenant,
+        timeline.timeline_id,
+        allow_offloaded_children,
+        set_stopping,
+    )
+    .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;

     let TimelineOrOffloaded::Timeline(timeline) = timeline else {
         tracing::error!("timeline already offloaded, but given timeline object");
         return Ok(());
     };

-    let is_archived = timeline.is_archived();
-    match is_archived {
-        Some(true) => (),
-        Some(false) => {
-            tracing::warn!("tried offloading a non-archived timeline");
-            return Err(OffloadError::NotArchived);
-        }
-        None => {
-            // This is legal: calls to this function can race with the timeline shutting down
-            tracing::info!("tried offloading a timeline whose remote storage is not initialized");
-            return Err(OffloadError::Cancelled);
+    match timeline.remote_client.shutdown_if_archived().await {
+        Ok(()) => {}
+        Err(ShutdownIfArchivedError::NotInitialized(_)) => {
+            // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
+            // Don't return cancelled here to keep it idempotent.
         }
+        Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
     }
+
+    timeline.set_state(TimelineState::Stopping);

     // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
     timeline.shutdown(super::ShutdownMode::Reload).await;
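Note the idempotency decision in the hunk above: a retried offload finds the upload queue already shut down, and `NotInitialized` is swallowed rather than surfaced as `Cancelled`, so repeated attempts converge on the same end state. Sketched with stand-in types:

#[derive(Debug)]
enum ShutdownIfArchivedError {
    NotInitialized,
    NotArchived,
}

fn shutdown_if_archived(queue_initialized: bool) -> Result<(), ShutdownIfArchivedError> {
    if !queue_initialized {
        return Err(ShutdownIfArchivedError::NotInitialized);
    }
    Ok(())
}

fn offload(queue_initialized: bool) -> Result<(), &'static str> {
    match shutdown_if_archived(queue_initialized) {
        Ok(()) => {}
        // A retry after a partially completed offload finds the queue
        // already stopped; treat that as progress, not as a cancellation.
        Err(ShutdownIfArchivedError::NotInitialized) => {}
        Err(ShutdownIfArchivedError::NotArchived) => return Err("not archived"),
    }
    // ... local directory cleanup, manifest upload, etc.
    Ok(())
}

fn main() {
    assert!(offload(true).is_ok());
    assert!(offload(false).is_ok()); // retry converges instead of erroring
}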
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 9b3a48add9..bec8270582 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -959,3 +959,103 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder):
     assert gc_summary["remote_storage_errors"] == 0
     assert gc_summary["indices_deleted"] > 0
     assert gc_summary["tenant_manifests_deleted"] > 0
+
+
+@pytest.mark.parametrize("end_with_offloaded", [False, True])
+def test_timeline_offload_race_unarchive(
+    neon_env_builder: NeonEnvBuilder, end_with_offloaded: bool
+):
+    """
+    Ensure that unarchival and timeline offloading don't race each other
+    """
+    # Regression test for issue https://github.com/neondatabase/neon/issues/10220
+    # (automatic) timeline offloading defaults to false for now
+    neon_env_builder.pageserver_config_override = "timeline_offloading = true"
+
+    failpoint = "before-timeline-auto-offload"
+
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    # Turn off the gc loop, but keep a short compaction period:
+    # automatic offloading is attempted from the compaction task
+    tenant_id, initial_timeline_id = env.create_tenant(
+        conf={
+            "gc_period": "0s",
+            "compaction_period": "1s",
+        }
+    )
+
+    # Create a branch
+    leaf_timeline_id = env.create_branch("test_ancestor_branch_archive", tenant_id)
+
+    # write some stuff to the leaf
+    with env.endpoints.create_start(
+        "test_ancestor_branch_archive", tenant_id=tenant_id
+    ) as endpoint:
+        endpoint.safe_psql_many(
+            [
+                "CREATE TABLE foo(key serial primary key, t text default 'data_content')",
+                "INSERT INTO foo SELECT FROM generate_series(1,1000)",
+            ]
+        )
+        sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1")
+
+    ps_http.configure_failpoints((failpoint, "pause"))
+
+    ps_http.timeline_archival_config(
+        tenant_id,
+        leaf_timeline_id,
+        state=TimelineArchivalState.ARCHIVED,
+    )
+    leaf_detail = ps_http.timeline_detail(
+        tenant_id,
+        leaf_timeline_id,
+    )
+    assert leaf_detail["is_archived"] is True
+
+    # The actual race: get the compaction task to right before
+    # offloading the timeline and attempt to unarchive it
+    wait_until(lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}"))
+
+    # This unarchival should go through
+    ps_http.timeline_archival_config(
+        tenant_id,
+        leaf_timeline_id,
+        state=TimelineArchivalState.UNARCHIVED,
+    )
+
+    def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
+        # TODO add a proper API to check if a timeline has been offloaded or not
+        return not any(
+            timeline["timeline_id"] == str(timeline_id)
+            for timeline in ps_http.timeline_list(tenant_id=tenant_id)
+        )
+
+    def leaf_offloaded():
+        assert timeline_offloaded_api(leaf_timeline_id)
+
+    # Ensure that we've hit the failed offload attempt
+    ps_http.configure_failpoints((failpoint, "off"))
+    wait_until(
+        lambda: env.pageserver.assert_log_contains(
+            f".*compaction_loop.*offload_timeline.*{leaf_timeline_id}.*can't shut down timeline.*"
+        )
+    )
+
+    with env.endpoints.create_start(
+        "test_ancestor_branch_archive", tenant_id=tenant_id
+    ) as endpoint:
+        sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1")
+        assert sum == sum_again
+
+    if end_with_offloaded:
+        # Ensure that offloading still works after all of this
+        ps_http.timeline_archival_config(
+            tenant_id,
+            leaf_timeline_id,
+            state=TimelineArchivalState.ARCHIVED,
+        )
+        wait_until(leaf_offloaded)
+    else:
+        # Test that deletion of the leaf timeline works
+        ps_http.timeline_delete(tenant_id, leaf_timeline_id)
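The test drives one specific interleaving: the compaction task is parked at the failpoint, unarchival completes in the foreground, and the resumed task must take the benign `NotArchived` path. The same choreography as a toy Rust program, with a channel standing in for the failpoint machinery:

use std::sync::mpsc;
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    let archived = Arc::new(Mutex::new(true));
    let (resume_tx, resume_rx) = mpsc::channel::<()>();

    // Background "compaction task": parked right before offloading,
    // like the pageserver paused at before-timeline-auto-offload.
    let archived_bg = Arc::clone(&archived);
    let offloader = thread::spawn(move || {
        resume_rx.recv().unwrap(); // blocked on the "failpoint"
        if *archived_bg.lock().unwrap() {
            "offloaded"
        } else {
            "skipped: raced with unarchival"
        }
    });

    // Foreground unarchival wins the race while the task is paused.
    *archived.lock().unwrap() = false;
    resume_tx.send(()).unwrap(); // failpoint switched to "off"

    assert_eq!(offloader.join().unwrap(), "skipped: raced with unarchival");
}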