From 89a65a9e5a30c7525d165d1a9c2675d05811bfcb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 13:39:58 +0100 Subject: [PATCH] pageserver: improve handling of archival_config calls during Timeline shutdown (#9415) ## Problem In test `test_timeline_offloading`, we see failures like: ``` PageserverApiException: queue is in state Stopped ``` Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/11356917668/index.html#testresult/ff0e348a78a974ee/retries ## Summary of changes - Amend code paths that handle errors from RemoteTimelineClient to check for cancellation and emit the Cancelled error variant in these cases (will give clients a 503 to retry) - Remove the implicit `#[from]` for the Other error case, to make it harder to add code that accidentally squashes errors into this (500-equivalent) error variant. This would be neater if we made RemoteTimelineClient return a structured error instead of anyhow::Error, but that's a bigger refactor. I'm not sure if the test really intends to hit this path, but the error handling fix makes sense either way. --- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 44d1bb74ca..20925c7fd6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::remote_timeline_client::upload::upload_index_part; -use self::remote_timeline_client::RemoteTimelineClient; +use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; @@ -632,7 +632,7 @@ pub enum TimelineArchivalError { AlreadyInProgress, #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl Debug for TimelineArchivalError { @@ -1602,7 +1602,8 @@ impl Tenant { "failed to load remote timeline {} for tenant {}", timeline_id, self.tenant_shard_id ) - })?; + }) + .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); if let Some(timeline) = timelines.get(&timeline_id) { let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); @@ -1672,9 +1673,19 @@ impl Tenant { }; // Third part: upload new timeline archival state and block until it is present in S3 - let upload_needed = timeline + let upload_needed = match timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(new_state)?; + .schedule_index_upload_for_timeline_archival_state(new_state) + { + Ok(upload_needed) => upload_needed, + Err(e) => { + if timeline.cancel.is_cancelled() { + return Err(TimelineArchivalError::Cancelled); + } else { + return Err(TimelineArchivalError::Other(e)); + } + } + }; if upload_needed { info!("Uploading new state"); @@ -1685,7 +1696,14 @@ impl Tenant { tracing::warn!("reached timeout for waiting on upload queue"); return Err(TimelineArchivalError::Timeout); }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + v.map_err(|e| match e { + WaitCompletionError::NotInitialized(e) => { + TimelineArchivalError::Other(anyhow::anyhow!(e)) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + TimelineArchivalError::Cancelled + } + })?; } Ok(()) }