diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index db739f1033..168e565edb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -457,6 +457,21 @@ pub(crate) enum GetVectoredError { InvalidLsn(Lsn), } +#[derive(thiserror::Error, Debug)] +pub(crate) enum GetReadyAncestorError { + #[error("ancestor timeline {0} is being stopped")] + AncestorStopping(TimelineId), + + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -535,6 +550,18 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match e { + AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), + AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + Cancelled => PageReconstructError::Cancelled, + Other(other) => PageReconstructError::Other(other), + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -2400,60 +2427,8 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = match timeline.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(PageReconstructError::from(e)), - }; - // It's possible that the ancestor timeline isn't active yet, or - // is active but hasn't yet caught up to the branch point. Wait - // for it. - // - // This cannot happen while the pageserver is running normally, - // because you cannot create a branch from a point that isn't - // present in the pageserver yet. However, we don't wait for the - // branch point to be uploaded to cloud storage before creating - // a branch. I.e., the branch LSN need not be remote consistent - // for the branching operation to succeed. - // - // Hence, if we try to load a tenant in such a state where - // 1. the existence of the branch was persisted (in IndexPart and/or locally) - // 2. but the ancestor state is behind branch_lsn because it was not yet persisted - // then we will need to wait for the ancestor timeline to - // re-stream WAL up to branch_lsn before we access it. - // - // How can a tenant get in such a state? - // - ungraceful pageserver process exit - // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 - // - // NB: this could be avoided by requiring - // branch_lsn >= remote_consistent_lsn - // during branch creation. - match ancestor.wait_to_become_active(ctx).await { - Ok(()) => {} - Err(TimelineState::Stopping) => { - return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id)); - } - Err(state) => { - return Err(PageReconstructError::Other(anyhow::anyhow!( - "Timeline {} will not become active. Current state: {:?}", - ancestor.timeline_id, - &state, - ))); - } - } - ancestor - .wait_lsn(timeline.ancestor_lsn, ctx) - .await - .map_err(|e| match e { - e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), - WaitLsnError::Shutdown => PageReconstructError::Cancelled, - e @ WaitLsnError::BadState => { - PageReconstructError::Other(anyhow::anyhow!(e)) - } - })?; - - timeline_owned = ancestor; + timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); continue 'outer; @@ -2583,6 +2558,66 @@ impl Timeline { Some((lsn, img)) } + async fn get_ready_ancestor_timeline( + &self, + ctx: &RequestContext, + ) -> Result, GetReadyAncestorError> { + let ancestor = match self.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return Err(GetReadyAncestorError::from(e)), + }; + + // It's possible that the ancestor timeline isn't active yet, or + // is active but hasn't yet caught up to the branch point. Wait + // for it. + // + // This cannot happen while the pageserver is running normally, + // because you cannot create a branch from a point that isn't + // present in the pageserver yet. However, we don't wait for the + // branch point to be uploaded to cloud storage before creating + // a branch. I.e., the branch LSN need not be remote consistent + // for the branching operation to succeed. + // + // Hence, if we try to load a tenant in such a state where + // 1. the existence of the branch was persisted (in IndexPart and/or locally) + // 2. but the ancestor state is behind branch_lsn because it was not yet persisted + // then we will need to wait for the ancestor timeline to + // re-stream WAL up to branch_lsn before we access it. + // + // How can a tenant get in such a state? + // - ungraceful pageserver process exit + // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 + // + // NB: this could be avoided by requiring + // branch_lsn >= remote_consistent_lsn + // during branch creation. + match ancestor.wait_to_become_active(ctx).await { + Ok(()) => {} + Err(TimelineState::Stopping) => { + return Err(GetReadyAncestorError::AncestorStopping( + ancestor.timeline_id, + )); + } + Err(state) => { + return Err(GetReadyAncestorError::Other(anyhow::anyhow!( + "Timeline {} will not become active. Current state: {:?}", + ancestor.timeline_id, + &state, + ))); + } + } + ancestor + .wait_lsn(self.ancestor_lsn, ctx) + .await + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, + e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)), + })?; + + Ok(ancestor) + } + fn get_ancestor_timeline(&self) -> anyhow::Result> { let ancestor = self.ancestor_timeline.as_ref().with_context(|| { format!(