From 61f267d8f9bcb5c86013ecacdd8a588379337352 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Tue, 29 Jul 2025 14:33:02 +0200
Subject: [PATCH] pageserver: only retry `WaitForActiveTimeout` during shard resolution (#12772)

## Problem

In https://github.com/neondatabase/neon/pull/12467, timeouts and retries were
added to `Cache::get` tenant shard resolution to paper over an issue with read
unavailability during shard splits. However, this retries _all_ errors,
including irrecoverable errors like `NotFound`.

This causes problems with gRPC child shard routing in #12702, which targets
specific shards with `ShardSelector::Known` and relies on prompt `NotFound`
errors to reroute requests to child shards. These retries introduce a 1s delay
for all reads during child routing.

The broader problem of read unavailability during shard splits is left as
future work, see https://databricks.atlassian.net/browse/LKB-672.

Touches #12702.
Touches [LKB-191](https://databricks.atlassian.net/browse/LKB-191).

## Summary of changes

* Change `TenantManager` to always return a concrete `GetActiveTimelineError`.
* Only retry `WaitForActiveTimeout` errors.
* Lots of code unindentation due to the simplified error handling.

Out of caution, we do not gate the retries on `ShardSelector`, since this can
trigger other races. Improvements here are left as future work.
---
 pageserver/src/page_service.rs           |   9 --
 pageserver/src/tenant/mgr.rs             |   6 +
 pageserver/src/tenant/timeline/handle.rs | 157 +++++++++++------------
 3 files changed, 82 insertions(+), 90 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index a0998a7598..b3bc42a55e 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -466,13 +466,6 @@ impl TimelineHandles {
         self.handles
             .get(timeline_id, shard_selector, &self.wrapper)
             .await
-            .map_err(|e| match e {
-                timeline::handle::GetError::TenantManager(e) => e,
-                timeline::handle::GetError::PerTimelineStateShutDown => {
-                    trace!("per-timeline state shut down");
-                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
-                }
-            })
     }
 
     fn tenant_id(&self) -> Option<TenantId> {
@@ -488,11 +481,9 @@ pub(crate) struct TenantManagerWrapper {
     tenant_id: once_cell::sync::OnceCell<TenantId>,
 }
 
-#[derive(Debug)]
 pub(crate) struct TenantManagerTypes;
 
 impl timeline::handle::Types for TenantManagerTypes {
-    type TenantManagerError = GetActiveTimelineError;
     type TenantManager = TenantManagerWrapper;
     type Timeline = TenantManagerCacheItem;
 }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index b47bab16d8..4432b4bba8 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1522,6 +1522,12 @@ impl TenantManager {
         self.resources.deletion_queue_client.flush_advisory();
 
         // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
+        //
+        // TODO: keeping the parent as InProgress while spawning the children causes read
+        // unavailability, as we can't acquire a timeline handle for it. The parent should be
+        // available for reads until the children are ready -- potentially until *all* subsplits
+        // across all parent shards are complete and the compute has been notified. See:
+        // .
         drop(tenant);
         let mut parent_slot_guard =
             self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 3570cab301..537b9ff373 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -224,11 +224,11 @@ use tracing::{instrument, trace};
 use utils::id::TimelineId;
 use utils::shard::{ShardIndex, ShardNumber};
 
-use crate::tenant::mgr::ShardSelector;
+use crate::page_service::GetActiveTimelineError;
+use crate::tenant::GetTimelineError;
+use crate::tenant::mgr::{GetActiveTenantError, ShardSelector};
 
-/// The requirement for Debug is so that #[derive(Debug)] works in some places.
-pub(crate) trait Types: Sized + std::fmt::Debug {
-    type TenantManagerError: Sized + std::fmt::Debug;
+pub(crate) trait Types: Sized {
     type TenantManager: TenantManager<Self> + Sized;
     type Timeline: Timeline<Self> + Sized;
 }
@@ -307,12 +307,11 @@ impl Default for PerTimelineState {
 /// Abstract view of [`crate::tenant::mgr`], for testability.
 pub(crate) trait TenantManager<T: Types> {
     /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
-    /// Errors are returned as [`GetError::TenantManager`].
     async fn resolve(
         &self,
         timeline_id: TimelineId,
         shard_selector: ShardSelector,
-    ) -> Result<T::Timeline, T::TenantManagerError>;
+    ) -> Result<T::Timeline, GetActiveTimelineError>;
 }
 
 /// Abstract view of an [`Arc<Timeline>`], for testability.
@@ -322,13 +321,6 @@ pub(crate) trait Timeline {
     fn shard_timeline_id(&self) -> ShardTimelineId;
     fn per_timeline_state(&self) -> &PerTimelineState<T>;
 }
-/// Errors returned by [`Cache::get`].
-#[derive(Debug)]
-pub(crate) enum GetError<T: Types> {
-    TenantManager(T::TenantManagerError),
-    PerTimelineStateShutDown,
-}
-
 /// Internal type used in [`Cache::get`].
 enum RoutingResult<T: Types> {
     FastPath(Handle<T>),
@@ -345,7 +337,7 @@ impl Cache {
         timeline_id: TimelineId,
         shard_selector: ShardSelector,
         tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
+    ) -> Result<Handle<T>, GetActiveTimelineError> {
         const GET_MAX_RETRIES: usize = 10;
         const RETRY_BACKOFF: Duration = Duration::from_millis(100);
         let mut attempt = 0;
@@ -356,7 +348,11 @@ impl Cache {
                 .await
             {
                 Ok(handle) => return Ok(handle),
-                Err(e) => {
+                Err(
+                    e @ GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout {
+                        ..
+                    }),
+                ) => {
                     // Retry on tenant manager error to handle tenant split more gracefully
                     if attempt < GET_MAX_RETRIES {
                         tokio::time::sleep(RETRY_BACKOFF).await;
@@ -370,6 +366,7 @@ impl Cache {
                         return Err(e);
                     }
                 }
+                Err(err) => return Err(err),
             }
         }
     }
@@ -388,7 +385,7 @@ impl Cache {
         timeline_id: TimelineId,
         shard_selector: ShardSelector,
         tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
+    ) -> Result<Handle<T>, GetActiveTimelineError> {
         // terminates because when every iteration we remove an element from the map
         let miss: ShardSelector = loop {
             let routing_state = self.shard_routing(timeline_id, shard_selector);
@@ -468,60 +465,50 @@ impl Cache {
         timeline_id: TimelineId,
         shard_selector: ShardSelector,
         tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        match tenant_manager.resolve(timeline_id, shard_selector).await {
-            Ok(timeline) => {
-                let key = timeline.shard_timeline_id();
-                match &shard_selector {
-                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
-                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
-                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
-                }
-
-                trace!("creating new HandleInner");
-                let timeline = Arc::new(timeline);
-                let handle_inner_arc =
-                    Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
-                let handle_weak = WeakHandle {
-                    inner: Arc::downgrade(&handle_inner_arc),
-                };
-                let handle = handle_weak
-                    .upgrade()
-                    .ok()
-                    .expect("we just created it and it's not linked anywhere yet");
-                {
-                    let mut lock_guard = timeline
-                        .per_timeline_state()
-                        .handles
-                        .lock()
-                        .expect("mutex poisoned");
-                    match &mut *lock_guard {
-                        Some(per_timeline_state) => {
-                            let replaced =
-                                per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
-                            assert!(replaced.is_none(), "some earlier code left a stale handle");
-                            match self.map.entry(key) {
-                                hash_map::Entry::Occupied(_o) => {
-                                    // This cannot not happen because
-                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
-                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
-                                    // while we were waiting for the tenant manager.
-                                    unreachable!()
-                                }
-                                hash_map::Entry::Vacant(v) => {
-                                    v.insert(handle_weak);
-                                }
-                            }
-                        }
-                        None => {
-                            return Err(GetError::PerTimelineStateShutDown);
-                        }
-                    }
-                }
-                Ok(handle)
-            }
-            Err(e) => Err(GetError::TenantManager(e)),
+    ) -> Result<Handle<T>, GetActiveTimelineError> {
+        let timeline = tenant_manager.resolve(timeline_id, shard_selector).await?;
+        let key = timeline.shard_timeline_id();
+        match &shard_selector {
+            ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
+            ShardSelector::Page(_) => (), // gotta trust tenant_manager
+            ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
         }
+
+        trace!("creating new HandleInner");
+        let timeline = Arc::new(timeline);
+        let handle_inner_arc = Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
+        let handle_weak = WeakHandle {
+            inner: Arc::downgrade(&handle_inner_arc),
+        };
+        let handle = handle_weak
+            .upgrade()
+            .ok()
+            .expect("we just created it and it's not linked anywhere yet");
+        let mut lock_guard = timeline
+            .per_timeline_state()
+            .handles
+            .lock()
+            .expect("mutex poisoned");
+        let Some(per_timeline_state) = &mut *lock_guard else {
+            return Err(GetActiveTimelineError::Timeline(
+                GetTimelineError::ShuttingDown,
+            ));
+        };
+        let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
+        assert!(replaced.is_none(), "some earlier code left a stale handle");
+        match self.map.entry(key) {
+            hash_map::Entry::Occupied(_o) => {
+                // This cannot not happen because
+                // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
+                // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
+                // while we were waiting for the tenant manager.
+                unreachable!()
+            }
+            hash_map::Entry::Vacant(v) => {
+                v.insert(handle_weak);
+            }
+        }
+        Ok(handle)
     }
 }
 
@@ -655,7 +642,8 @@ mod tests {
     use pageserver_api::models::ShardParameters;
     use pageserver_api::reltag::RelTag;
     use pageserver_api::shard::DEFAULT_STRIPE_SIZE;
-    use utils::shard::ShardCount;
+    use utils::id::TenantId;
+    use utils::shard::{ShardCount, TenantShardId};
     use utils::sync::gate::GateGuard;
 
     use super::*;
@@ -665,7 +653,6 @@ mod tests {
     #[derive(Debug)]
     struct TestTypes;
     impl Types for TestTypes {
-        type TenantManagerError = anyhow::Error;
         type TenantManager = StubManager;
         type Timeline = Entered;
     }
@@ -716,40 +703,48 @@ mod tests {
             &self,
             timeline_id: TimelineId,
             shard_selector: ShardSelector,
-        ) -> anyhow::Result<Entered> {
+        ) -> Result<Entered, GetActiveTimelineError> {
+            fn enter_gate(
+                timeline: &StubTimeline,
+            ) -> Result<Arc<GateGuard>, GetActiveTimelineError> {
+                Ok(Arc::new(timeline.gate.enter().map_err(|_| {
+                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
+                })?))
+            }
+
             for timeline in &self.shards {
                 if timeline.id == timeline_id {
-                    let enter_gate = || {
-                        let gate_guard = timeline.gate.enter()?;
-                        let gate_guard = Arc::new(gate_guard);
-                        anyhow::Ok(gate_guard)
-                    };
                     match &shard_selector {
                         ShardSelector::Zero if timeline.shard.is_shard_zero() => {
                             return Ok(Entered {
                                 timeline: Arc::clone(timeline),
-                                gate_guard: enter_gate()?,
+                                gate_guard: enter_gate(timeline)?,
                             });
                         }
                         ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
                             return Ok(Entered {
                                 timeline: Arc::clone(timeline),
-                                gate_guard: enter_gate()?,
+                                gate_guard: enter_gate(timeline)?,
                             });
                         }
                         ShardSelector::Page(_) => continue,
                         ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
                             return Ok(Entered {
                                 timeline: Arc::clone(timeline),
-                                gate_guard: enter_gate()?,
+                                gate_guard: enter_gate(timeline)?,
                             });
                         }
                         ShardSelector::Known(_) => continue,
                     }
                 }
             }
-            anyhow::bail!("not found")
+            Err(GetActiveTimelineError::Timeline(
+                GetTimelineError::NotFound {
+                    tenant_id: TenantShardId::unsharded(TenantId::from([0; 16])),
+                    timeline_id,
+                },
+            ))
         }
     }
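
As an illustrative sketch (not part of the diff above), the retry policy that the `Cache::get` hunks implement boils down to the following. The standalone `resolve_with_retries` helper and its closure argument are invented for this example; in the patch the loop lives directly inside `Cache::get`. Only `WaitForActiveTimeout` during shard resolution is retried with a fixed backoff, while every other error, such as `NotFound` during child shard routing, is returned to the caller immediately:

```rust
use std::future::Future;
use std::time::Duration;

use crate::page_service::GetActiveTimelineError;
use crate::tenant::mgr::GetActiveTenantError;

/// Hypothetical helper mirroring the retry loop in `Cache::get`: retry only while
/// tenant shard resolution times out waiting for an active tenant (e.g. mid-split).
async fn resolve_with_retries<T, F, Fut>(mut resolve: F) -> Result<T, GetActiveTimelineError>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, GetActiveTimelineError>>,
{
    const GET_MAX_RETRIES: usize = 10;
    const RETRY_BACKOFF: Duration = Duration::from_millis(100);
    let mut attempt = 0;
    loop {
        attempt += 1;
        match resolve().await {
            Ok(value) => return Ok(value),
            // Transient: the shard may still be activating (e.g. during a split); back off.
            Err(GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout {
                ..
            })) if attempt < GET_MAX_RETRIES => tokio::time::sleep(RETRY_BACKOFF).await,
            // Everything else (NotFound, ShuttingDown, ...) surfaces immediately, so callers
            // such as gRPC child shard routing can reroute without a 1s delay.
            Err(err) => return Err(err),
        }
    }
}
```

As noted in the summary, gating these retries on `ShardSelector` (for example, skipping them entirely for `ShardSelector::Known`) was deliberately left out to avoid triggering other races.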