Simplify low-hanging fruit

Changes
refactor: log errors with {:#}
2026-05-20 22:50:38 +00:00 · 2023-05-16 18:00:11 +03:00 · 2023-05-16 17:09:33 +03:00 · 2023-05-11 20:01:24 +03:00 · 2023-05-11 20:00:33 +03:00 · 2023-05-10 18:21:19 +03:00
15 changed files with 1201 additions and 485 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2727,7 +2727,6 @@ dependencies = [
 "num-traits",
 "once_cell",
 "pageserver_api",
- "parking_lot",
 "pin-project-lite",
 "postgres",
 "postgres-protocol",
@@ -5425,7 +5424,6 @@ dependencies = [
 "num-bigint",
 "num-integer",
 "num-traits",
- "parking_lot",
 "prost",
 "rand",
 "regex",
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,6 +60,10 @@ pub mod tracing_span_assert;

 pub mod rate_limit;

+/// Primitive for coalescing operations into a single task which will not be cancelled by for
+/// example external http client closing the connection.
+pub mod shared_retryable;
+
 /// use with fail::cfg("$name", "return(2000)")
 #[macro_export]
 macro_rules! failpoint_sleep_millis_async {
--- a/libs/utils/src/shared_retryable.rs
+++ b/libs/utils/src/shared_retryable.rs
@@ -0,0 +1,659 @@
+use std::future::Future;
+use std::sync::Arc;
+
+/// Container using which many request handlers can come together and join a single task to
+/// completion instead of racing each other and their own cancellation.
+///
+/// In a picture:
+///
+/// ```text
+/// SharedRetryable::try_restart         Spawned task completes with only one concurrent attempt
+///                             \       /
+///      request handler 1 ---->|--X
+///      request handler 2 ---->|-------|
+///      request handler 3 ---->|-------|
+///                             |       |
+///                             v       |
+///       one spawned task      \------>/
+///
+/// (X = cancelled during await)
+/// ```
+///
+/// Implementation is cancel safe. Implementation and internal structure are hurt by the inability
+/// to just spawn the task, but this is needed for pageserver usage.
+///
+/// Implementation exposes a fully decomposed [`SharedRetryable::try_restart`] which requires the
+/// caller to do the spawning before awaiting for the result. If the caller is dropped while this
+/// happens, a new attempt will be required, and all concurrent awaiters will see a
+/// [`RetriedTaskPanicked`] error.
+///
+/// There is another "family of APIs" [`SharedRetryable::attempt_spawn`] for infallible futures. It is
+/// just provided for completeness, and it does not have a fully decomposed version like
+/// `try_restart`.
+///
+/// For `try_restart_*` family of APIs, there is a concept of two leveled results. The inner level
+/// is returned by the executed future. It needs to be `Clone`. Most errors are not `Clone`, so
+/// implementation advice is to log the happened error, and not propagate more than a label as the
+/// "inner error" which will be used to build an outer error. The outer error will also have to be
+/// convertable from [`RetriedTaskPanicked`] to absorb that case as well.
+///
+/// ## Example
+///
+/// A shared service value completes the infallible work once, even if called concurrently by
+/// multiple cancellable tasks.
+///
+/// Example moved as a test `service_example`.
+#[derive(Clone)]
+pub struct SharedRetryable<V> {
+    inner: Arc<tokio::sync::Mutex<Option<MaybeDone<V>>>>,
+}
+
+impl<V> Default for SharedRetryable<V> {
+    fn default() -> Self {
+        Self {
+            inner: Arc::new(tokio::sync::Mutex::new(None)),
+        }
+    }
+}
+
+/// Determine if an error is transient or permanent.
+pub trait Retryable {
+    fn is_permanent(&self) -> bool {
+        true
+    }
+}
+
+pub trait MakeFuture {
+    type Future: Future<Output = Self::Output> + Send + 'static;
+    type Output: Send + 'static;
+
+    fn make_future(self) -> Self::Future;
+}
+
+impl<Fun, Fut, R> MakeFuture for Fun
+where
+    Fun: FnOnce() -> Fut,
+    Fut: Future<Output = R> + Send + 'static,
+    R: Send + 'static,
+{
+    type Future = Fut;
+    type Output = R;
+
+    fn make_future(self) -> Self::Future {
+        self()
+    }
+}
+
+/// Retried task panicked, was cancelled, or never spawned (see [`SharedRetryable::try_restart`]).
+#[derive(Debug, PartialEq, Eq)]
+pub struct RetriedTaskPanicked;
+
+impl<T, E1> SharedRetryable<Result<T, E1>>
+where
+    T: Clone + std::fmt::Debug + Send + 'static,
+    E1: Retryable + Clone + std::fmt::Debug + Send + 'static,
+{
+    /// Restart a previously failed operation unless it already completed with a terminal result.
+    ///
+    /// Many futures can call this function and and get the terminal result from an earlier attempt
+    /// or start a new attempt, or join an existing one.
+    ///
+    /// Compared to `Self::try_restart`, this method also spawns the future to run, which would
+    /// otherwise have to be done manually.
+    #[cfg(test)]
+    pub async fn try_restart_spawn<E2>(
+        &self,
+        retry_with: impl MakeFuture<Output = Result<T, E1>>,
+    ) -> Result<T, E2>
+    where
+        E2: From<E1> + From<RetriedTaskPanicked> + Send + 'static,
+    {
+        let (recv, maybe_fut) = self.try_restart(retry_with).await;
+
+        if let Some(fut) = maybe_fut {
+            // top level function, we must spawn, pageserver cannot use this
+            tokio::spawn(fut);
+        }
+
+        recv.await
+    }
+
+    /// Restart a previously failed operation unless it already completed with a terminal result.
+    ///
+    /// Many futures can call this function and get the terminal result from an earlier attempt or
+    /// start a new attempt, or join an existing one.
+    ///
+    /// If a task calling this method is cancelled, at worst, a new attempt which is immediatedly
+    /// deemed as having panicked will happen, but without a panic ever happening.
+    ///
+    /// Returns one future for waiting for the result and possibly another which needs to be
+    /// spawned when `Some`. Spawning has to happen before waiting is started, otherwise the first
+    /// future will never make progress.
+    ///
+    /// This complication exists because on pageserver we cannot use `tokio::spawn` directly
+    /// at this time.
+    pub async fn try_restart<E2>(
+        &self,
+        retry_with: impl MakeFuture<Output = Result<T, E1>>,
+    ) -> (
+        impl Future<Output = Result<T, E2>> + Send + 'static,
+        Option<impl Future<Output = ()> + Send + 'static>,
+    )
+    where
+        E2: From<E1> + From<RetriedTaskPanicked> + Send + 'static,
+    {
+        use futures::future::Either;
+
+        match self.decide_to_retry_or_join(retry_with).await {
+            Ok(terminal) => (Either::Left(async move { terminal }), None),
+            Err((rx, maybe_fut)) => {
+                let recv = Self::make_oneshot_alike_receiver(rx);
+
+                (Either::Right(recv), maybe_fut)
+            }
+        }
+    }
+
+    /// Returns a Ok if the previous attempt had resulted in a terminal result. Err is returned
+    /// when an attempt can be joined and possibly needs to be spawned.
+    async fn decide_to_retry_or_join<E2>(
+        &self,
+        retry_with: impl MakeFuture<Output = Result<T, E1>>,
+    ) -> Result<
+        Result<T, E2>,
+        (
+            tokio::sync::broadcast::Receiver<Result<T, E1>>,
+            Option<impl Future<Output = ()> + Send + 'static>,
+        ),
+    >
+    where
+        E2: From<E1> + From<RetriedTaskPanicked>,
+    {
+        let mut g = self.inner.lock().await;
+
+        let maybe_rx = match g.as_ref() {
+            Some(MaybeDone::Done(Ok(t))) => return Ok(Ok(t.to_owned())),
+            Some(MaybeDone::Done(Err(e))) if e.is_permanent() => {
+                return Ok(Err(E2::from(e.to_owned())))
+            }
+            Some(MaybeDone::Pending(weak)) => {
+                // failure to upgrade can mean only one thing: there was an unexpected
+                // panic which we consider as a transient retryable error.
+                weak.upgrade()
+            }
+            Some(MaybeDone::Done(Err(_retryable))) => None,
+            None => None,
+        };
+
+        let (strong, maybe_fut) = match maybe_rx {
+            Some(strong) => (strong, None),
+            None => {
+                // new attempt
+                // panic safety: invoke the factory before configuring the pending value
+                let fut = retry_with.make_future();
+
+                let (strong, fut) = self.make_run_and_complete(fut, &mut g);
+                (strong, Some(fut))
+            }
+        };
+
+        // important: the Arc<Receiver> is not held after unlocking
+        // important: we resubscribe before lock is released to be sure to get a message which
+        // is sent once receiver is dropped
+        let rx = strong.resubscribe();
+        drop(strong);
+        Err((rx, maybe_fut))
+    }
+
+    /// Configure a new attempt, but leave spawning it to the caller.
+    ///
+    /// Returns an `Arc<Receiver<V>>` which is valid until the attempt completes, and the future
+    /// which will need to run to completion outside the lifecycle of the caller.
+    fn make_run_and_complete(
+        &self,
+        fut: impl Future<Output = Result<T, E1>> + Send + 'static,
+        g: &mut tokio::sync::MutexGuard<'_, Option<MaybeDone<Result<T, E1>>>>,
+    ) -> (
+        Arc<tokio::sync::broadcast::Receiver<Result<T, E1>>>,
+        impl Future<Output = ()> + Send + 'static,
+    ) {
+        #[cfg(debug_assertions)]
+        match &**g {
+            Some(MaybeDone::Pending(weak)) => {
+                assert!(
+                    weak.upgrade().is_none(),
+                    "when starting a restart, should no longer have an upgradeable channel"
+                );
+            }
+            Some(MaybeDone::Done(Err(err))) => {
+                assert!(
+                    !err.is_permanent(),
+                    "when restarting, the err must be transient"
+                );
+            }
+            Some(MaybeDone::Done(Ok(_))) => {
+                panic!("unexpected restart after a completion on MaybeDone");
+            }
+            None => {}
+        }
+
+        self.make_run_and_complete_any(fut, g)
+    }
+
+    /// Oneshot alike as in it's a future which will be consumed by an `await`.
+    ///
+    /// Otherwise the caller might think it's beneficial or reasonable to poll the channel multiple
+    /// times.
+    async fn make_oneshot_alike_receiver<E2>(
+        mut rx: tokio::sync::broadcast::Receiver<Result<T, E1>>,
+    ) -> Result<T, E2>
+    where
+        E2: From<E1> + From<RetriedTaskPanicked>,
+    {
+        use tokio::sync::broadcast::error::RecvError;
+
+        match rx.recv().await {
+            Ok(Ok(t)) => Ok(t),
+            Ok(Err(e)) => Err(E2::from(e)),
+            Err(RecvError::Closed | RecvError::Lagged(_)) => {
+                // lagged doesn't mean anything with 1 send, but whatever, handle it the same
+                // this case should only ever happen if a panick happened in the `fut`.
+                Err(E2::from(RetriedTaskPanicked))
+            }
+        }
+    }
+}
+
+impl<V> SharedRetryable<V>
+where
+    V: std::fmt::Debug + Clone + Send + 'static,
+{
+    /// Attempt to run once a spawned future to completion.
+    ///
+    /// Any previous attempt which panicked will be retried, but the `RetriedTaskPanicked` will be
+    /// returned when the most recent attempt panicked.
+    #[cfg(test)]
+    pub async fn attempt_spawn(
+        &self,
+        attempt_with: impl MakeFuture<Output = V>,
+    ) -> Result<V, RetriedTaskPanicked> {
+        let (rx, maybe_fut) = {
+            let mut g = self.inner.lock().await;
+
+            let maybe_rx = match g.as_ref() {
+                Some(MaybeDone::Done(v)) => return Ok(v.to_owned()),
+                Some(MaybeDone::Pending(weak)) => {
+                    // see comment in try_restart
+                    weak.upgrade()
+                }
+                None => None,
+            };
+
+            let (strong, maybe_fut) = match maybe_rx {
+                Some(strong) => (strong, None),
+                None => {
+                    let fut = attempt_with.make_future();
+
+                    let (strong, fut) = self.make_run_and_complete_any(fut, &mut g);
+                    (strong, Some(fut))
+                }
+            };
+
+            // see decide_to_retry_or_join for important notes
+            let rx = strong.resubscribe();
+            drop(strong);
+            (rx, maybe_fut)
+        };
+
+        if let Some(fut) = maybe_fut {
+            // this is a top level function, need to spawn directly
+            // from pageserver one wouldn't use this but more piecewise functions
+            tokio::spawn(fut);
+        }
+
+        let recv = Self::make_oneshot_alike_receiver_any(rx);
+
+        recv.await
+    }
+
+    /// Configure a new attempt, but leave spawning it to the caller.
+    ///
+    /// Forgetting the returned future is outside of scope of any correctness guarantees; all of
+    /// the waiters will then be deadlocked, and the MaybeDone will forever be pending. Dropping
+    /// and not running the future will then require a new attempt.
+    ///
+    /// Also returns an `Arc<Receiver<V>>` which is valid until the attempt completes.
+    fn make_run_and_complete_any(
+        &self,
+        fut: impl Future<Output = V> + Send + 'static,
+        g: &mut tokio::sync::MutexGuard<'_, Option<MaybeDone<V>>>,
+    ) -> (
+        Arc<tokio::sync::broadcast::Receiver<V>>,
+        impl Future<Output = ()> + Send + 'static,
+    ) {
+        let (tx, rx) = tokio::sync::broadcast::channel(1);
+        let strong = Arc::new(rx);
+
+        **g = Some(MaybeDone::Pending(Arc::downgrade(&strong)));
+
+        let retry = {
+            let strong = strong.clone();
+            self.clone().run_and_complete(fut, tx, strong)
+        };
+
+        #[cfg(debug_assertions)]
+        match &**g {
+            Some(MaybeDone::Pending(weak)) => {
+                let rx = weak.upgrade().expect("holding the weak and strong locally");
+                assert!(Arc::ptr_eq(&strong, &rx));
+            }
+            _ => unreachable!("MaybeDone::pending must be set after spawn_and_run_complete_any"),
+        }
+
+        (strong, retry)
+    }
+
+    /// Run the actual attempt, and communicate the response via both:
+    /// - setting the `MaybeDone::Done`
+    /// - the broadcast channel
+    async fn run_and_complete(
+        self,
+        fut: impl Future<Output = V>,
+        tx: tokio::sync::broadcast::Sender<V>,
+        strong: Arc<tokio::sync::broadcast::Receiver<V>>,
+    ) {
+        let res = fut.await;
+
+        {
+            let mut g = self.inner.lock().await;
+            MaybeDone::complete(&mut *g, &strong, res.clone());
+
+            // make the weak un-upgradeable by dropping the final alive
+            // reference to it. it is final Arc because the Arc never escapes
+            // the critical section in `decide_to_retry_or_join` or `attempt_spawn`.
+            Arc::try_unwrap(strong).expect("expected this to be the only Arc<Receiver<V>>");
+        }
+
+        // now no one can get the Pending(weak) value to upgrade and they only see
+        // the Done(res).
+        //
+        // send the result value to listeners, if any
+        drop(tx.send(res));
+    }
+
+    #[cfg(test)]
+    async fn make_oneshot_alike_receiver_any(
+        mut rx: tokio::sync::broadcast::Receiver<V>,
+    ) -> Result<V, RetriedTaskPanicked> {
+        use tokio::sync::broadcast::error::RecvError;
+
+        match rx.recv().await {
+            Ok(t) => Ok(t),
+            Err(RecvError::Closed | RecvError::Lagged(_)) => {
+                // lagged doesn't mean anything with 1 send, but whatever, handle it the same
+                // this case should only ever happen if a panick happened in the `fut`.
+                Err(RetriedTaskPanicked)
+            }
+        }
+    }
+}
+
+/// MaybeDone handles synchronization for multiple requests and the single actual task.
+///
+/// If request handlers witness `Pending` which they are able to upgrade, they are guaranteed a
+/// useful `recv().await`, where useful means "value" or "disconnect" arrives. If upgrade fails,
+/// this means that "disconnect" has happened in the past.
+///
+/// On successful execution the one executing task will set this to `Done` variant, with the actual
+/// resulting value.
+#[derive(Debug)]
+pub enum MaybeDone<V> {
+    Pending(std::sync::Weak<tokio::sync::broadcast::Receiver<V>>),
+    Done(V),
+}
+
+impl<V: std::fmt::Debug> MaybeDone<V> {
+    fn complete(
+        this: &mut Option<MaybeDone<V>>,
+        _strong: &Arc<tokio::sync::broadcast::Receiver<V>>,
+        outcome: V,
+    ) {
+        #[cfg(debug_assertions)]
+        match this {
+            Some(MaybeDone::Pending(weak)) => {
+                let same = weak
+                    .upgrade()
+                    // we don't yet have Receiver::same_channel
+                    .map(|rx| Arc::ptr_eq(_strong, &rx))
+                    .unwrap_or(false);
+                assert!(same, "different channel had been replaced or dropped");
+            }
+            other => panic!("unexpected MaybeDone: {other:?}"),
+        }
+
+        *this = Some(MaybeDone::Done(outcome));
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{RetriedTaskPanicked, Retryable, SharedRetryable};
+    use std::sync::Arc;
+
+    #[derive(Debug)]
+    enum OuterError {
+        AttemptPanicked,
+        Unlucky,
+    }
+
+    #[derive(Clone, Debug)]
+    enum InnerError {
+        Unlucky,
+    }
+
+    impl Retryable for InnerError {
+        fn is_permanent(&self) -> bool {
+            false
+        }
+    }
+
+    impl From<InnerError> for OuterError {
+        fn from(_: InnerError) -> Self {
+            OuterError::Unlucky
+        }
+    }
+
+    impl From<RetriedTaskPanicked> for OuterError {
+        fn from(_: RetriedTaskPanicked) -> Self {
+            OuterError::AttemptPanicked
+        }
+    }
+
+    #[tokio::test]
+    async fn restartable_until_permanent() {
+        let shr = SharedRetryable::<Result<u8, InnerError>>::default();
+
+        let res = shr
+            .try_restart_spawn(|| async move { panic!("really unlucky") })
+            .await;
+
+        assert!(matches!(res, Err(OuterError::AttemptPanicked)));
+
+        let res = shr
+            .try_restart_spawn(|| async move { Err(InnerError::Unlucky) })
+            .await;
+
+        assert!(matches!(res, Err(OuterError::Unlucky)));
+
+        let res = shr.try_restart_spawn(|| async move { Ok(42) }).await;
+
+        assert!(matches!(res, Ok::<u8, OuterError>(42)));
+
+        let res = shr
+            .try_restart_spawn(|| async move { panic!("rerun should clone Ok(42)") })
+            .await;
+
+        assert!(matches!(res, Ok::<u8, OuterError>(42)));
+    }
+
+    /// Demonstration of the SharedRetryable::attempt
+    #[tokio::test]
+    async fn attemptable_until_no_panic() {
+        let shr = SharedRetryable::<u8>::default();
+
+        let res = shr
+            .attempt_spawn(|| async move { panic!("should not interfere") })
+            .await;
+
+        assert!(matches!(res, Err(RetriedTaskPanicked)), "{res:?}");
+
+        let res = shr.attempt_spawn(|| async move { 42 }).await;
+
+        assert_eq!(res, Ok(42));
+
+        let res = shr
+            .attempt_spawn(|| async move { panic!("should not be called") })
+            .await;
+
+        assert_eq!(res, Ok(42));
+    }
+
+    #[tokio::test]
+    async fn cancelling_spawner_is_fine() {
+        let shr = SharedRetryable::<Result<u8, InnerError>>::default();
+
+        let (recv1, maybe_fut) = shr
+            .try_restart(|| async move { panic!("should not have been called") })
+            .await;
+        let should_be_spawned = maybe_fut.unwrap();
+
+        let (recv2, maybe_fut) = shr
+            .try_restart(|| async move {
+                panic!("should never be called because waiting on should_be_spawned")
+            })
+            .await;
+        assert!(
+            matches!(maybe_fut, None),
+            "only the first one should had created the future"
+        );
+
+        let mut recv1 = std::pin::pin!(recv1);
+        let mut recv2 = std::pin::pin!(recv2);
+
+        tokio::select! {
+            _ = tokio::time::sleep(std::time::Duration::from_millis(100)) => {},
+            _ = &mut recv1 => unreachable!("should not have completed because should_be_spawned not spawned"),
+            _ = &mut recv2 => unreachable!("should not have completed because should_be_spawned not spawned"),
+        }
+
+        drop(should_be_spawned);
+
+        let res = recv1.await;
+        assert!(matches!(res, Err(OuterError::AttemptPanicked)), "{res:?}");
+
+        let res = recv2.await;
+        assert!(matches!(res, Err(OuterError::AttemptPanicked)), "{res:?}");
+
+        // but we can still reach a terminal state if the api is not misused or the
+        // should_be_spawned winner is not cancelled
+
+        let recv1 = shr.try_restart_spawn::<OuterError>(|| async move { Ok(42) });
+        let recv2 = shr.try_restart_spawn::<OuterError>(|| async move { Ok(43) });
+
+        assert_eq!(recv1.await.unwrap(), 42);
+        assert_eq!(recv2.await.unwrap(), 42, "43 should never be returned");
+    }
+
+    #[tokio::test]
+    async fn service_example() {
+        #[derive(Debug, Clone, Copy)]
+        enum OneLevelError {
+            TaskPanicked,
+        }
+
+        impl Retryable for OneLevelError {
+            fn is_permanent(&self) -> bool {
+                // for a single level errors, this wording is weird
+                !matches!(self, OneLevelError::TaskPanicked)
+            }
+        }
+
+        impl From<RetriedTaskPanicked> for OneLevelError {
+            fn from(_: RetriedTaskPanicked) -> Self {
+                OneLevelError::TaskPanicked
+            }
+        }
+
+        #[derive(Clone, Default)]
+        struct Service(SharedRetryable<Result<u8, OneLevelError>>);
+
+        impl Service {
+            async fn work(
+                &self,
+                completions: Arc<std::sync::atomic::AtomicUsize>,
+            ) -> Result<u8, OneLevelError> {
+                self.0
+                    .try_restart_spawn(|| async move {
+                        // give time to cancel some of the tasks
+                        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                        completions.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                        Self::work_once().await
+                    })
+                    .await
+            }
+
+            async fn work_once() -> Result<u8, OneLevelError> {
+                Ok(42)
+            }
+        }
+
+        let svc = Service::default();
+
+        let mut js = tokio::task::JoinSet::new();
+
+        let barrier = Arc::new(tokio::sync::Barrier::new(10 + 1));
+        let completions = Arc::new(std::sync::atomic::AtomicUsize::new(0));
+
+        let handles = (0..10)
+            .map(|_| {
+                js.spawn({
+                    let svc = svc.clone();
+                    let barrier = barrier.clone();
+                    let completions = completions.clone();
+                    async move {
+                        // make sure all tasks are ready to start at the same time
+                        barrier.wait().await;
+                        // after successfully starting the work, any of the futures could get cancelled
+                        svc.work(completions).await
+                    }
+                })
+            })
+            .collect::<Vec<_>>();
+
+        barrier.wait().await;
+
+        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+
+        handles[5].abort();
+
+        let mut cancellations = 0;
+
+        while let Some(res) = js.join_next().await {
+            // all complete with the same result
+            match res {
+                Ok(res) => assert_eq!(res.unwrap(), 42),
+                Err(je) => {
+                    // except for the one task we cancelled; it's cancelling
+                    // does not interfere with the result
+                    assert!(je.is_cancelled());
+                    cancellations += 1;
+                    assert_eq!(cancellations, 1, "only 6th task was aborted");
+                    // however we cannot assert that everytime we get to cancel the 6th task
+                }
+            }
+        }
+
+        // there will be at most one terminal completion
+        assert_eq!(completions.load(std::sync::atomic::Ordering::Relaxed), 1);
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -75,8 +75,6 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
-# feature "send_guard" markes it so that lock guards are Send
-parking_lot = { workspace = true, default-features = false, features = [ "send_guard" ] }

 [dev-dependencies]
 criterion.workspace = true
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -272,6 +272,9 @@ pub enum TaskKind {

    #[cfg(test)]
    UnitTest,
+
+    /// Task which is the only task to delete this particular timeline
+    DeleteTimeline,
 }

 #[derive(Default)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -39,7 +39,8 @@ use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::RwLock;
+use std::sync::MutexGuard;
+use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
@@ -132,7 +133,7 @@ pub struct Tenant {
    tenant_conf: Arc<RwLock<TenantConfOpt>>,

    tenant_id: TenantId,
-    timelines: parking_lot::Mutex<HashMap<TimelineId, Arc<Timeline>>>,
+    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
    // `timelines` mutex during all GC iteration
@@ -183,7 +184,7 @@ impl UninitializedTimeline<'_> {
    /// The new timeline is initialized in Active state, and its background jobs are
    /// started
    pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result<Arc<Timeline>> {
-        let mut timelines = self.owning_tenant.timelines.lock();
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
        self.initialize_with_lock(ctx, &mut timelines, true, true)
    }

@@ -274,7 +275,7 @@ impl UninitializedTimeline<'_> {

        // Initialize without loading the layer map. We started with an empty layer map, and already
        // updated it for the layers that we created during the import.
-        let mut timelines = self.owning_tenant.timelines.lock();
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
        self.initialize_with_lock(ctx, &mut timelines, false, true)
    }

@@ -455,6 +456,50 @@ pub enum DeleteTimelineError {
    Other(#[from] anyhow::Error),
 }

+#[derive(Debug, thiserror::Error, Clone)]
+enum InnerDeleteTimelineError {
+    #[error("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")]
+    QueueUninitialized,
+
+    #[error("failpoint: {0}")]
+    Failpoint(&'static str),
+
+    #[error("failed to remove local timeline directory")]
+    FailedToRemoveLocalTimelineDirectory,
+
+    #[error("index_part.json upload failed")]
+    UploadFailed,
+
+    #[error("the deleted timeline grew branches while deleting it; tenant should now be broken")]
+    DeletedGrewChildren,
+}
+
+impl utils::shared_retryable::Retryable for InnerDeleteTimelineError {
+    fn is_permanent(&self) -> bool {
+        use InnerDeleteTimelineError::*;
+
+        match self {
+            QueueUninitialized => false,
+            Failpoint(_) => false,
+            FailedToRemoveLocalTimelineDirectory => false,
+            UploadFailed => false,
+            DeletedGrewChildren => true,
+        }
+    }
+}
+
+impl From<InnerDeleteTimelineError> for DeleteTimelineError {
+    fn from(value: InnerDeleteTimelineError) -> Self {
+        DeleteTimelineError::Other(anyhow::Error::new(value))
+    }
+}
+
+impl From<utils::shared_retryable::RetriedTaskPanicked> for DeleteTimelineError {
+    fn from(_: utils::shared_retryable::RetriedTaskPanicked) -> Self {
+        DeleteTimelineError::Other(anyhow::anyhow!("deleting timeline failed, please retry"))
+    }
+}
+
 struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
@@ -493,7 +538,7 @@ impl Tenant {

        let timeline = {
            // avoiding holding it across awaits
-            let mut timelines_accessor = self.timelines.lock();
+            let mut timelines_accessor = self.timelines.lock().unwrap();
            if timelines_accessor.contains_key(&timeline_id) {
                anyhow::bail!(
                    "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
@@ -560,6 +605,7 @@ impl Tenant {
                || timeline
                    .layers
                    .read()
+                    .unwrap()
                    .iter_historic_layers()
                    .next()
                    .is_some(),
@@ -804,7 +850,7 @@ impl Tenant {
            .context("Failed to create new timeline directory")?;

        let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
-            let timelines = self.timelines.lock();
+            let timelines = self.timelines.lock().unwrap();
            Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
                || {
                    anyhow::anyhow!(
@@ -1139,7 +1185,7 @@ impl Tenant {
        timeline_id: TimelineId,
        active_only: bool,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let timelines_accessor = self.timelines.lock();
+        let timelines_accessor = self.timelines.lock().unwrap();
        let timeline = timelines_accessor.get(&timeline_id).with_context(|| {
            format!("Timeline {}/{} was not found", self.tenant_id, timeline_id)
        })?;
@@ -1159,7 +1205,12 @@ impl Tenant {
    /// Lists timelines the tenant contains.
    /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use.
    pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
-        self.timelines.lock().values().map(Arc::clone).collect()
+        self.timelines
+            .lock()
+            .unwrap()
+            .values()
+            .map(Arc::clone)
+            .collect()
    }

    /// This is used to create the initial 'main' timeline during bootstrapping,
@@ -1177,7 +1228,7 @@ impl Tenant {
            "Cannot create empty timelines on inactive tenant"
        );

-        let timelines = self.timelines.lock();
+        let timelines = self.timelines.lock().unwrap();
        let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
        drop(timelines);

@@ -1309,7 +1360,7 @@ impl Tenant {
        // compactions.  We don't want to block everything else while the
        // compaction runs.
        let timelines_to_compact = {
-            let timelines = self.timelines.lock();
+            let timelines = self.timelines.lock().unwrap();
            let timelines_to_compact = timelines
                .iter()
                .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
@@ -1338,7 +1389,7 @@ impl Tenant {
        // flushing. We don't want to block everything else while the
        // flushing is performed.
        let timelines_to_flush = {
-            let timelines = self.timelines.lock();
+            let timelines = self.timelines.lock().unwrap();
            timelines
                .iter()
                .map(|(_id, timeline)| Arc::clone(timeline))
@@ -1354,7 +1405,7 @@ impl Tenant {

    /// Removes timeline-related in-memory data
    pub async fn delete_timeline(
-        &self,
+        self: &Arc<Tenant>,
        timeline_id: TimelineId,
        _ctx: &RequestContext,
    ) -> Result<(), DeleteTimelineError> {
@@ -1363,7 +1414,7 @@ impl Tenant {
        // Transition the timeline into TimelineState::Stopping.
        // This should prevent new operations from starting.
        let timeline = {
-            let mut timelines = self.timelines.lock();
+            let mut timelines = self.timelines.lock().unwrap();

            // Ensure that there are no child timelines **attached to that pageserver**,
            // because detach removes files, which will break child branches
@@ -1387,162 +1438,196 @@ impl Tenant {
            timeline
        };

-        // Now that the Timeline is in Stopping state, request all the related tasks to
-        // shut down.
+        let span = tracing::Span::current();
+
+        // if we have concurrent requests, we will only execute one version of following future;
+        // initially it did not have any means to be cancelled.
        //
-        // NB: If you call delete_timeline multiple times concurrently, they will
-        // all go through the motions here. Make sure the code here is idempotent,
-        // and don't error out if some of the shutdown tasks have already been
-        // completed!
+        // NOTE: Unlike "the usual" futures, this one will log any errors instead of just propagating
+        // them to the caller. This is because this one future produces a value, which will need to
+        // be cloneable to everyone interested, and normal `std::error::Error` are not clonable.
+        // Also, it wouldn't make sense to log the same failure multiple times, it would look like
+        // multiple failures to anyone reading the logs.
+        let factory = || {
+            let tenant = self.clone();
+            let tenant_id = self.tenant_id;
+            let timeline = timeline.clone();
+            let timeline_id = timeline.timeline_id;

-        // Stop the walreceiver first.
-        debug!("waiting for wal receiver to shutdown");
-        timeline.walreceiver.stop().await;
-        debug!("wal receiver shutdown confirmed");
+            async move {
+                // Now that the Timeline is in Stopping state, request all the related tasks to
+                // shut down.
+                //
+                // Stop the walreceiver first.
+                debug!("waiting for wal receiver to shutdown");
+                timeline.walreceiver.stop().await;
+                debug!("wal receiver shutdown confirmed");

-        // Prevent new uploads from starting.
-        if let Some(remote_client) = timeline.remote_client.as_ref() {
-            let res = remote_client.stop();
-            match res {
-                Ok(()) => {}
-                Err(e) => match e {
-                    remote_timeline_client::StopError::QueueUninitialized => {
-                        // This case shouldn't happen currently because the
-                        // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
-                        // That is, before we declare the Tenant as Active.
-                        // But we only allow calls to delete_timeline on Active tenants.
-                        return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
+                // Prevent new uploads from starting.
+                if let Some(remote_client) = timeline.remote_client.as_ref() {
+                    let res = remote_client.stop();
+                    match res {
+                        Ok(()) => {}
+                        Err(e) => match e {
+                            remote_timeline_client::StopError::QueueUninitialized => {
+                                // This case shouldn't happen currently because the
+                                // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
+                                // That is, before we declare the Tenant as Active.
+                                // But we only allow calls to delete_timeline on Active tenants.
+                                warn!("failed to stop RemoteTimelineClient due to uninitialized queue");
+                                return Err(InnerDeleteTimelineError::QueueUninitialized);
+                            }
+                        },
                    }
+                }
+
+                // Stop & wait for the remaining timeline tasks, including upload tasks.
+                info!("waiting for timeline tasks to shutdown");
+                task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
+
+                // Mark timeline as deleted in S3 so we won't pick it up next time
+                // during attach or pageserver restart.
+                // See comment in persist_index_part_with_deleted_flag.
+                if let Some(remote_client) = timeline.remote_client.as_ref() {
+                    match remote_client.persist_index_part_with_deleted_flag().await {
+                        // If we (now, or already) marked it successfully as deleted, we can proceed
+                        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+                        // Bail out otherwise
+                        Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+                        | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                            warn!("upload failed: {e}");
+                            return Err(InnerDeleteTimelineError::UploadFailed);
+                        }
+                    }
+                }
+
+                {
+                    // Grab the layer_removal_cs lock, and actually perform the deletion.
+                    //
+                    // This lock prevents multiple concurrent delete_timeline calls from
+                    // stepping on each other's toes, while deleting the files. It also
+                    // prevents GC or compaction from running at the same time.
+                    //
+                    // Note that there are still other race conditions between
+                    // GC, compaction and timeline deletion. GC task doesn't
+                    // register itself properly with the timeline it's
+                    // operating on. See
+                    // https://github.com/neondatabase/neon/issues/2671
+                    //
+                    // No timeout here, GC & Compaction should be responsive to the
+                    // `TimelineState::Stopping` change.
+                    info!("waiting for layer_removal_cs.lock()");
+                    let _layer_removal_guard = timeline.layer_removal_cs.lock().await;
+                    info!("got layer_removal_cs.lock(), deleting layer files");
+
+                    // NB: storage_sync upload tasks that reference these layers have been cancelled
+                    //     by us earlier.
+
+                    let local_timeline_directory =
+                        tenant.conf.timeline_path(&timeline_id, &tenant_id);
+
+                    fail::fail_point!("timeline-delete-before-rm", |_| {
+                        Err(InnerDeleteTimelineError::Failpoint(
+                            "failpoint: timeline-delete-before-rm",
+                        ))
+                    });
+
+                    // NB: This need not be atomic because the deleted flag in the IndexPart
+                    // will be observed during tenant/timeline load. The deletion will be resumed there.
+                    //
+                    // For configurations without remote storage, we tolerate that we're not crash-safe here.
+                    // The timeline may come up Active but with missing layer files, in such setups.
+                    // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
+                    match std::fs::remove_dir_all(&local_timeline_directory) {
+                        Ok(()) => {}
+                        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                            // This can happen if we're called a second time, e.g.,
+                            // because of a previous failure/cancellation at/after
+                            // failpoint timeline-delete-after-rm.
+                            //
+                            // It can also happen if we race with tenant detach, because,
+                            // it doesn't grab the layer_removal_cs lock.
+                            //
+                            // For now, log and continue.
+                            // warn! level is technically not appropriate for the
+                            // first case because we should expect retries to happen.
+                            // But the error is so rare, it seems better to get attention if it happens.
+                            let tenant_state = tenant.current_state();
+                            warn!(
+                                timeline_dir=?local_timeline_directory,
+                                ?tenant_state,
+                                "timeline directory not found, proceeding anyway"
+                            );
+                        }
+                        Err(e) => {
+                            warn!(
+                                "failed to remove local timeline directory {}: {e:#}",
+                                local_timeline_directory.display()
+                            );
+                            return Err(
+                                InnerDeleteTimelineError::FailedToRemoveLocalTimelineDirectory,
+                            );
+                        }
+                    }
+
+                    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+                }
+
+                fail::fail_point!("timeline-delete-after-rm", |_| {
+                    Err(InnerDeleteTimelineError::Failpoint(
+                        "timeline-delete-after-rm",
+                    ))
+                });
+
+                // Remove the timeline from the map or poison it if we've grown children.
+                let removed_timeline =
+                    std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+                        let mut timelines = tenant.timelines.lock().unwrap();
+                        let children_exist = timelines.iter().any(|(_, entry)| {
+                            entry.get_ancestor_timeline_id() == Some(timeline_id)
+                        });
+                        // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+                        // We already deleted the layer files, so it's probably best to panic.
+                        // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+                        if children_exist {
+                            panic!("Timeline grew children while we removed layer files");
+                        }
+                        timelines.remove(&timeline_id)
+                    }));
+
+                match removed_timeline {
+                    Ok(Some(_)) => {}
+                    Ok(None) => {
+                        // with SharedRetryable this should no longer happen
+                        warn!("no other task should had dropped the Timeline");
+                    }
+                    Err(_panic) => return Err(InnerDeleteTimelineError::DeletedGrewChildren),
+                }
+
+                Ok(())
+            }
+            // execute in the *winner's* span so we will capture the request id etc.
+            .instrument(span)
+        };
+
+        let (recv, maybe_fut) = timeline.delete_self.try_restart(factory).await;
+
+        if let Some(fut) = maybe_fut {
+            crate::task_mgr::spawn(
+                &tokio::runtime::Handle::current(),
+                TaskKind::DeleteTimeline,
+                Some(self.tenant_id()),
+                None,
+                &format!("delete_timeline {}", timeline.timeline_id),
+                false,
+                async move {
+                    fut.await;
+                    Ok(())
                },
-            }
+            );
        }

-        // Stop & wait for the remaining timeline tasks, including upload tasks.
-        // NB: This and other delete_timeline calls do not run as a task_mgr task,
-        //     so, they are not affected by this shutdown_tasks() call.
-        info!("waiting for timeline tasks to shutdown");
-        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
-
-        // Mark timeline as deleted in S3 so we won't pick it up next time
-        // during attach or pageserver restart.
-        // See comment in persist_index_part_with_deleted_flag.
-        if let Some(remote_client) = timeline.remote_client.as_ref() {
-            match remote_client.persist_index_part_with_deleted_flag().await {
-                // If we (now, or already) marked it successfully as deleted, we can proceed
-                Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
-                // Bail out otherwise
-                Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
-                | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
-                    return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
-                }
-            }
-        }
-
-        {
-            // Grab the layer_removal_cs lock, and actually perform the deletion.
-            //
-            // This lock prevents multiple concurrent delete_timeline calls from
-            // stepping on each other's toes, while deleting the files. It also
-            // prevents GC or compaction from running at the same time.
-            //
-            // Note that there are still other race conditions between
-            // GC, compaction and timeline deletion. GC task doesn't
-            // register itself properly with the timeline it's
-            // operating on. See
-            // https://github.com/neondatabase/neon/issues/2671
-            //
-            // No timeout here, GC & Compaction should be responsive to the
-            // `TimelineState::Stopping` change.
-            info!("waiting for layer_removal_cs.lock()");
-            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-            info!("got layer_removal_cs.lock(), deleting layer files");
-
-            // NB: storage_sync upload tasks that reference these layers have been cancelled
-            //     by the caller.
-
-            let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-
-            fail::fail_point!("timeline-delete-before-rm", |_| {
-                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
-            });
-
-            // NB: This need not be atomic because the deleted flag in the IndexPart
-            // will be observed during tenant/timeline load. The deletion will be resumed there.
-            //
-            // For configurations without remote storage, we tolerate that we're not crash-safe here.
-            // The timeline may come up Active but with missing layer files, in such setups.
-            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
-            match std::fs::remove_dir_all(&local_timeline_directory) {
-                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                    // This can happen if we're called a second time, e.g.,
-                    // because of a previous failure/cancellation at/after
-                    // failpoint timeline-delete-after-rm.
-                    //
-                    // It can also happen if we race with tenant detach, because,
-                    // it doesn't grab the layer_removal_cs lock.
-                    //
-                    // For now, log and continue.
-                    // warn! level is technically not appropriate for the
-                    // first case because we should expect retries to happen.
-                    // But the error is so rare, it seems better to get attention if it happens.
-                    let tenant_state = self.current_state();
-                    warn!(
-                        timeline_dir=?local_timeline_directory,
-                        ?tenant_state,
-                        "timeline directory not found, proceeding anyway"
-                    );
-                    // continue with the rest of the deletion
-                }
-                res => res.with_context(|| {
-                    format!(
-                        "Failed to remove local timeline directory '{}'",
-                        local_timeline_directory.display()
-                    )
-                })?,
-            }
-
-            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
-            drop(layer_removal_guard);
-        }
-
-        fail::fail_point!("timeline-delete-after-rm", |_| {
-            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
-        });
-
-        // Remove the timeline from the map.
-        let mut timelines = self.timelines.lock();
-        let children_exist = timelines
-            .iter()
-            .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
-        // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
-        // We already deleted the layer files, so it's probably best to panic.
-        // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
-        if children_exist {
-            panic!("Timeline grew children while we removed layer files");
-        }
-        let removed_timeline = timelines.remove(&timeline_id);
-        if removed_timeline.is_none() {
-            // This can legitimately happen if there's a concurrent call to this function.
-            //   T1                                             T2
-            //   lock
-            //   unlock
-            //                                                  lock
-            //                                                  unlock
-            //                                                  remove files
-            //                                                  lock
-            //                                                  remove from map
-            //                                                  unlock
-            //                                                  return
-            //   remove files
-            //   lock
-            //   remove from map observes empty map
-            //   unlock
-            //   return
-            debug!("concurrent call to this function won the race");
-        }
-        drop(timelines);
-
-        Ok(())
+        recv.await
    }

    pub fn current_state(&self) -> TenantState {
@@ -1578,7 +1663,7 @@ impl Tenant {

                    debug!(tenant_id = %self.tenant_id, "Activating tenant");

-                    let timelines_accessor = self.timelines.lock();
+                    let timelines_accessor = self.timelines.lock().unwrap();
                    let not_broken_timelines = timelines_accessor
                        .values()
                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
@@ -1645,7 +1730,7 @@ impl Tenant {
                    // might be created after this. That's harmless, as the Timelines
                    // won't be accessible to anyone, when the Tenant is in Stopping
                    // state.
-                    let timelines_accessor = self.timelines.lock();
+                    let timelines_accessor = self.timelines.lock().unwrap();
                    let not_broken_timelines = timelines_accessor
                        .values()
                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
@@ -1938,7 +2023,7 @@ impl Tenant {
            // activation times.
            loading_started_at: Instant::now(),
            tenant_conf: Arc::new(RwLock::new(tenant_conf)),
-            timelines: parking_lot::Mutex::new(HashMap::new()),
+            timelines: Mutex::new(HashMap::new()),
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
            remote_storage,
@@ -2166,7 +2251,7 @@ impl Tenant {
        // Scan all timelines. For each timeline, remember the timeline ID and
        // the branch point where it was created.
        let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
-            let timelines = self.timelines.lock();
+            let timelines = self.timelines.lock().unwrap();
            let mut all_branchpoints = BTreeSet::new();
            let timeline_ids = {
                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
@@ -2266,7 +2351,7 @@ impl Tenant {
        // Create a placeholder for the new branch. This will error
        // out if the new timeline ID is already in use.
        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock();
+            let timelines = self.timelines.lock().unwrap();
            self.create_timeline_uninit_mark(dst_id, &timelines)?
        };

@@ -2331,7 +2416,7 @@ impl Tenant {
            src_timeline.initdb_lsn,
            src_timeline.pg_version,
        );
-        let mut timelines = self.timelines.lock();
+        let mut timelines = self.timelines.lock().unwrap();
        let new_timeline = self
            .prepare_timeline(
                dst_id,
@@ -2368,7 +2453,7 @@ impl Tenant {
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock();
+            let timelines = self.timelines.lock().unwrap();
            self.create_timeline_uninit_mark(timeline_id, &timelines)?
        };
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
@@ -2454,7 +2539,7 @@ impl Tenant {
        // Initialize the timeline without loading the layer map, because we already updated the layer
        // map above, when we imported the datadir.
        let timeline = {
-            let mut timelines = self.timelines.lock();
+            let mut timelines = self.timelines.lock().unwrap();
            raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)?
        };

@@ -2501,7 +2586,8 @@ impl Tenant {
        ) {
            Ok(new_timeline) => {
                if init_layers {
-                    new_timeline.layers.write().next_open_layer_at = Some(new_timeline.initdb_lsn);
+                    new_timeline.layers.write().unwrap().next_open_layer_at =
+                        Some(new_timeline.initdb_lsn);
                }
                debug!(
                    "Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"
@@ -2556,7 +2642,7 @@ impl Tenant {
    fn create_timeline_uninit_mark(
        &self,
        timeline_id: TimelineId,
-        timelines: &parking_lot::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
+        timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
    ) -> anyhow::Result<TimelineUninitMark> {
        let tenant_id = self.tenant_id;

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -22,7 +22,6 @@ use pageserver_api::models::{
 };
 use std::ops::Range;
 use std::path::PathBuf;
-use std::pin::Pin;
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
@@ -336,13 +335,6 @@ impl LayerAccessStats {
    }
 }

-pub(crate) type GetValueReconstructFuture = Pin<
-    Box<
-        dyn Send
-            + std::future::Future<Output = Result<(ValueReconstructState, ValueReconstructResult)>>,
-    >,
->;
-
 /// Supertrait of the [`Layer`] trait that captures the bare minimum interface
 /// required by [`LayerMap`].
 ///
@@ -380,12 +372,12 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
    /// the predecessor layer and call again with the same 'reconstruct_data' to
    /// collect more data.
    fn get_value_reconstruct_data(
-        self: Arc<Self>,
+        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        reconstruct_data: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> GetValueReconstructFuture;
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult>;

    /// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
    fn short_id(&self) -> String;
@@ -494,12 +486,12 @@ impl Layer for LayerDescriptor {
    }

    fn get_value_reconstruct_data(
-        self: Arc<Self>,
+        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
-        _reconstruct_data: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> GetValueReconstructFuture {
+        _reconstruct_data: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
        todo!("This method shouldn't be part of the Layer trait")
    }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -46,7 +46,7 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tracing::*;

 use utils::{
@@ -56,8 +56,8 @@ use utils::{
 };

 use super::{
-    DeltaFileName, GetValueReconstructFuture, Layer, LayerAccessStats, LayerAccessStatsReset,
-    LayerFileName, LayerIter, LayerKeyIter, PathOrConf,
+    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
+    LayerKeyIter, PathOrConf,
 };

 ///
@@ -318,94 +318,89 @@ impl Layer for DeltaLayer {
    }

    fn get_value_reconstruct_data(
-        self: Arc<Self>,
+        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> GetValueReconstructFuture {
-        Box::pin(async move {
-            tokio::task::spawn_blocking(move || {
-                ensure!(lsn_range.start >= self.lsn_range.start);
-                let mut need_image = true;
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.lsn_range.start);
+        let mut need_image = true;

-                ensure!(self.key_range.contains(&key));
+        ensure!(self.key_range.contains(&key));

-                {
-                    // Open the file and lock the metadata in memory
-                    let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
+        {
+            // Open the file and lock the metadata in memory
+            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

-                    // Scan the page versions backwards, starting from `lsn`.
-                    let file = inner.file.as_ref().unwrap();
-                    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-                        inner.index_start_blk,
-                        inner.index_root_blk,
-                        file,
-                    );
-                    let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
+            // Scan the page versions backwards, starting from `lsn`.
+            let file = inner.file.as_ref().unwrap();
+            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+                inner.index_start_blk,
+                inner.index_root_blk,
+                file,
+            );
+            let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));

-                    let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+            let mut offsets: Vec<(Lsn, u64)> = Vec::new();

-                    tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
-                        let blob_ref = BlobRef(value);
-                        if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
-                            return false;
-                        }
-                        let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
-                        if entry_lsn < lsn_range.start {
-                            return false;
-                        }
-                        offsets.push((entry_lsn, blob_ref.pos()));
+            tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
+                let blob_ref = BlobRef(value);
+                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                    return false;
+                }
+                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                if entry_lsn < lsn_range.start {
+                    return false;
+                }
+                offsets.push((entry_lsn, blob_ref.pos()));

-                        !blob_ref.will_init()
-                    })?;
+                !blob_ref.will_init()
+            })?;

-                    // Ok, 'offsets' now contains the offsets of all the entries we need to read
-                    let mut cursor = file.block_cursor();
-                    let mut buf = Vec::new();
-                    for (entry_lsn, pos) in offsets {
-                        cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
-                            format!(
-                                "Failed to read blob from virtual file {}",
-                                file.file.path.display()
-                            )
-                        })?;
-                        let val = Value::des(&buf).with_context(|| {
-                            format!(
-                                "Failed to deserialize file blob from virtual file {}",
-                                file.file.path.display()
-                            )
-                        })?;
-                        match val {
-                            Value::Image(img) => {
-                                reconstruct_state.img = Some((entry_lsn, img));
-                                need_image = false;
-                                break;
-                            }
-                            Value::WalRecord(rec) => {
-                                let will_init = rec.will_init();
-                                reconstruct_state.records.push((entry_lsn, rec));
-                                if will_init {
-                                    // This WAL record initializes the page, so no need to go further back
-                                    need_image = false;
-                                    break;
-                                }
-                            }
+            // Ok, 'offsets' now contains the offsets of all the entries we need to read
+            let mut cursor = file.block_cursor();
+            let mut buf = Vec::new();
+            for (entry_lsn, pos) in offsets {
+                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
+                    format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+                let val = Value::des(&buf).with_context(|| {
+                    format!(
+                        "Failed to deserialize file blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+                match val {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((entry_lsn, img));
+                        need_image = false;
+                        break;
+                    }
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((entry_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
                        }
                    }
-                    // release metadata lock and close the file
                }
-                // If an older page image is needed to reconstruct the page, let the
-                // caller know.
-                if need_image {
-                    Ok((reconstruct_state, ValueReconstructResult::Continue))
-                } else {
-                    Ok((reconstruct_state, ValueReconstructResult::Complete))
-                }
-            })
-            .await
-            .context("spawn_blocking")?
-        })
+            }
+            // release metadata lock and close the file
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -43,7 +43,7 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::{Arc, RwLock, RwLockReadGuard};
+use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;

 use utils::{
@@ -53,7 +53,7 @@ use utils::{
 };

 use super::filename::{ImageFileName, LayerFileName};
-use super::{GetValueReconstructFuture, Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
+use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};

 ///
 /// Header stored in the beginning of the file
@@ -197,45 +197,38 @@ impl Layer for ImageLayer {

    /// Look up given page in the file
    fn get_value_reconstruct_data(
-        self: Arc<Self>,
+        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> GetValueReconstructFuture {
-        Box::pin(async move {
-            tokio::task::spawn_blocking(move || {
-                assert!(self.key_range.contains(&key));
-                assert!(lsn_range.start >= self.lsn);
-                assert!(lsn_range.end >= self.lsn);
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        assert!(self.key_range.contains(&key));
+        assert!(lsn_range.start >= self.lsn);
+        assert!(lsn_range.end >= self.lsn);

-                let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
+        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

-                let file = inner.file.as_ref().unwrap();
-                let tree_reader =
-                    DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
+        let file = inner.file.as_ref().unwrap();
+        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);

-                let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
-                key.write_to_byte_slice(&mut keybuf);
-                if let Some(offset) = tree_reader.get(&keybuf)? {
-                    let blob = file.block_cursor().read_blob(offset).with_context(|| {
-                        format!(
-                            "failed to read value from data file {} at offset {}",
-                            self.path().display(),
-                            offset
-                        )
-                    })?;
-                    let value = Bytes::from(blob);
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        if let Some(offset) = tree_reader.get(&keybuf)? {
+            let blob = file.block_cursor().read_blob(offset).with_context(|| {
+                format!(
+                    "failed to read value from data file {} at offset {}",
+                    self.path().display(),
+                    offset
+                )
+            })?;
+            let value = Bytes::from(blob);

-                    reconstruct_state.img = Some((self.lsn, value));
-                    Ok((reconstruct_state, ValueReconstructResult::Complete))
-                } else {
-                    Ok((reconstruct_state, ValueReconstructResult::Missing))
-                }
-            })
-            .await
-            .context("spawn_blocking")?
-        })
+            reconstruct_state.img = Some((self.lsn, value));
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing)
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -12,7 +12,7 @@ use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
 use crate::walrecord;
-use anyhow::{ensure, Context, Result};
+use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::cell::RefCell;
 use std::collections::HashMap;
@@ -27,9 +27,9 @@ use utils::{
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use std::sync::{Arc, RwLock};
+use std::sync::RwLock;

-use super::{DeltaLayer, DeltaLayerWriter, GetValueReconstructFuture, Layer};
+use super::{DeltaLayer, DeltaLayerWriter, Layer};

 thread_local! {
    /// A buffer for serializing object during [`InMemoryLayer::put_value`].
@@ -191,60 +191,52 @@ impl Layer for InMemoryLayer {

    /// Look up given value in the layer.
    fn get_value_reconstruct_data(
-        self: Arc<Self>,
+        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> GetValueReconstructFuture {
-        Box::pin(async move {
-            // The in-memory layer isn't actually in-memory. It uses EphemeralFile.
-            // So, this does do IO.
-            tokio::task::spawn_blocking(move || {
-                ensure!(lsn_range.start >= self.start_lsn);
-                let mut need_image = true;
+        reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.start_lsn);
+        let mut need_image = true;

-                let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().unwrap();

-                let mut reader = inner.file.block_cursor();
+        let mut reader = inner.file.block_cursor();

-                // Scan the page versions backwards, starting from `lsn`.
-                if let Some(vec_map) = inner.index.get(&key) {
-                    let slice = vec_map.slice_range(lsn_range);
-                    for (entry_lsn, pos) in slice.iter().rev() {
-                        let buf = reader.read_blob(*pos)?;
-                        let value = Value::des(&buf)?;
-                        match value {
-                            Value::Image(img) => {
-                                reconstruct_state.img = Some((*entry_lsn, img));
-                                return Ok((reconstruct_state, ValueReconstructResult::Complete));
-                            }
-                            Value::WalRecord(rec) => {
-                                let will_init = rec.will_init();
-                                reconstruct_state.records.push((*entry_lsn, rec));
-                                if will_init {
-                                    // This WAL record initializes the page, so no need to go further back
-                                    need_image = false;
-                                    break;
-                                }
-                            }
+        // Scan the page versions backwards, starting from `lsn`.
+        if let Some(vec_map) = inner.index.get(&key) {
+            let slice = vec_map.slice_range(lsn_range);
+            for (entry_lsn, pos) in slice.iter().rev() {
+                let buf = reader.read_blob(*pos)?;
+                let value = Value::des(&buf)?;
+                match value {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((*entry_lsn, img));
+                        return Ok(ValueReconstructResult::Complete);
+                    }
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((*entry_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
                        }
                    }
                }
+            }
+        }

-                // release lock on 'inner'
+        // release lock on 'inner'

-                // If an older page image is needed to reconstruct the page, let the
-                // caller know.
-                if need_image {
-                    Ok((reconstruct_state, ValueReconstructResult::Continue))
-                } else {
-                    Ok((reconstruct_state, ValueReconstructResult::Complete))
-                }
-            })
-            .await
-            .context("spawn_blocking")?
-        })
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -6,7 +6,7 @@ use crate::context::RequestContext;
 use crate::repository::Key;
 use crate::tenant::layer_map::BatchedUpdates;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructState};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use anyhow::{bail, Result};
 use pageserver_api::models::HistoricLayerInfo;
 use std::ops::Range;
@@ -21,8 +21,8 @@ use utils::{
 use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
 use super::image_layer::ImageLayer;
 use super::{
-    DeltaLayer, GetValueReconstructFuture, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, LayerResidenceStatus, PersistentLayer,
+    DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
+    LayerResidenceStatus, PersistentLayer,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -83,18 +83,16 @@ impl Layer for RemoteLayer {
    }

    fn get_value_reconstruct_data(
-        self: Arc<Self>,
+        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
-        _reconstruct_state: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> GetValueReconstructFuture {
-        Box::pin(async move {
-            bail!(
-                "layer {} needs to be downloaded",
-                self.filename().file_name()
-            );
-        })
+        _reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
+        bail!(
+            "layer {} needs to be downloaded",
+            self.filename().file_name()
+        );
    }

    fn is_incremental(&self) -> bool {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -121,7 +121,7 @@ pub struct Timeline {

    pub pg_version: u32,

-    pub(super) layers: parking_lot::RwLock<LayerMap<dyn PersistentLayer>>,
+    pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,

    last_freeze_at: AtomicLsn,
    // Atomic would be more appropriate here.
@@ -227,6 +227,9 @@ pub struct Timeline {
    state: watch::Sender<TimelineState>,

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
+
+    pub(super) delete_self:
+        utils::shared_retryable::SharedRetryable<Result<(), super::InnerDeleteTimelineError>>,
 }

 /// Internal structure to hold all data needed for logical size calculation.
@@ -504,13 +507,12 @@ impl Timeline {
            None => None,
        };

-        let reconstruct_state = ValueReconstructState {
+        let mut reconstruct_state = ValueReconstructState {
            records: Vec::new(),
            img: cached_page_img,
        };

-        let reconstruct_state = self
-            .get_reconstruct_data(key, lsn, reconstruct_state, ctx)
+        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;

        self.metrics
@@ -548,7 +550,7 @@ impl Timeline {
    /// This method makes no distinction between local and remote layers.
    /// Hence, the result **does not represent local filesystem usage**.
    pub fn layer_size_sum(&self) -> u64 {
-        let layer_map = self.layers.read();
+        let layer_map = self.layers.read().unwrap();
        let mut size = 0;
        for l in layer_map.iter_historic_layers() {
            size += l.file_size();
@@ -848,7 +850,7 @@ impl Timeline {
    /// safekeepers to regard pageserver as caught up and suspend activity.
    pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
        let last_lsn = self.get_last_record_lsn();
-        let layers = self.layers.read();
+        let layers = self.layers.read().unwrap();
        if let Some(open_layer) = &layers.open_layer {
            let open_layer_size = open_layer.size()?;
            drop(layers);
@@ -928,7 +930,7 @@ impl Timeline {
    }

    pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
-        let layer_map = self.layers.read();
+        let layer_map = self.layers.read().unwrap();
        let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
        if let Some(open_layer) = &layer_map.open_layer {
            in_memory_layers.push(open_layer.info());
@@ -1048,7 +1050,7 @@ impl Timeline {
        }

        // start the batch update
-        let mut layer_map = self.layers.write();
+        let mut layer_map = self.layers.write().unwrap();
        let mut batch_updates = layer_map.batch_update();

        let mut results = Vec::with_capacity(layers_to_evict.len());
@@ -1312,7 +1314,7 @@ impl Timeline {
                timeline_id,
                tenant_id,
                pg_version,
-                layers: parking_lot::RwLock::new(LayerMap::default()),
+                layers: RwLock::new(LayerMap::default()),

                walredo_mgr,
                walreceiver,
@@ -1380,6 +1382,8 @@ impl Timeline {
                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
+
+                delete_self: utils::shared_retryable::SharedRetryable::default(),
            };
            result.repartition_threshold = result.get_checkpoint_distance() / 10;
            result
@@ -1453,7 +1457,7 @@ impl Timeline {
    /// Returns all timeline-related files that were found and loaded.
    ///
    pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
-        let mut layers = self.layers.write();
+        let mut layers = self.layers.write().unwrap();
        let mut updates = layers.batch_update();
        let mut num_layers = 0;

@@ -1582,7 +1586,7 @@ impl Timeline {

        // We're holding a layer map lock for a while but this
        // method is only called during init so it's fine.
-        let mut layer_map = self.layers.write();
+        let mut layer_map = self.layers.write().unwrap();
        let mut updates = layer_map.batch_update();
        for remote_layer_name in &index_part.timeline_layers {
            let local_layer = local_only_layers.remove(remote_layer_name);
@@ -1735,6 +1739,7 @@ impl Timeline {
        let local_layers = self
            .layers
            .read()
+            .unwrap()
            .iter_historic_layers()
            .map(|l| (l.filename(), l))
            .collect::<HashMap<_, _>>();
@@ -2060,7 +2065,7 @@ impl Timeline {
    }

    fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
-        for historic_layer in self.layers.read().iter_historic_layers() {
+        for historic_layer in self.layers.read().unwrap().iter_historic_layers() {
            let historic_layer_name = historic_layer.filename().file_name();
            if layer_file_name == historic_layer_name {
                return Some(historic_layer);
@@ -2143,31 +2148,13 @@ impl Timeline {
    ///
    /// This function takes the current timeline's locked LayerMap as an argument,
    /// so callers can avoid potential race conditions.
-    ///
-    // TODO: find a way to not hold the Timeline::layers lock during get_value_reconstruct_data calls.
-    //
-    // Since these calls do local disk IO, they'll be reasonably fast, until come disk IOPS bound.
-    // We have lots of headroom on current pageservers, so, it's going to be fine for now.
-    //
-    // We can't use tokio::sync::RwLock that easily because its guard is not Send, but,
-    // many tasks that access Timeline::layers run inside task_mgr tasks, which are required
-    // to be Send. It has been tried in origin/problame/asyncify-get-reconstruct-data--tokio-sync.
-    //
-    // The solution will probably be to have an immutable + multi-versioned layer map, allowing
-    // us to grab a snapshot of the layer map once and execute this function on the snapshot.
-    //
-    // Or, we could invest time to figure out whether we can drop the layer map lock after
-    // we grabbed the layer, do the IO, re-aquire, and continue the traversal.
-    //
-    // (Why is this allow() not inside the function? Because clippy doesn't respect it then).
-    #[allow(clippy::await_holding_lock)]
    async fn get_reconstruct_data(
        &self,
        key: Key,
        request_lsn: Lsn,
-        mut reconstruct_state: ValueReconstructState,
+        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
-    ) -> Result<ValueReconstructState, PageReconstructError> {
+    ) -> Result<(), PageReconstructError> {
        // Start from the current timeline.
        let mut timeline_owned;
        let mut timeline = self;
@@ -2194,12 +2181,12 @@ impl Timeline {
            // The function should have updated 'state'
            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
            match result {
-                ValueReconstructResult::Complete => return Ok(reconstruct_state),
+                ValueReconstructResult::Complete => return Ok(()),
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
                        self.metrics.materialized_page_cache_hit_counter.inc_by(1);
-                        return Ok(reconstruct_state);
+                        return Ok(());
                    }
                    if prev_lsn <= cont_lsn {
                        // Didn't make any progress in last iteration. Error out to avoid
@@ -2245,7 +2232,7 @@ impl Timeline {
            #[allow(clippy::never_loop)] // see comment at bottom of this loop
            'layer_map_search: loop {
                let remote_layer = {
-                    let layers = timeline.layers.read();
+                    let layers = timeline.layers.read().unwrap();

                    // Check the open and frozen in-memory layers first, in order from newest
                    // to oldest.
@@ -2256,19 +2243,13 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match Arc::clone(open_layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match open_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -2288,19 +2269,13 @@ impl Timeline {
                        if cont_lsn > start_lsn {
                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match Arc::clone(frozen_layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match frozen_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -2328,19 +2303,13 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                            result = match Arc::clone(&layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -2443,7 +2412,7 @@ impl Timeline {
    /// Get a handle to the latest layer for appending.
    ///
    fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
-        let mut layers = self.layers.write();
+        let mut layers = self.layers.write().unwrap();

        ensure!(lsn.is_aligned());

@@ -2516,7 +2485,7 @@ impl Timeline {
        } else {
            Some(self.write_lock.lock().unwrap())
        };
-        let mut layers = self.layers.write();
+        let mut layers = self.layers.write().unwrap();
        if let Some(open_layer) = &layers.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
            // Does this layer need freezing?
@@ -2554,7 +2523,7 @@ impl Timeline {
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
                let layer_to_flush = {
-                    let layers = self.layers.read();
+                    let layers = self.layers.read().unwrap();
                    layers.frozen_layers.front().cloned()
                    // drop 'layers' lock to allow concurrent reads and writes
                };
@@ -2655,7 +2624,7 @@ impl Timeline {
        // The new on-disk layers are now in the layer map. We can remove the
        // in-memory layer from the map now.
        {
-            let mut layers = self.layers.write();
+            let mut layers = self.layers.write().unwrap();
            let l = layers.frozen_layers.pop_front();

            // Only one thread may call this function at a time (for this
@@ -2773,7 +2742,7 @@ impl Timeline {

        // Add it to the layer map
        let l = Arc::new(new_delta);
-        let mut layers = self.layers.write();
+        let mut layers = self.layers.write().unwrap();
        let mut batch_updates = layers.batch_update();
        l.access_stats().record_residence_event(
            &batch_updates,
@@ -2828,7 +2797,7 @@ impl Timeline {
    fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
        let threshold = self.get_image_creation_threshold();

-        let layers = self.layers.read();
+        let layers = self.layers.read().unwrap();

        let mut max_deltas = 0;

@@ -2971,7 +2940,7 @@ impl Timeline {

        let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());

-        let mut layers = self.layers.write();
+        let mut layers = self.layers.write().unwrap();
        let mut updates = layers.batch_update();
        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
        for l in image_layers {
@@ -3038,7 +3007,7 @@ impl Timeline {
        target_file_size: u64,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
-        let layers = self.layers.read();
+        let layers = self.layers.read().unwrap();
        let mut level0_deltas = layers.get_level0_deltas()?;
        drop(layers);

@@ -3161,7 +3130,7 @@ impl Timeline {
        // Determine N largest holes where N is number of compacted layers.
        let max_holes = deltas_to_compact.len();
        let last_record_lsn = self.get_last_record_lsn();
-        let layers = self.layers.read(); // Is'n it better to hold original layers lock till here?
+        let layers = self.layers.read().unwrap(); // Is'n it better to hold original layers lock till here?
        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
        let min_hole_coverage_size = 3; // TODO: something more flexible?

@@ -3398,7 +3367,7 @@ impl Timeline {
                .context("wait for layer upload ops to complete")?;
        }

-        let mut layers = self.layers.write();
+        let mut layers = self.layers.write().unwrap();
        let mut updates = layers.batch_update();
        let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
        for l in new_layers {
@@ -3657,7 +3626,7 @@ impl Timeline {
        // 4. newer on-disk image layers cover the layer's whole key range
        //
        // TODO holding a write lock is too agressive and avoidable
-        let mut layers = self.layers.write();
+        let mut layers = self.layers.write().unwrap();
        'outer: for l in layers.iter_historic_layers() {
            result.layers_total += 1;

@@ -3940,7 +3909,7 @@ impl Timeline {

                    // Download complete. Replace the RemoteLayer with the corresponding
                    // Delta- or ImageLayer in the layer map.
-                    let mut layers = self_clone.layers.write();
+                    let mut layers = self_clone.layers.write().unwrap();
                    let mut updates = layers.batch_update();
                    let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
                    {
@@ -4098,7 +4067,7 @@ impl Timeline {
    ) {
        let mut downloads = Vec::new();
        {
-            let layers = self.layers.read();
+            let layers = self.layers.read().unwrap();
            layers
                .iter_historic_layers()
                .filter_map(|l| l.downcast_remote_layer())
@@ -4201,7 +4170,7 @@ impl LocalLayerInfoForDiskUsageEviction {

 impl Timeline {
    pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
-        let layers = self.layers.read();
+        let layers = self.layers.read().unwrap();

        let mut max_layer_size: Option<u64> = None;
        let mut resident_layers = Vec::new();
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -178,7 +178,7 @@ impl Timeline {
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
        let candidates: Vec<Arc<dyn PersistentLayer>> = {
-            let layers = self.layers.read();
+            let layers = self.layers.read().unwrap();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                if hist_layer.is_remote_layer() {
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -346,20 +346,31 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
    ps_http.configure_failpoints((failpoint_name, "pause"))

-    def first_call(result_queue):
+    def delete_timeline_call(name, result_queue, barrier):
+        if barrier:
+            barrier.wait()
        try:
-            log.info("first call start")
+            log.info(f"{name} call start")
            ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10)
-            log.info("first call success")
+            log.info(f"{name} call success")
            result_queue.put("success")
        except Exception:
-            log.exception("first call failed")
+            log.exception(f"{name} call failed")
            result_queue.put("failure, see log for stack trace")

-    first_call_result: queue.Queue[str] = queue.Queue()
-    first_call_thread = threading.Thread(target=first_call, args=(first_call_result,))
+    delete_results: queue.Queue[str] = queue.Queue()
+    first_call_thread = threading.Thread(
+        target=delete_timeline_call,
+        args=(
+            "1st",
+            delete_results,
+            None,
+        ),
+    )
    first_call_thread.start()

+    second_call_thread = None
+
    try:

        def first_call_hit_failpoint():
@@ -369,30 +380,42 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(

        wait_until(50, 0.1, first_call_hit_failpoint)

-        # make the second call and assert behavior
-        log.info("second call start")
-        error_msg_re = "another task is already setting the deleted_flag, started at"
-        with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
-            ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
-        assert second_call_err.value.status_code == 500
-        env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*")
-        # the second call will try to transition the timeline into Stopping state as well
+        barrier = threading.Barrier(2)
+        second_call_thread = threading.Thread(
+            target=delete_timeline_call,
+            args=(
+                "2nd",
+                delete_results,
+                barrier,
+            ),
+        )
+        second_call_thread.start()
+
+        barrier.wait()
+
+        # release the pause
+        ps_http.configure_failpoints((failpoint_name, "off"))
+
+        # both should had succeeded: the second call will coalesce with the already-ongoing first call
+        result = delete_results.get()
+        assert result == "success"
+        result = delete_results.get()
+        assert result == "success"
+
+        # the second call will try to transition the timeline into Stopping state, but it's already in that state
+        # (the transition to Stopping state is not part of the request coalescing, because Tenant and Timeline states are a mess already)
        env.pageserver.allowed_errors.append(
            f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
        )
-        log.info("second call failed as expected")
-
-        # by now we know that the second call failed, let's ensure the first call will finish
-        ps_http.configure_failpoints((failpoint_name, "off"))
-
-        result = first_call_result.get()
-        assert result == "success"
-
    finally:
-        log.info("joining first call thread")
+        log.info("joining 1st thread")
        # in any case, make sure the lifetime of the thread is bounded to this test
        first_call_thread.join()

+        if second_call_thread:
+            log.info("joining 2nd thread")
+            second_call_thread.join()
+

 def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
    """
@@ -440,9 +463,16 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
    # ok, retry without failpoint, it should succeed
    ps_http.configure_failpoints((failpoint_name, "off"))

-    # this should succeed
-    ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
-    # the second call will try to transition the timeline into Stopping state, but it's already in that state
-    env.pageserver.allowed_errors.append(
-        f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
-    )
+    try:
+        ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
+        env.pageserver.allowed_errors.append(
+            f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+        )
+    except PageserverApiException as e:
+        if e.status_code != 404:
+            raise e
+        else:
+            # mock_s3 was fast enough to delete before we got the request in
+            env.pageserver.allowed_errors.append(
+                f".*{child_timeline_id}.*Error processing HTTP request: NotFound: timeline not found"
+            )
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -36,7 +36,6 @@ nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128"] }
-parking_lot = { version = "0.12", features = ["send_guard"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
Author	SHA1	Message	Date
Dmitry Ivanov	389dfb6809	Simplify low-hanging fruit	2023-05-16 18:00:11 +03:00
Dmitry Ivanov	b2bdd1b117	Changes	2023-05-16 17:09:33 +03:00
Joonas Koivunen	0b42ba9615	refactor: log errors with {:#} even though it's a std::io::Error Co-authored-by: Christian Schwarz <christian@neon.tech>	2023-05-11 20:01:24 +03:00
Joonas Koivunen	6d3be4cf7d	doc: review suggestions Co-authored-by: Christian Schwarz <christian@neon.tech>	2023-05-11 20:00:33 +03:00
Joonas Koivunen	7605aed5aa	fixup: refactor: stop using todo! for a panic!	2023-05-10 18:21:19 +03:00
Joonas Koivunen	e13ae811bf	fixup: move doc test as test case and hide _spawn variants	2023-05-10 17:55:45 +03:00
Joonas Koivunen	36e86e370b	fixup: nicer ascii art	2023-05-10 17:55:05 +03:00
Joonas Koivunen	8e51d5d8f3	doc: remove FIXME simplify uploading there's no need to simplify the uploading; the uploading is not on within this one future which will be executed so it's not safe to just remove it or revert it back. also I am hesitant to add new changes at this point, the diff is large enough.	2023-05-10 17:35:26 +03:00
Joonas Koivunen	72131e15ff	test: less racy test_delete_timeline_client_hangup with the SharedRetried the first request creates a task which will continue to the pause point regardless of the first request, so we need to accept a 404 as success.	2023-05-10 17:31:19 +03:00
Joonas Koivunen	0495043d30	test: fix test_concurrent_timeline_delete_if_first_stuck_at_index_upload manually cherry-picked `245a2a0592`	2023-05-10 17:30:51 +03:00
Joonas Koivunen	c87a742bbd	fixup: one missed logging opportunity	2023-05-10 16:53:14 +03:00
Joonas Koivunen	c7c514aae4	doc: more cleanup	2023-05-10 16:53:03 +03:00
Joonas Koivunen	f7b0beb1cf	refactor: simplify, racy removes no longer intended	2023-05-10 16:27:08 +03:00
Joonas Koivunen	b5b0a2e872	fixup: doc: align strong usage, fix broken references	2023-05-10 16:25:43 +03:00
Joonas Koivunen	cd787070f6	fixup: doc: explain convoluted return type	2023-05-10 16:25:07 +03:00
Joonas Koivunen	05d4a85c2f	fixup: doc: many futures instead of tasks as in, you could run many of these futures with tokio::select within one task or `runtime.block_on`; it does not matter.	2023-05-10 16:24:18 +03:00
Joonas Koivunen	9a35ae40b5	fixup: clippy	2023-05-10 15:54:14 +03:00
Joonas Koivunen	3d33ea465d	fix: coalesce requests to delete_timeline	2023-05-10 15:46:55 +03:00
Joonas Koivunen	fd50c95b9d	feat: utils::shared_retryable::SharedRetryable documented in the module.	2023-05-10 15:08:42 +03:00