initial logical size calculation: always poll to completion (#10471)

# Refs

- extracted from https://github.com/neondatabase/neon/pull/9353

# Problem

Before this PR, when task_mgr shutdown is signalled, e.g. during pageserver shutdown or Tenant shutdown, initial logical size calculation stops polling and drops the future that represents the calculation.

This is against the current policy that we poll all futures to completion.

This became apparent during development of concurrent IO, which warns if we drop a `Timeline::get_vectored` future that still has in-flight IOs.

We may revise the policy in the future, but right now initial logical size calculation is the only part of the codebase that doesn't adhere to the policy, so let's fix it.

## Code Changes

- make initial logical size calculation sensitive exclusively to `Timeline::cancel`
  - This should be sufficient for all cases of shutdowns; the sensitivity to task_mgr shutdown is unnecessary.
- this broke the various cancel tests in `test_timeline_size.py`, e.g., `test_timeline_initial_logical_size_calculation_cancellation`
  - the tests would time out because the await point was not sensitive to cancellation
  - to fix this, refactor `pausable_failpoint` so that it accepts a cancellation token
    - side note: we _really_ should write our own failpoint library; maybe after we get heap-allocated RequestContext, we can plumb failpoints through there.
2026-01-06 04:52:55 +00:00 · 2025-01-22 13:28:26 +01:00
parent b4d87b9dfe
commit b31ce14083
3 changed files with 66 additions and 62 deletions
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -11,31 +11,55 @@ use tracing::*;

 /// Declare a failpoint that can use the `pause` failpoint action.
 /// We don't want to block the executor thread, hence, spawn_blocking + await.
+///
+/// Optionally pass a cancellation token, and this failpoint will drop out of
+/// its pause when the cancellation token fires. This is useful for testing
+/// cases where we would like to block something, but test its clean shutdown behavior.
+/// The macro evaluates to a Result in that case, where Ok(()) is the case
+/// where the failpoint was not paused (or was paused and has since been
+/// released), and Err(()) is the case where the cancellation token fired
+/// while the failpoint was still being evaluated.
+///
+/// Remember to unpause the failpoint in the test; until that happens, one of the
+/// limited number of spawn_blocking thread pool threads is leaked.
 #[macro_export]
 macro_rules! pausable_failpoint {
-    ($name:literal) => {
+    ($name:literal) => {{
        if cfg!(feature = "testing") {
-            tokio::task::spawn_blocking({
-                let current = tracing::Span::current();
+            let cancel = ::tokio_util::sync::CancellationToken::new();
+            let _ = $crate::pausable_failpoint!($name, &cancel);
+        }
+    }};
+    ($name:literal, $cancel:expr) => {{
+        if cfg!(feature = "testing") {
+            let failpoint_fut = ::tokio::task::spawn_blocking({
+                let current = ::tracing::Span::current();
                move || {
                    let _entered = current.entered();
-                    tracing::info!("at failpoint {}", $name);
-                    fail::fail_point!($name);
+                    ::tracing::info!("at failpoint {}", $name);
+                    ::fail::fail_point!($name);
+                }
+            });
+            let cancel_fut = async move {
+                $cancel.cancelled().await;
+            };
+            ::tokio::select! {
+                res = failpoint_fut => {
+                    res.expect("spawn_blocking");
+                    // continue with execution
+                    Ok(())
+                },
+                _ = cancel_fut => {
+                    Err(())
                }
-            })
-            .await
-            .expect("spawn_blocking");
-        }
-    };
-    ($name:literal, $cond:expr) => {
-        if cfg!(feature = "testing") {
-            if $cond {
-                pausable_failpoint!($name)
            }
+        } else {
+            Ok(())
        }
-    };
+    }};
 }

+pub use pausable_failpoint;
+
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the