WIP: fix: logical size limit is broken during PS restart

fixes https://github.com/neondatabase/neon/issues/5963

On top of https://github.com/neondatabase/neon/pull/6000

Will ship this in a release after #6000
This commit is contained in:
Christian Schwarz
2023-12-01 11:52:35 +00:00
parent ef1848f002
commit 57a7b2e8e3
3 changed files with 17 additions and 51 deletions

View File

@@ -1738,34 +1738,18 @@ impl Timeline {
) -> logical_size::CurrentLogicalSize {
let current_size = self.current_logical_size.current_size();
debug!("Current size: {current_size:?}");
match (current_size.accuracy(), priority) {
(logical_size::Accuracy::Exact, _) => (), // nothing to do
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
// background task will eventually deliver an exact value, we're in no rush
}
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
// background task is not ready, but user is asking for it now;
// => make the background task skip the line
// (The alternative would be to calculate the size here, but,
// it can actually take a long time if the user has a lot of rels.
// And we'll inevitable need it again; So, let the background task do the work.)
match self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
.get()
{
Some(cancel) => cancel.cancel(),
None => {
warn!("unexpected: priority_tx not set, logical size calculation will not be prioritized");
}
};
}
}
current_size
}
// Returns the exact current logical size; if it's not already computed, it computes it _now_
// (unlike the priority-based path above, which only nudges the background task).
//
// NOTE(review): WIP draft — `TimelineCancelled | CalculationError` is not valid Rust
// (there is no anonymous union in type position); this needs a dedicated error enum
// (or a boxed error) before it compiles — TODO confirm intended error type.
pub(crate) async fn get_current_logical_size_wait_exact(
self: &Arc<Self>,
) -> Result<logical_size::Exact, TimelineCancelled | CalculationError> {
// NOTE(review): `tokio::sync::OnceCell::get_or_try_init` returns a future that is
// never `.await`ed here, and the async block body is still a stub — presumably the
// real size calculation will be inlined; verify before landing.
self.current_logical_size.initial_logical_size.get_or_try_init(async {
// do calculation here
})
}
fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
// nothing to do for freshly created timelines;
@@ -1832,31 +1816,9 @@ impl Timeline {
&cancel,
);
use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, circumstances) = tokio::select! {
res = wait_for_permit => {
match res {
Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
Err(RateLimitError::Cancelled) => {
return Err(BackgroundCalculationError::Cancelled);
}
}
}
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of a end user interaction requested logical size
// => break out of the rate limit
// TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
// but then again what happens if they cancel; also, we should just be using
// one runtime across the entire process, so, let's leave this for now.
(None, StartCircumstances::SkippedConcurrencyLimiter)
}
};
let metrics_guard = if attempt == 1 {
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
} else {
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
};
self.current_logical_size.initial_logical_size.get_or_init(async {
// do calcualtion here
});
match self_ref
.logical_size_calculation_task(

View File

@@ -22,7 +22,7 @@ pub(super) struct LogicalSize {
///
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
/// the initial size at a different LSN.
pub initial_logical_size: OnceCell<(
pub initial_logical_size: tokio::sync::OnceCell<(
u64,
crate::metrics::initial_logical_size::FinishedCalculationGuard,
)>,

View File

@@ -206,6 +206,10 @@ pub(super) async fn connection_manager_loop_step(
if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
info!("Switching to new connection candidate: {new_candidate:?}");
tokio::select! {
logical_size = connection_manager_state.timeline.get_current_logical_size_wait_exact().await,
_ = connection_manager.should_shutdown(),
}
connection_manager_state
.change_connection(new_candidate, ctx)
.await