From ed9a114bde38b971f49dd12b53163587477fdcc4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Fri, 3 May 2024 14:57:26 +0300
Subject: [PATCH] fix: find gc cutoff points without holding Tenant::gc_cs
 (#7585)

The timeline gc cutoff Lsn(s) are currently found while holding
`Tenant::gc_cs`. In recent incidents, long branch creation times were
caused by holding `Tenant::gc_cs` across extremely long
`Timeline::find_lsn_by_timestamp` calls.

The fix is to find the GC cutoff values before taking the
`Tenant::gc_cs` lock. This is safe because the GC cutoff values and the
branch points have no dependencies on each other. With this change, even
if `Timeline::find_gc_cutoffs` takes a long time, it should no longer
block branch creation via `Tenant::gc_cs`.

Additionally, `Tenant::refresh_gc_info` is now tolerant of timeline
deletions (or any other failure to find the pitr_cutoff). This lets the
synthetic size calculation complete consistently instead of failing
whenever a timeline deletion happens at an inopportune moment.

Fixes: #7560
Fixes: #7587
---
 pageserver/src/tenant.rs                | 75 +++++++++++++++++++++----
 pageserver/src/tenant/size.rs           |  5 +-
 pageserver/src/tenant/timeline.rs       |  2 +
 test_runner/regress/test_branching.py   | 24 ++++++++
 test_runner/regress/test_tenant_size.py | 67 +++++++++++++++++++++-
 5 files changed, 157 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index a6cd1471ff..8fa484e7b2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -62,9 +62,9 @@ use self::timeline::uninit::TimelineCreateGuard;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
-use self::timeline::GcInfo;
 use self::timeline::TimelineResources;
 use self::timeline::WaitLsnError;
+use self::timeline::{GcCutoffs, GcInfo};
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -2812,7 +2812,48 @@ impl Tenant {
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Vec<Arc<Timeline>>> {
-        // grab mutex to prevent new timelines from being created here.
+        // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
+        // currently visible timelines.
+        let timelines = self
+            .timelines
+            .lock()
+            .unwrap()
+            .values()
+            .filter(|tl| match target_timeline_id.as_ref() {
+                Some(target) => &tl.timeline_id == target,
+                None => true,
+            })
+            .cloned()
+            .collect::<Vec<_>>();
+
+        let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
+            HashMap::with_capacity(timelines.len());
+
+        for timeline in timelines.iter() {
+            let cutoff = timeline
+                .get_last_record_lsn()
+                .checked_sub(horizon)
+                .unwrap_or(Lsn(0));
+
+            let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
+
+            match res {
+                Ok(cutoffs) => {
+                    let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
+                    assert!(old.is_none());
+                }
+                Err(e) => {
+                    tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
+                }
+            }
+        }
+
+        if !self.is_active() {
+            anyhow::bail!("shutting down");
+        }
+
+        // grab mutex to prevent new timelines from being created here; avoid doing long operations
+        // because that will stall branch creation.
         let gc_cs = self.gc_cs.lock().await;
 
         // Scan all timelines. For each timeline, remember the timeline ID and
@@ -2874,11 +2915,6 @@ impl Tenant {
                 }
             }
 
-            let cutoff = timeline
-                .get_last_record_lsn()
-                .checked_sub(horizon)
-                .unwrap_or(Lsn(0));
-
             let branchpoints: Vec<Lsn> = all_branchpoints
                 .range((
                     Included((timeline_id, Lsn(0))),
@@ -2886,12 +2922,27 @@ impl Tenant {
                 ))
                 .map(|&x| x.1)
                 .collect();
-            let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
-            *timeline.gc_info.write().unwrap() = GcInfo {
-                retain_lsns: branchpoints,
-                cutoffs,
-            };
+            {
+                let mut target = timeline.gc_info.write().unwrap();
+
+                match gc_cutoffs.remove(&timeline_id) {
+                    Some(cutoffs) => {
+                        *target = GcInfo {
+                            retain_lsns: branchpoints,
+                            cutoffs,
+                        };
+                    }
+                    None => {
+                        // reasons for this being unavailable:
+                        // - this timeline was created while we were finding cutoffs
+                        // - lsn for timestamp search fails for this timeline repeatedly
+                        //
+                        // in both cases, refreshing the branchpoints is correct.
+                        target.retain_lsns = branchpoints;
+                    }
+                };
+            }
 
             gc_timelines.push(timeline);
         }
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 974c1091fd..64fff5536c 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -118,9 +118,6 @@ pub(super) async fn gather_inputs(
     ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
     // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
-    //
-    // FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the
-    // whole computation. It does not make sense from the billing perspective.
     tenant
         .refresh_gc_info(cancel, ctx)
         .await
@@ -221,6 +218,8 @@ pub(super) async fn gather_inputs(
             .map(|lsn| (lsn, LsnKind::BranchPoint))
             .collect::<Vec<_>>();
 
+        drop(gc_info);
+
         // Add branch points we collected earlier, just in case there were any that were
         // not present in retain_lsns. We will remove any duplicates below later.
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7aeb3a6a59..19228bc1f1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4434,6 +4434,8 @@ impl Timeline {
             .start_timer()
             .record_on_drop();
 
+        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
+
         // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
         //
         // Some unit tests depend on garbage-collection working even when
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 5b69649007..9fe9f77fea 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -1,6 +1,7 @@
 import random
 import threading
 import time
+from concurrent.futures import ThreadPoolExecutor
 from typing import List
 
 import pytest
@@ -405,6 +406,29 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
     assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1
 
 
+def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    client = env.pageserver.http_client()
+
+    failpoint = "Timeline::find_gc_cutoffs-pausable"
+
+    client.configure_failpoints((failpoint, "pause"))
+
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None)
+
+        wait_until_paused(env, failpoint)
+
+        env.neon_cli.create_branch(
+            tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch"
+        )
+
+        client.configure_failpoints((failpoint, "off"))
+
+        completion.result()
+
+
 def wait_until_paused(env: NeonEnv, failpoint: str):
     found = False
     msg = f"at failpoint {failpoint}"
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index a588f6ab53..53da548524 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -1,4 +1,5 @@
 import os
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import List, Tuple
 
@@ -11,13 +12,15 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
     wait_for_wal_insert_lsn,
 )
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
+    tenant_delete_wait_completed,
     timeline_delete_wait_completed,
     wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until
 
 
 def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder):
@@ -616,6 +619,68 @@ def test_get_tenant_size_with_multiple_branches(
     size_debug_file.write(size_debug)
 
 
+def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
+    """
+    Makes sure synthetic size can still be calculated even if one of the
+    timelines is deleted or the tenant is deleted.
+ """ + + env = neon_env_builder.init_start() + failpoint = "Timeline::find_gc_cutoffs-pausable" + client = env.pageserver.http_client() + + orig_size = client.tenant_size(env.initial_tenant) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + _, last_offset = wait_until( + 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + timeline_delete_wait_completed(client, env.initial_tenant, branch_id) + + client.configure_failpoints((failpoint, "off")) + size = completion.result() + + assert_size_approx_equal(orig_size, size) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + wait_until( + 10, + 1.0, + lambda: env.pageserver.assert_log_contains( + f"at failpoint {failpoint}", offset=last_offset + ), + ) + + tenant_delete_wait_completed(client, env.initial_tenant, 10) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="Failed to refresh gc_info before gathering inputs" + ): + completion.result() + + # this happens on both cases + env.pageserver.allowed_errors.append( + ".*ignoring failure to find gc cutoffs: timeline shutting down.*" + ) + # this happens only in the case of deletion (http response logging) + env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*") + + # Helper for tests that compare timeline_inputs # We don't want to compare the exact values, because they can be unstable # and cause flaky tests. So replace the values with useful invariants.