feat(storcon): forward gc blocking and unblocking (#8956)

Currently, using gc blocking and unblocking with storage-controller-managed pageservers is painful. Implement the API on the storage controller. Fixes: #8893
2026-01-08 14:02:55 +00:00 · 2024-09-07 00:42:55 +03:00
parent fa3fc73c1b
commit 3dbd34aa78
6 changed files with 220 additions and 18 deletions
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -1,17 +1,32 @@
 import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import List, Optional

+import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    LogCursor,
    NeonEnvBuilder,
+    NeonPageserver,
 )
 from fixtures.pageserver.utils import wait_timeline_detail_404


-def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("sharded", [True, False])
+def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool):
+    neon_env_builder.num_pageservers = 2 if sharded else 1
    env = neon_env_builder.init_start(
-        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
+        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"},
+        initial_tenant_shard_count=2 if sharded else None,
    )
-    ps = env.pageserver
-    http = ps.http_client()
+
+    if sharded:
+        http = env.storage_controller.pageserver_api()
+    else:
+        http = env.pageserver.http_client()
+
+    pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers)))

    foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)

@@ -22,9 +37,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
    tenant_before = http.tenant_status(env.initial_tenant)

    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line)
-
-    assert ps.log_contains(gc_skipped_line, offset) is None
+    pss.assert_log_contains(gc_active_line)
+    pss.assert_log_does_not_contain(gc_skipped_line)

    http.timeline_block_gc(env.initial_tenant, foo_branch)

@@ -34,34 +48,78 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"

    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)

-    ps.restart()
-    ps.quiesce_tenants()
+    pss.restart()
+    pss.quiesce_tenants()

-    _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)
+    pss.assert_log_contains(init_gc_skipped)

    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)

    # deletion unblocks gc
    http.timeline_delete(env.initial_tenant, foo_branch)
    wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)

    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
+    pss.assert_log_contains(gc_active_line)

    http.timeline_block_gc(env.initial_tenant, env.initial_timeline)

    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)

    # removing the manual block also unblocks gc
    http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)

    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
+    pss.assert_log_contains(gc_active_line)


 def wait_for_another_gc_round():
+    # Fixed two-second pause. The test configures gc_period as "1s", so this is
+    # presumably long enough for at least one more gc iteration to run.
     time.sleep(2)
+
+
+@dataclass
+class ScrollableLog:
+    """A pageserver paired with a cursor into its log.
+
+    The cursor is handed to the pageserver's log search helpers as their
+    offset, so successive assertions continue from the previous match
+    instead of starting over.
+    """
+
+    pageserver: NeonPageserver
+    # Cursor returned by the most recent `assert_log_contains` match; the test
+    # constructs instances with None, i.e. no cursor recorded yet.
+    offset: Optional[LogCursor]
+
+    def assert_log_contains(self, what: str):
+        """Assert that the log contains `what`, searching from the stored cursor.
+
+        On success the stored cursor is replaced by the one returned for the
+        match, and the old cursor, new cursor and matched line are logged.
+        """
+        msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset)
+        old = self.offset
+        self.offset = offset
+        log.info(f"{old} -> {offset}: {msg}")
+
+    def assert_log_does_not_contain(self, what: str):
+        """Assert that `what` is not found when searching from the stored cursor.
+
+        The stored cursor is left unchanged.
+        """
+        # Pass the cursor so that this check is relative to the same position
+        # as assert_log_contains; without it the whole log is scanned and an
+        # occurrence from an earlier phase of the test would fail the assert.
+        assert self.pageserver.log_contains(what, self.offset) is None
+
+
+@dataclass(frozen=True)
+class ManyPageservers:
+    """Apply log assertions and lifecycle operations to several pageservers.
+
+    Log assertions run one pageserver after another; `restart` and
+    `quiesce_tenants` run on one worker thread per pageserver.
+    """
+
+    # One log cursor wrapper per pageserver.
+    many: List[ScrollableLog]
+
+    def assert_log_contains(self, what: str):
+        """Run `ScrollableLog.assert_log_contains(what)` on every entry."""
+        for one in self.many:
+            one.assert_log_contains(what)
+
+    def assert_log_does_not_contain(self, what: str):
+        """Run `ScrollableLog.assert_log_does_not_contain(what)` on every entry."""
+        for one in self.many:
+            one.assert_log_does_not_contain(what)
+
+    def restart(self):
+        """Restart all pageservers concurrently; a failure in any of them is raised."""
+        self._in_parallel(lambda x: x.pageserver.restart())
+
+    def quiesce_tenants(self):
+        """Call `quiesce_tenants` on all pageservers concurrently; failures are raised."""
+        self._in_parallel(lambda x: x.pageserver.quiesce_tenants())
+
+    def _in_parallel(self, fn):
+        """Call `fn` with every entry of `many`, using one worker thread per entry."""
+        if not self.many:
+            # ThreadPoolExecutor raises ValueError for max_workers=0.
+            return
+
+        with ThreadPoolExecutor(max_workers=len(self.many)) as rt:
+            # Executor.map re-raises a worker's exception only when its result
+            # is retrieved, so drain the iterator; otherwise a failed call
+            # would pass unnoticed. Leaving the `with` block already waits for
+            # the workers, so no explicit shutdown is needed.
+            for _ in rt.map(fn, self.many):
+                pass