Optimize branch creation (#2101)

Resolves #2054

**Context**: branch creation must acquire the `gc_cs` lock, which GC holds for the duration of a GC iteration, so new timelines cannot be created while GC is running. Worse, because each per-timeline GC iteration also takes that timeline's `compaction_cs` lock, branch creation may transitively wait for the compactions of multiple timelines. The result is high latency for an operation we advertise as *"instant"*.

This PR optimizes the latency of branch creation by separating GC into two phases:
1. Collect GC data (branching points, cutoff LSNs, etc.)
2. Perform GC for each timeline

The bottleneck is step 2, which must wait for the compaction of multiple timelines. This PR changes the branch creation and GC code so that GC holds the GC lock only during step 1. As a result, branch creation no longer waits for compaction to finish; it only waits for the GC data collection step, which is fast.
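To make the shape of the change concrete, here is a minimal Python sketch of the idea. This is not the actual pageserver code (which is in Rust); the dict-based timeline state, `horizon` parameter, and function names are illustrative only:

```python
import threading

# A minimal sketch of the two-phase GC, with in-memory dicts standing in for
# real timeline state (the actual implementation lives in the Rust pageserver).
gc_cs = threading.Lock()
timelines = {'main': {'gc_cutoff_lsn': 0, 'last_lsn': 100}}

def gc_iteration(horizon: int):
    # Phase 1: compute the new cutoff LSN for every timeline under the GC lock.
    # This is cheap: no layer files are touched yet.
    with gc_cs:
        cutoffs = {name: max(0, tl['last_lsn'] - horizon)
                   for name, tl in timelines.items()}

    # Phase 2: apply GC per timeline *without* the GC lock. This is the slow
    # part that may block on each timeline's compaction lock.
    for name, cutoff in cutoffs.items():
        timelines[name]['gc_cutoff_lsn'] = cutoff  # stand-in for deleting layers

def create_branch(src: str, name: str, start_lsn: int):
    # Branch creation takes the same lock, so it now contends only with the
    # fast phase 1 above, never with compaction.
    with gc_cs:
        if start_lsn < timelines[src]['gc_cutoff_lsn']:
            raise ValueError('invalid branch start lsn')
        timelines[name] = {'gc_cutoff_lsn': 0, 'last_lsn': start_lsn}
```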
Commit 160e52ec7e (parent 98dd2e4f52), authored by Thang Pham on 2022-07-19 14:56:25 -04:00, committed by GitHub. 6 changed files with 359 additions and 126 deletions.


@@ -1,3 +1,5 @@
import threading
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import lsn_from_hex
@@ -99,3 +101,67 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
    branch_cur.execute('SELECT count(*) FROM foo')
    assert branch_cur.fetchone() == (200000, )

# This test simulates a race condition that can happen when branch creation and GC run concurrently.
#
# Suppose we want to create a new timeline 't' from a source timeline 's', starting
# at an LSN 'lsn'. If, while creating 't', we don't hold the GC lock and carefully
# compare 'lsn' with the latest GC information, it's possible for GC to accidentally
# remove data needed by the new timeline.
#
# In this test, GC is requested before the branch creation but is delayed so that it
# happens after the branch creation. As a result, when doing GC for the source timeline,
# we don't have any information about the upcoming new branches, so it's possible to
# remove data that the new branches may still need. It's the branch creation task's job
# to make sure the starting 'lsn' is still in scope and to refuse to create branches
# with invalid starting LSNs (a sketch of this check follows the test).
#
# For more details, see the discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447.
def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    env = neon_simple_env

    # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            # effectively disable background GC for the duration of the test
            'gc_period': '10 m',
            'gc_horizon': f'{10 * 1024 ** 3}',
            # small checkpoint distance to create more delta layer files
            'checkpoint_distance': f'{1024 ** 2}',
            # large target size so the image layer can cover the whole key space
            'compaction_target_size': f'{1024 ** 3}',
            # tweak the default settings to quickly create image layers and L1 layers
            'compaction_period': '1 s',
            'compaction_threshold': '2',
            'image_creation_threshold': '1',
            # set PITR interval to be small, so we can do GC
            'pitr_interval': '1 s'
        })
    b0 = env.neon_cli.create_branch('b0', tenant_id=tenant)
    pg0 = env.postgres.create_start('b0', tenant_id=tenant)

    res = pg0.safe_psql_many(queries=[
        "CREATE TABLE t(key serial primary key)",
        "INSERT INTO t SELECT FROM generate_series(1, 100000)",
        "SELECT pg_current_wal_insert_lsn()",
        "INSERT INTO t SELECT FROM generate_series(1, 100000)",
    ])
    # The LSN returned by pg_current_wal_insert_lsn() (the third query above)
    lsn = res[2][0][0]

    # Use a `sleep` failpoint and `threading` so that the GC iteration is triggered
    # *before* the branch creation task, but the individual timeline GC iteration
    # happens *after* the branch creation task.
    env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)")

    def do_gc():
        env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0")

    thread = threading.Thread(target=do_gc, daemon=True)
    thread.start()

    # The starting LSN is invalid because the corresponding record is scheduled
    # to be removed by the already-queued GC.
    with pytest.raises(Exception, match="invalid branch start lsn"):
        env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn)
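The guard exercised above, sketched as a standalone function (hypothetical name and signature; in the real code the check lives in the pageserver's branch creation path and runs under the GC lock):

```python
def check_branch_start_lsn(start_lsn: int, latest_gc_cutoff_lsn: int) -> None:
    # Reject branch points that GC has already reclaimed (or is queued to
    # reclaim); this is what surfaces as the "invalid branch start lsn" error.
    if start_lsn < latest_gc_cutoff_lsn:
        raise ValueError(f"invalid branch start lsn: {start_lsn} is earlier "
                         f"than latest GC cutoff {latest_gc_cutoff_lsn}")
```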


@@ -0,0 +1,110 @@
import random
import time
import statistics
import threading
import timeit
import pytest
from typing import List
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import NeonCompare
from fixtures.log_helper import log

def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]):
    neon_compare.zenbenchmark.record("branch_creation_duration_max",
                                     max(durs),
                                     's',
                                     MetricReport.LOWER_IS_BETTER)
    neon_compare.zenbenchmark.record("branch_creation_duration_avg",
                                     statistics.mean(durs),
                                     's',
                                     MetricReport.LOWER_IS_BETTER)
    neon_compare.zenbenchmark.record("branch_creation_duration_stdev",
                                     statistics.stdev(durs),
                                     's',
                                     MetricReport.LOWER_IS_BETTER)

@pytest.mark.parametrize("n_branches", [20])
# This test measures the latency of branch creation during a heavy [1] workload.
#
# [1]: to simulate a heavy workload, the test tweaks the GC and compaction settings
# to increase how often those tasks run. The test runs `pgbench` in each new branch.
# Each branch is created from a randomly picked source branch.
def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int):
    env = neon_compare.env
    pg_bin = neon_compare.pg_bin

    # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            'gc_period': '5 s',
            'gc_horizon': f'{4 * 1024 ** 2}',
            'checkpoint_distance': f'{2 * 1024 ** 2}',
            'compaction_target_size': f'{1024 ** 2}',
            'compaction_threshold': '2',
            # set PITR interval to be small, so we can do GC
            'pitr_interval': '5 s'
        })

    def run_pgbench(branch: str):
        log.info(f"Start a pgbench workload on branch {branch}")

        pg = env.postgres.create_start(branch, tenant_id=tenant)
        connstr = pg.connstr()

        pg_bin.run_capture(['pgbench', '-i', connstr])
        pg_bin.run_capture(['pgbench', '-c10', '-T10', connstr])
        pg.stop()

    env.neon_cli.create_branch('b0', tenant_id=tenant)

    threads: List[threading.Thread] = []
    threads.append(threading.Thread(target=run_pgbench, args=('b0', ), daemon=True))
    threads[-1].start()

    branch_creation_durations = []
    for i in range(n_branches):
        time.sleep(1.0)

        # randomly pick a source branch
        p = random.randint(0, i)

        timer = timeit.default_timer()
        env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p), tenant_id=tenant)
        dur = timeit.default_timer() - timer

        log.info(f"Creating branch b{i+1} took {dur}s")
        branch_creation_durations.append(dur)

        threads.append(threading.Thread(target=run_pgbench, args=(f'b{i+1}', ), daemon=True))
        threads[-1].start()

    for thread in threads:
        thread.join()

    _record_branch_creation_durations(neon_compare, branch_creation_durations)

@pytest.mark.parametrize("n_branches", [1024])
# This test measures the latency of branch creation when creating a lot of branches.
def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
    env = neon_compare.env

    env.neon_cli.create_branch('b0')
    pg = env.postgres.create_start('b0')
    neon_compare.pg_bin.run_capture(['pgbench', '-i', '-s10', pg.connstr()])

    branch_creation_durations = []
    for i in range(n_branches):
        # randomly pick a source branch
        p = random.randint(0, i)

        timer = timeit.default_timer()
        env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p))
        dur = timeit.default_timer() - timer
        branch_creation_durations.append(dur)

    _record_branch_creation_durations(neon_compare, branch_creation_durations)