From bc5c2d00cbc2e6937a19bbcaa2fbe46504f72474 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Thu, 9 May 2024 16:06:10 +0100
Subject: [PATCH] tests: add test_sharding_autosplit

---
 .../performance/test_sharding_autosplit.py | 262 ++++++++++++++++++
 1 file changed, 262 insertions(+)
 create mode 100644 test_runner/performance/test_sharding_autosplit.py

diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py
new file mode 100644
index 0000000000..8f9a7124e9
--- /dev/null
+++ b/test_runner/performance/test_sharding_autosplit.py
@@ -0,0 +1,262 @@
+import concurrent.futures
+import re
+from pathlib import Path
+
+import pytest
+from fixtures.common_types import TenantId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    tenant_get_shards,
+)
+
+
+@pytest.mark.timeout(600)
+def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Check that sharding, including auto-splitting, "just works" under pgbench workloads.
+
+    This is not a benchmark, but it lives alongside the benchmarks so that it runs
+    on a dedicated node that can sustain significant throughput.
+
+    Other tests validate the details of shard splitting, error cases, etc. This test is
+    the sanity check that it all really works as expected with realistic amounts of data
+    and under load.
+
+    Success conditions:
+    - Tenants auto-split as the data they hold grows
+    - Client workloads are not interrupted while that happens
+    """
+
+    neon_env_builder.num_pageservers = 8
+    neon_env_builder.storage_controller_config = {
+        # Split tenants at 500MB: it's up to the storage controller how it interprets this
+        # (logical size, physical size, etc.). We will write this much data logically, so
+        # any other size metric will reliably be greater.
+        "split_threshold": 1024 * 1024 * 500
+    }
+
+    tenant_conf = {
+        # We want layer rewrites to happen as soon as possible (this is the most stressful
+        # case for the system), so set the PITR interval to something tiny.
+        "pitr_interval": "5s",
+        # Scaled-down thresholds. We will run at ~1GB scale but would like to emulate
+        # the behavior of a system running at ~100GB scale.
+        "checkpoint_distance": f"{1024 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{1024 * 1024}",
+        "image_creation_threshold": "2",
+        "image_layer_creation_check_threshold": "0",
+    }
+
+    env = neon_env_builder.init_start()
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [
+                # We shut down pageservers while they might have some compaction work going on
+                ".*Compaction failed.*shutting down.*"
+            ]
+        )
+
+    # Total tenants
+    tenant_count = 4
+
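+    # A rough sizing sanity check (a sketch: the ~15MB-per-pgbench-scale-unit figure is an
+    # approximation, not something this test measures). The pgbench runs below use -s50,
+    # initializing on the order of 750MB of logical data per tenant, so each tenant is
+    # expected to cross the 500MB split threshold.
+    approx_bytes_per_pgbench_scale_unit = 15 * 1024 * 1024
+    assert 50 * approx_bytes_per_pgbench_scale_unit > 1024 * 1024 * 500
+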
+    # Transaction rate: we set this rather than running at full speed because we might run
+    # on a slow node that doesn't cope well with many full-speed pgbenches running concurrently.
+    transaction_rate = 100
+
+    class TenantState:
+        def __init__(self, timeline_id, endpoint):
+            self.timeline_id = timeline_id
+            self.endpoint = endpoint
+
+    # Create tenants
+    tenants = {}
+    for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)):
+        timeline_id = TimelineId.generate()
+        env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf)
+        endpoint = env.endpoints.create("main", tenant_id=tenant_id)
+        tenants[tenant_id] = TenantState(timeline_id, endpoint)
+        endpoint.start()
+
+    def run_pgbench_init(endpoint):
+        pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-i",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+    def check_pgbench_output(endpoint, out_path: str):
+        """
+        When we run pgbench, we want not just an absence of errors, but also continuous evidence
+        of I/O progressing: our shard splitting and migration should not interrupt the benchmark.
+        """
+        matched_lines = 0
+        stderr = Path(f"{out_path}.stderr").read_text()
+
+        low_watermark = None
+
+        # Apply this as a threshold for what we consider an unacceptable interruption to I/O
+        min_tps = transaction_rate // 10
+
+        for line in stderr.split("\n"):
+            match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line)
+            if match is None:
+                continue
+            else:
+                matched_lines += 1
+
+            (_time, tps, failed) = match.groups()
+            tps = float(tps)
+            failed = int(failed)
+
+            if failed > 0:
+                raise RuntimeError(
+                    f"pgbench on tenant {endpoint.tenant_id} run at {out_path} had {failed} failed transactions"
+                )
+
+            if low_watermark is None or low_watermark > tps:
+                low_watermark = tps
+
+            if tps < min_tps:
+                raise RuntimeError(
+                    f"pgbench on tenant {endpoint.tenant_id} run at {out_path} had tps {tps} below minimum {min_tps}"
+                )
+
+        log.info(f"Checked {matched_lines} progress lines, lowest TPS was {low_watermark}")
+
+        if matched_lines == 0:
+            raise RuntimeError(f"pgbench output at {out_path} contained no progress lines")
+
+    def run_pgbench_main(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "300",
+                "-R",
+                f"{transaction_rate}",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(endpoint, out_path)
+
+    def run_pgbench_read(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "60",
+                "-R",
+                f"{transaction_rate}",
+                "-S",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(endpoint, out_path)
+
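+    # For reference, check_pgbench_output parses the per-second progress reports that
+    # pgbench emits with -P 1. With rate limiting (-R) and a pgbench version that reports
+    # failed transactions, they look roughly like the following sample (the exact fields
+    # vary by pgbench version, so this shape is an assumption):
+    #
+    #   progress: 60.0 s, 99.8 tps, lat 1.5 ms stddev 1.2, lag 0.3 ms, 0 failed
+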
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench inits")
+        for fut in pgbench_futs:
+            fut.result()
+
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read/write pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    def assert_all_split():
+        for tenant_id in tenants.keys():
+            shards = tenant_get_shards(env, tenant_id)
+            assert len(shards) == 8
+
+    # This is deliberately not a wait_until: we want the splits to have happened _while_ pgbench
+    # was running, otherwise this test is not properly doing its job of validating that splits
+    # work nicely under load.
+    assert_all_split()
+
+    env.storage_controller.assert_log_contains(".*Successful auto-split.*")
+
+    # Log timeline sizes: useful for debugging, and implicitly validates that the shards
+    # are available in the places the controller thinks they should be.
+    for tenant_id, tenant_state in tenants.items():
+        (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0]
+        timeline_info = shard_zero_ps.http_client().timeline_detail(
+            shard_zero_id, tenant_state.timeline_id
+        )
+        log.info(f"{shard_zero_id} timeline: {timeline_info}")
+
+    # Run compaction for all tenants, and restart their endpoints so that subsequent reads
+    # will definitely hit the pageserver. This compaction pass is expected to drop unwanted
+    # layers but not do any rewrites (we're still in the same generation).
+    for tenant_id, tenant_state in tenants.items():
+        tenant_state.endpoint.stop()
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+            shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id)
+        tenant_state.endpoint.start()
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    env.storage_controller.consistency_check()
+
+    # Restart the storage controller
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    env.storage_controller.consistency_check()
+
+    # Restart all pageservers
+    for ps in env.pageservers:
+        ps.stop()
+        ps.start()
+
+    # Freshen gc_info in Timeline, so that when compaction runs in the background during the
+    # subsequent pgbench period, the last_gc_cutoff is updated, enabling the conditions for
+    # a rewrite to pass.
+    for tenant_id, tenant_state in tenants.items():
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+
+    # One last check that data remains readable after everything has restarted
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    # Assert that some rewrites happened
+    # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged
+    # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers)
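+    # A possible extra post-restart probe, sketched with the same helpers used earlier
+    # (kept as a comment, since the read pass above already exercises reads end-to-end):
+    # confirm that every shard responds for its timeline where the controller placed it.
+    #
+    #   for tenant_id, tenant_state in tenants.items():
+    #       for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+    #           shard_ps.http_client().timeline_detail(shard_id, tenant_state.timeline_id)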