test_runner: run benchmarks in parallel (#4683)

## Problem Benchmarks run takes about an hour on main branch (in a single job), which delays pipeline results. And it takes another hour if we want to restart the job due to some failures. ## Summary of changes - Use `pytest-split` plugin to run benchmarks on separate CI runners in 4 parallel jobs - Add `scripts/benchmark_durations.py` for getting benchmark durations from the database to help `pytest-split` schedule tests more evenly. It uses p99 for the last 10 days' results (durations). The current distribution could be better; each worker's durations vary from 9m to 35m, but this could be improved in consequent PRs.
2026-06-03 21:40:39 +00:00 · 2023-07-17 20:09:45 +01:00
parent e074ccf170
commit 4580f5085a
5 changed files with 204 additions and 4 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -150,6 +150,14 @@ runs:
          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi

+        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
+        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
+          mkdir -p $TEST_OUTPUT
+          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+
+          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
+        fi
+
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -396,13 +396,11 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
+        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 1

      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
@@ -411,9 +409,11 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
+          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

--- a/poetry.lock
+++ b/poetry.lock
@@ -1868,6 +1868,20 @@ files = [
 packaging = ">=17.1"
 pytest = ">=5.3"

+[[package]]
+name = "pytest-split"
+version = "0.8.1"
+description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time."
+optional = false
+python-versions = ">=3.7.1,<4.0"
+files = [
+    {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"},
+    {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"},
+]
+
+[package.dependencies]
+pytest = ">=5,<8"
+
 [[package]]
 name = "pytest-timeout"
 version = "2.1.0"
@@ -2513,4 +2527,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "fe771b153ef7e308d6d04421d0eb3f97d00780882277d2b4fc1f296054d8db79"
+content-hash = "e16a65d8fdff4e2173610e552e0e7306e301de2c640ae6082ef6cc5755f566d2"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ pytest-httpserver = "^1.0.8"
 aiohttp = "3.7.4"
 pytest-rerunfailures = "^11.1.2"
 types-pytest-lazy-fixture = "^0.6.3.3"
+pytest-split = "^0.8.1"

 [tool.poetry.group.dev.dependencies]
 black = "^23.3.0"
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -0,0 +1,177 @@
+#! /usr/bin/env python3
+
+import argparse
+import json
+import logging
+from typing import Dict
+
+import psycopg2
+import psycopg2.extras
+
+"""
+The script fetches the durations of benchmarks from the database and stores it in a file compatible with pytest-split plugin.
+"""
+
+
+BENCHMARKS_DURATION_QUERY = """
+    SELECT
+        DISTINCT parent_suite, suite, test,
+        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
+    FROM
+        (
+            SELECT
+                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
+                (jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
+            FROM
+                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
+        ) data
+    WHERE
+        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        AND parent_suite = 'test_runner.performance'
+        AND status = 'passed'
+    GROUP BY
+        parent_suite, suite, test
+    ;
+"""
+
+# For out benchmarks the default distibution for 4 worked produces pretty uneven chunks,
+# the total duration varies from 8 to 40 minutes.
+# We use some pre-collected durations as a fallback to have a better distribution.
+FALLBACK_DURATION = {
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0,
+    "test_runner/performance/test_compaction.py::test_compaction": 77.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_copy.py::test_copy[neon]": 12.0,
+    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0,
+    "test_runner/performance/test_layer_map.py::test_layer_map": 44.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0,
+    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0,
+    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0,
+    "test_runner/performance/test_startup.py::test_startup_simple": 2.0,
+    "test_runner/performance/test_startup.py::test_startup": 539.0,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0,
+}
+
+
+def main(args: argparse.Namespace):
+    connstr = args.connstr
+    interval_days = args.days
+    output = args.output
+    percentile = args.percentile
+
+    res: Dict[str, float] = {}
+
+    try:
+        logging.info("connecting to the database...")
+        with psycopg2.connect(connstr, connect_timeout=30) as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+                logging.info("fetching benchmarks...")
+                cur.execute(BENCHMARKS_DURATION_QUERY, (percentile, interval_days))
+                rows = cur.fetchall()
+    except psycopg2.OperationalError as exc:
+        logging.error("cannot fetch benchmarks duration from the DB due to an error", exc)
+        rows = []
+        res = FALLBACK_DURATION
+
+    for row in rows:
+        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}"
+        duration = row["percentile_ms"] / 1000
+        logging.info(f"\t{pytest_name}: {duration}")
+        res[pytest_name] = duration
+
+    logging.info(f"saving results to {output.name}")
+    json.dump(res, output, indent=2)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Get <percentile> of benchmarks duration for the last <N> days"
+    )
+    parser.add_argument(
+        "--output",
+        type=argparse.FileType("w"),
+        default=".test_durations",
+        help="path to output json file (default: .test_durations)",
+    )
+    parser.add_argument(
+        "--percentile",
+        type=float,
+        default="0.99",
+        help="percentile (default: 0.99)",
+    )
+    parser.add_argument(
+        "--days",
+        required=False,
+        default=10,
+        type=int,
+        help="how many days to look back for (default: 10)",
+    )
+    parser.add_argument(
+        "connstr",
+        help="connection string to the test results database",
+    )
+    args = parser.parse_args()
+
+    level = logging.INFO
+    logging.basicConfig(
+        format="%(message)s",
+        level=level,
+    )
+
+    main(args)