diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index dec1f47e47..ceb6f4aa90 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -150,6 +150,14 @@ runs:
         EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
       fi
 
+      # We use the pytest-split plugin to run benchmarks in parallel on different CI runners
+      if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
+        mkdir -p $TEST_OUTPUT
+        poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+
+        EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
+      fi
+
       if [[ "${{ inputs.build_type }}" == "debug" ]]; then
         cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
       elif [[ "${{ inputs.build_type }}" == "release" ]]; then
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index daa0a0da98..5f3e4f1145 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -396,13 +396,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
+        pytest_split_group: [ 1, 2, 3, 4 ]
         build_type: [ release ]
 
     steps:
       - name: Checkout
         uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 1
 
       - name: Pytest benchmarks
         uses: ./.github/actions/run-python-test-set
@@ -411,9 +409,11 @@
           test_selection: performance
           run_in_parallel: false
           save_perf_report: ${{ github.ref_name == 'main' }}
+          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
 
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
diff --git a/poetry.lock b/poetry.lock
index aadbb7c33f..b22a6a5bc9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1868,6 +1868,20 @@ files = [
 packaging = ">=17.1"
 pytest = ">=5.3"
 
+[[package]]
+name = "pytest-split"
+version = "0.8.1"
+description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time."
+optional = false
+python-versions = ">=3.7.1,<4.0"
+files = [
+    {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"},
+    {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"},
+]
+
+[package.dependencies]
+pytest = ">=5,<8"
+
 [[package]]
 name = "pytest-timeout"
 version = "2.1.0"
@@ -2513,4 +2527,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "fe771b153ef7e308d6d04421d0eb3f97d00780882277d2b4fc1f296054d8db79"
+content-hash = "e16a65d8fdff4e2173610e552e0e7306e301de2c640ae6082ef6cc5755f566d2"
diff --git a/pyproject.toml b/pyproject.toml
index ac4e8fa2dd..f02c350587 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ pytest-httpserver = "^1.0.8"
 aiohttp = "3.7.4"
 pytest-rerunfailures = "^11.1.2"
 types-pytest-lazy-fixture = "^0.6.3.3"
+pytest-split = "^0.8.1"
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.3.0"
diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py
new file mode 100755
index 0000000000..37f8470038
--- /dev/null
+++ b/scripts/benchmark_durations.py
@@ -0,0 +1,177 @@
+#! /usr/bin/env python3
+
+"""
+This script fetches benchmark durations from the database and stores them in a
+file compatible with the pytest-split plugin.
+"""
+
+import argparse
+import json
+import logging
+from typing import Dict
+
+import psycopg2
+import psycopg2.extras
+
+
+BENCHMARKS_DURATION_QUERY = """
+    SELECT
+        DISTINCT parent_suite, suite, test,
+        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
+    FROM
+        (
+            SELECT
+                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
+                (jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
+            FROM
+                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
+        ) data
+    WHERE
+        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        AND parent_suite = 'test_runner.performance'
+        AND status = 'passed'
+    GROUP BY
+        parent_suite, suite, test
+    ;
+"""
+
+# For our benchmarks, the default distribution across 4 workers produces pretty
+# uneven chunks: the total duration varies from 8 to 40 minutes.
+# We use some pre-collected durations as a fallback to get a better distribution.
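+#
+# The keys below are pytest node IDs and the values are durations in seconds,
+# the same format pytest-split keeps in its .test_durations file.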
+FALLBACK_DURATION = {
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0,
+    "test_runner/performance/test_compaction.py::test_compaction": 77.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_copy.py::test_copy[neon]": 12.0,
+    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0,
+    "test_runner/performance/test_layer_map.py::test_layer_map": 44.0,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0, + "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0, + "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0, + "test_runner/performance/test_startup.py::test_startup_simple": 2.0, + "test_runner/performance/test_startup.py::test_startup": 539.0, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0, + "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0, + "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0, +} + + +def main(args: argparse.Namespace): + connstr = args.connstr + interval_days = args.days + output = args.output + percentile = args.percentile + + res: Dict[str, float] = {} + + try: + logging.info("connecting to the database...") + with psycopg2.connect(connstr, connect_timeout=30) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + logging.info("fetching benchmarks...") + cur.execute(BENCHMARKS_DURATION_QUERY, (percentile, interval_days)) + rows = cur.fetchall() + except psycopg2.OperationalError as exc: + logging.error("cannot fetch benchmarks duration from the DB due to an error", exc) + rows = [] + res = FALLBACK_DURATION + + for row in rows: + pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}" + duration = row["percentile_ms"] / 1000 + logging.info(f"\t{pytest_name}: {duration}") + res[pytest_name] = duration + + logging.info(f"saving results to {output.name}") + 
+    json.dump(res, output, indent=2)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Get benchmark durations for the last N days"
+    )
+    parser.add_argument(
+        "--output",
+        type=argparse.FileType("w"),
+        default=".test_durations",
+        help="path to output json file (default: .test_durations)",
+    )
+    parser.add_argument(
+        "--percentile",
+        type=float,
+        default=0.99,
+        help="percentile (default: 0.99)",
+    )
+    parser.add_argument(
+        "--days",
+        required=False,
+        default=10,
+        type=int,
+        help="how many days to look back for (default: 10)",
+    )
+    parser.add_argument(
+        "connstr",
+        help="connection string to the test results database",
+    )
+    args = parser.parse_args()
+
+    level = logging.INFO
+    logging.basicConfig(
+        format="%(message)s",
+        level=level,
+    )
+
+    main(args)
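+
+# A minimal usage sketch mirroring what the CI action above does (the output
+# path and connection string here are just examples):
+#
+#   poetry run ./scripts/benchmark_durations.py "$TEST_RESULT_CONNSTR" \
+#       --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+#   poetry run pytest test_runner/performance \
+#       --splits 4 --group 2 --durations-path "$TEST_OUTPUT/benchmark_durations.json"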