From e65f0fe874aa4762d5d4702349647677ea2c352e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 6 Feb 2024 17:00:55 +0000 Subject: [PATCH] CI(benchmarks): make job split consistent across reruns (#6614) ## Problem We've got several issues with the current `benchmarks` job setup: - The `benchmark_durations.json` file (which we generate at runtime to split tests across several jobs[0]) is not consistent between these jobs (and is even less consistent with the file generated when we rerun a job). I.e. the test selection for each job can be different, which can result in tests being missed in a test run. - `scripts/benchmark_durations.py` doesn't fetch all tests from the database (it doesn't expect any extra directories inside `test_runner/performance`) - For some reason, the current split into 4 groups ends up with the 4th group having no tests to run, which fails the job[1] - [0] https://github.com/neondatabase/neon/pull/4683 - [1] https://github.com/neondatabase/neon/issues/6629 ## Summary of changes - Generate the `benchmark_durations.json` file once, before the `benchmarks` jobs start (this makes it consistent across the jobs), and pass the file content through a GitHub Actions input (this makes it consistent across reruns) - Fix the SQL query in `scripts/benchmark_durations.py` so that it fetches all required tests - Split benchmarks into 5 jobs instead of 4.
--- .../actions/run-python-test-set/action.yml | 6 +- .github/workflows/build_and_test.yml | 39 ++++- scripts/benchmark_durations.py | 133 +++++++++--------- 3 files changed, 111 insertions(+), 67 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 8dfa6c465f..7a88e4f73b 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,10 @@ inputs: description: 'Postgres version to use for tests' required: false default: 'v14' + benchmark_durations: + description: 'benchmark durations JSON' + required: false + default: '{}' runs: using: "composite" @@ -160,7 +164,7 @@ runs: # We use pytest-split plugin to run benchmarks in parallel on different CI runners if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then mkdir -p $TEST_OUTPUT - poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json" + echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS" fi diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9fe9636d67..066f4a21eb 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -478,8 +478,40 @@ jobs: if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' uses: ./.github/actions/save-coverage-data + get-benchmarks-durations: + outputs: + json: ${{ steps.get-benchmark-durations.outputs.json }} + needs: [ check-permissions, build-buildtools-image ] + runs-on: [ self-hosted, gen3, small ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + options: --init + if: github.ref_name == 'main' || 
contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Cache poetry deps + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: get benchmark durations + id: get-benchmark-durations + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + run: | + poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \ + --days 10 \ + --output /tmp/benchmark_durations.json + echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT + benchmarks: - needs: [ check-permissions, build-neon, build-buildtools-image ] + needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ] runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} @@ -490,7 +522,7 @@ jobs: fail-fast: false matrix: # the amount of groups (N) should be reflected in `extra_params: --splits N ...` - pytest_split_group: [ 1, 2, 3, 4 ] + pytest_split_group: [ 1, 2, 3, 4, 5 ] build_type: [ release ] steps: - name: Checkout @@ -503,7 +535,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits 4 --group ${{ matrix.pytest_split_group }} + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 7f05d72a03..01f34a1b96 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -20,7 
+20,7 @@ BENCHMARKS_DURATION_QUERY = """ FROM results WHERE started_at > CURRENT_DATE - INTERVAL '%s' day - AND parent_suite = 'test_runner.performance' + AND starts_with(parent_suite, 'test_runner.performance') AND status = 'passed' GROUP BY parent_suite, suite, name @@ -31,68 +31,75 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. FALLBACK_DURATION = { - "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053, - "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67, - "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497, - "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262, - "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225, - "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159, - "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719, - "test_runner/performance/test_compaction.py::test_compaction": 110.222, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743, - 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321, - "test_runner/performance/test_copy.py::test_copy[neon]": 16.579, - "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094, - "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119, - 
"test_runner/performance/test_layer_map.py::test_layer_map": 24.784, - "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753, - "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975, - "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899, - "test_runner/performance/test_startup.py::test_startup_simple": 2.51, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728, - 
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282, - "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704, - "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, + "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, + 
"test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, + "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759, + "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885, + "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28, + "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353, + "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487, + "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142, + "test_runner/performance/test_compaction.py::test_compaction": 110.715, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04, + 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434, + "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, + "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849, + "test_runner/performance/test_layer_map.py::test_layer_map": 39.378, + "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938, + "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12, 
+ "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582, + "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737, + "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35, + "test_runner/performance/test_startup.py::test_startup_simple": 13.043, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083, + 
"test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016, + "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028, }