Switch to querying new tests results DB (#5616)

## Problem

We started to store test results in a new format in
https://github.com/neondatabase/neon/pull/4549.
This PR switches scripts to query this db.

(we can completely remove old DB/ingestions scripts in a couple of
weeks after the PR merged)

## Summary of changes
- `scripts/benchmark_durations.py` query new database
- `scripts/flaky_tests.py` query new database
This commit is contained in:
Alexander Bayandin
2023-10-25 14:25:13 +01:00
committed by GitHub
parent 8b8be7bed4
commit 4778b6a12e
4 changed files with 87 additions and 106 deletions

View File

@@ -203,6 +203,10 @@ runs:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }} BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
run: | run: |
if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
exit 0
fi
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW} export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
./scripts/pysync ./scripts/pysync

View File

@@ -433,7 +433,7 @@ jobs:
rerun_flaky: true rerun_flaky: true
pg_version: ${{ matrix.pg_version }} pg_version: ${{ matrix.pg_version }}
env: env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
- name: Merge and upload coverage data - name: Merge and upload coverage data
@@ -468,7 +468,7 @@ jobs:
env: env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds, # XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones # while coverage is currently collected for the debug ones

View File

@@ -15,28 +15,15 @@ The script fetches the durations of benchmarks from the database and stores it i
BENCHMARKS_DURATION_QUERY = """ BENCHMARKS_DURATION_QUERY = """
SELECT SELECT
DISTINCT parent_suite, suite, test, DISTINCT parent_suite, suite, name,
PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration) as percentile_ms
FROM FROM results
(
SELECT
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
(jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
FROM
regress_test_results
WHERE
reference = 'refs/heads/main'
) data
WHERE WHERE
timestamp > CURRENT_DATE - INTERVAL '%s' day started_at > CURRENT_DATE - INTERVAL '%s' day
AND parent_suite = 'test_runner.performance' AND parent_suite = 'test_runner.performance'
AND status = 'passed' AND status = 'passed'
GROUP BY GROUP BY
parent_suite, suite, test parent_suite, suite, name
; ;
""" """
@@ -44,68 +31,69 @@ BENCHMARKS_DURATION_QUERY = """
# the total duration varies from 8 to 40 minutes. # the total duration varies from 8 to 40 minutes.
# We use some pre-collected durations as a fallback to have a better distribution. # We use some pre-collected durations as a fallback to have a better distribution.
FALLBACK_DURATION = { FALLBACK_DURATION = {
"test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0, "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0, "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0, "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
"test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0, "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
"test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0, "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
"test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0, "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0, "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0, "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0, "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0, "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0, "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
"test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0, "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
"test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0, "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
"test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0, "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
"test_runner/performance/test_compaction.py::test_compaction": 77.0, "test_runner/performance/test_compaction.py::test_compaction": 110.222,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0, "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
"test_runner/performance/test_copy.py::test_copy[neon]": 12.0, "test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
"test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0, "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0, "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0, "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0, "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
"test_runner/performance/test_layer_map.py::test_layer_map": 44.0, "test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0, "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0, "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0, "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0, "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0, "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0, "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
"test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0, "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
"test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0, "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0, "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0, "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0, "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0, "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0, "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0, "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
"test_runner/performance/test_startup.py::test_startup_simple": 2.0, "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
"test_runner/performance/test_startup.py::test_startup": 539.0, "test_runner/performance/test_startup.py::test_startup": 890.114,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0, "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0, "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0, "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0, "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0, "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0, "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0, "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0, "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0, "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
"test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0, "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
"test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0, "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
"test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
} }
@@ -130,7 +118,7 @@ def main(args: argparse.Namespace):
res = FALLBACK_DURATION res = FALLBACK_DURATION
for row in rows: for row in rows:
pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}" pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['name']}"
duration = row["percentile_ms"] / 1000 duration = row["percentile_ms"] / 1000
logging.info(f"\t{pytest_name}: {duration}") logging.info(f"\t{pytest_name}: {duration}")
res[pytest_name] = duration res[pytest_name] = duration

View File

@@ -9,28 +9,15 @@ from typing import DefaultDict, Dict
import psycopg2 import psycopg2
import psycopg2.extras import psycopg2.extras
# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
FLAKY_TESTS_QUERY = """ FLAKY_TESTS_QUERY = """
SELECT SELECT
DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test DISTINCT parent_suite, suite, name
FROM FROM results
(
SELECT
reference,
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
FROM
regress_test_results
) data
WHERE WHERE
timestamp > CURRENT_DATE - INTERVAL '%s' day started_at > CURRENT_DATE - INTERVAL '%s' day
AND ( AND (
(status IN ('failed', 'broken') AND reference = 'refs/heads/main') (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
OR retries_status_change::boolean OR flaky
) )
; ;
""" """
@@ -63,12 +50,14 @@ def main(args: argparse.Namespace):
if row["parent_suite"] != "test_runner.regress": if row["parent_suite"] != "test_runner.regress":
continue continue
deparametrized_test = row["deparametrized_test"] if row["name"].endswith("]"):
dash_if_needed = "" if deparametrized_test.endswith("[]") else "-" parametrized_test = row["name"].replace(
parametrized_test = deparametrized_test.replace( "[",
"[", f"[{build_type}-pg{pg_version}-",
f"[{build_type}-pg{pg_version}{dash_if_needed}", )
) else:
parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
res[row["parent_suite"]][row["suite"]][parametrized_test] = True res[row["parent_suite"]][row["suite"]][parametrized_test] = True
logging.info( logging.info(