Switch to querying new tests results DB (#5616)

## Problem We started to store test results in a new format in https://github.com/neondatabase/neon/pull/4549. This PR switches scripts to query this db. (we can completely remove old DB/ingestions scripts in a couple of weeks after the PR merged) ## Summary of changes - `scripts/benchmark_durations.py` query new database - `scripts/flaky_tests.py` query new database
2025-12-22 21:59:59 +00:00 · 2023-10-25 14:25:13 +01:00
parent 8b8be7bed4
commit 4778b6a12e
4 changed files with 87 additions and 106 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -203,6 +203,10 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
          exit 0
        fi
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
        ./scripts/pysync
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -433,7 +433,7 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
      - name: Merge and upload coverage data
@@ -468,7 +468,7 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -15,28 +15,15 @@ The script fetches the durations of benchmarks from the database and stores it i
 BENCHMARKS_DURATION_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, test,
+        DISTINCT parent_suite, suite, name,
-        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
+        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration) as percentile_ms
-    FROM
+    FROM results
        (
            SELECT
                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
                (jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
            FROM
                regress_test_results
            WHERE
                reference = 'refs/heads/main'
        ) data
    WHERE
-        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '%s' day
        AND parent_suite = 'test_runner.performance'
        AND status = 'passed'
    GROUP BY
-        parent_suite, suite, test
+        parent_suite, suite, name
    ;
 """
@@ -44,68 +31,69 @@ BENCHMARKS_DURATION_QUERY = """
 # the total duration varies from 8 to 40 minutes.
 # We use some pre-collected durations as a fallback to have a better distribution.
 FALLBACK_DURATION = {
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
-    "test_runner/performance/test_compaction.py::test_compaction": 77.0,
+    "test_runner/performance/test_compaction.py::test_compaction": 110.222,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
-    "test_runner/performance/test_copy.py::test_copy[neon]": 12.0,
+    "test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
-    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0,
+    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
-    "test_runner/performance/test_layer_map.py::test_layer_map": 44.0,
+    "test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0,
+    "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
-    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
-    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0,
+    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0,
+    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
-    "test_runner/performance/test_startup.py::test_startup_simple": 2.0,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
-    "test_runner/performance/test_startup.py::test_startup": 539.0,
+    "test_runner/performance/test_startup.py::test_startup": 890.114,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0,
+    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
 }
@@ -130,7 +118,7 @@ def main(args: argparse.Namespace):
        res = FALLBACK_DURATION
    for row in rows:
-        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}"
+        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['name']}"
        duration = row["percentile_ms"] / 1000
        logging.info(f"\t{pytest_name}: {duration}")
        res[pytest_name] = duration
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -9,28 +9,15 @@ from typing import DefaultDict, Dict
 import psycopg2
 import psycopg2.extras
 # We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
 FLAKY_TESTS_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
+        DISTINCT parent_suite, suite, name
-    FROM
+    FROM results
        (
            SELECT
                reference,
                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
            FROM
                regress_test_results
        ) data
    WHERE
-        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '%s' day
        AND (
            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
-            OR retries_status_change::boolean
+            OR flaky
        )
    ;
 """
@@ -63,12 +50,14 @@ def main(args: argparse.Namespace):
        if row["parent_suite"] != "test_runner.regress":
            continue
-        deparametrized_test = row["deparametrized_test"]
+        if row["name"].endswith("]"):
-        dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
+            parametrized_test = row["name"].replace(
-        parametrized_test = deparametrized_test.replace(
+                "[",
-            "[",
+                f"[{build_type}-pg{pg_version}-",
-            f"[{build_type}-pg{pg_version}{dash_if_needed}",
+            )
-        )
+        else:
            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
        res[row["parent_suite"]][row["suite"]][parametrized_test] = True
        logging.info(