test_runner: automatically rerun flaky tests (#3880)

This PR adds a plugin that automatically reruns flaky tests (up to 3 runs in total: the original run plus 2 reruns). Internally, it uses data from the `TEST_RESULT_CONNSTR` database and the `pytest-rerunfailures` plugin. As a first approximation, we consider a test flaky if it has failed on the main branch in the last 10 days. Flaky tests are fetched by the `scripts/flaky_tests.py` script (which can also be used in standalone mode to learn which tests are flaky) and stored in a JSON file, which is then passed to the pytest plugin.
2025-12-22 21:59:59 +00:00 · 2023-04-04 12:21:54 +01:00
parent 846532112c
commit 105b8bb9d3
9 changed files with 195 additions and 16 deletions
--- a/.github/actions/allure-report/action.yml
+++ b/.github/actions/allure-report/action.yml
@@ -76,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.19.0
-        ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
+        ALLURE_VERSION: 2.21.0
+        ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c

    - name: Upload Allure results
      if: ${{ inputs.action == 'store' }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,6 +44,10 @@ inputs:
    description: 'Secret access key'
    required: false
    default: ''
+  rerun_flaky:
+    description: 'Whether to rerun flaky tests'
+    required: false
+    default: 'false'

 runs:
  using: "composite"
@@ -101,6 +105,7 @@ runs:
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
+        RERUN_FLAKY: ${{ inputs.rerun_flaky }}
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
@@ -143,6 +148,13 @@ runs:
          EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
        fi

+        if [ "${RERUN_FLAKY}" == "true" ]; then
+          mkdir -p $TEST_OUTPUT
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
+
+          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
+        fi
+
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -335,6 +335,9 @@ jobs:
          real_s3_region: us-west-2
          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+          rerun_flaky: true
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug'
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]

 [[package]]
 name = "allure-pytest"
-version = "2.10.0"
+version = "2.13.1"
 description = "Allure pytest integration"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"},
-    {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"},
+    {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
+    {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
 ]

 [package.dependencies]
-allure-python-commons = "2.10.0"
+allure-python-commons = "2.13.1"
 pytest = ">=4.5.0"
-six = ">=1.9.0"

 [[package]]
 name = "allure-python-commons"
-version = "2.10.0"
+version = "2.13.1"
 description = "Common module for integrate allure with python-based frameworks"
 category = "main"
 optional = false
-python-versions = ">=3.5"
+python-versions = ">=3.6"
 files = [
-    {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"},
-    {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"},
+    {file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
+    {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
 ]

 [package.dependencies]
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"
-six = ">=1.9.0"

 [[package]]
 name = "async-timeout"
@@ -1932,6 +1930,22 @@ pytest = [
    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
 ]

+[[package]]
+name = "pytest-rerunfailures"
+version = "11.1.2"
+description = "pytest plugin to re-run tests to eliminate flaky failures"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
+    {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
+]
+
+[package.dependencies]
+packaging = ">=17.1"
+pytest = ">=5.3"
+
 [[package]]
 name = "pytest-timeout"
 version = "2.1.0"
@@ -2597,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91"
+content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
 Werkzeug = "^2.2.3"
 pytest-order = "^1.0.1"
-allure-pytest = "^2.10.0"
+allure-pytest = "^2.13.1"
 pytest-asyncio = "^0.19.0"
 toml = "^0.10.2"
 psutil = "^5.9.4"
@@ -34,6 +34,7 @@ types-psutil = "^5.9.5.4"
 types-toml = "^0.10.8"
 pytest-httpserver = "^1.0.6"
 aiohttp = "3.7.4"
+pytest-rerunfailures = "^11.1.2"

 [tool.poetry.group.dev.dependencies]
 black = "^23.1.0"
@@ -69,6 +70,9 @@ strict = true
 module = [
    "asyncpg.*",
    "pg8000.*",
+    "allure.*",
+    "allure_commons.*",
+    "allure_pytest.*",
 ]
 ignore_missing_imports = true

--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -0,0 +1,87 @@
+#! /usr/bin/env python3
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+from typing import DefaultDict, Dict
+
+import psycopg2
+import psycopg2.extras
+
+# We call a test "flaky" if it had a "failed" or "broken" status at least once on the main branch in the last N days (N=10 by default).
+FLAKY_TESTS_QUERY = """
+    SELECT
+        DISTINCT parent_suite, suite, test
+    FROM
+        (
+            SELECT
+                revision,
+                jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
+            FROM
+                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
+        ) data
+    WHERE
+        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        AND status::text IN ('"failed"', '"broken"')
+    ;
+"""
+
+
+def main(args: argparse.Namespace):
+    connstr = args.connstr
+    interval_days = args.days
+    output = args.output
+
+    res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
+    res = defaultdict(lambda: defaultdict(dict))
+
+    logging.info("connecting to the database...")
+    with psycopg2.connect(connstr, connect_timeout=10) as conn:
+        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            logging.info("fetching flaky tests...")
+            cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
+            rows = cur.fetchall()
+
+    for row in rows:
+        logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
+        res[row["parent_suite"]][row["suite"]][row["test"]] = True
+
+    logging.info(f"saving results to {output.name}")
+    json.dump(res, output, indent=2)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days")
+    parser.add_argument(
+        "--output",
+        type=argparse.FileType("w"),
+        default="flaky.json",
+        help="path to output json file (default: flaky.json)",
+    )
+    parser.add_argument(
+        "--days",
+        required=False,
+        default=10,
+        type=int,
+        help="how many days to look back for flaky tests (default: 10)",
+    )
+    parser.add_argument(
+        "connstr",
+        help="connection string to the test results database",
+    )
+    args = parser.parse_args()
+
+    level = logging.INFO
+    logging.basicConfig(
+        format="%(message)s",
+        level=level,
+    )
+
+    main(args)
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -4,4 +4,5 @@ pytest_plugins = (
    "fixtures.pg_stats",
    "fixtures.compare_fixtures",
    "fixtures.slow",
+    "fixtures.flaky",
 )
--- a/test_runner/fixtures/flaky.py
+++ b/test_runner/fixtures/flaky.py
@@ -0,0 +1,58 @@
+import json
+from pathlib import Path
+from typing import List
+
+import pytest
+from _pytest.config import Config
+from _pytest.config.argparsing import Parser
+from allure_commons.types import LabelType
+from allure_pytest.utils import allure_name, allure_suite_labels
+
+from fixtures.log_helper import log
+
+"""
+The plugin reruns flaky tests.
+It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py`
+
+Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers
+"""
+
+
+def pytest_addoption(parser: Parser):
+    parser.addoption(
+        "--flaky-tests-json",
+        action="store",
+        type=Path,
+        help="Path to json file with flaky tests generated by scripts/flaky_tests.py",
+    )
+
+
+def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]):
+    if not config.getoption("--flaky-tests-json"):
+        return
+
+    # Errors while getting flaky tests aren't critical, so in that case just do not rerun any tests
+    flaky_json = config.getoption("--flaky-tests-json")
+    if not flaky_json.exists():
+        return
+
+    content = flaky_json.read_text()
+    try:
+        flaky_tests = json.loads(content)
+    except ValueError:
+        log.error(f"Can't parse {content} as json")
+        return
+
+    for item in items:
+        # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB)
+        # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100
+        allure_labels = dict(allure_suite_labels(item))
+        parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE))
+        suite = str(allure_labels.get(LabelType.SUITE))
+        params = item.callspec.params if hasattr(item, "callspec") else {}
+        name = allure_name(item, params)
+
+        if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False):
+            # Rerun 3 times = 1 original run + 2 reruns
+            log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
+            item.add_marker(pytest.mark.flaky(reruns=2))
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -7,7 +7,7 @@ import time
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Tuple, TypeVar

-import allure  # type: ignore
+import allure
 from psycopg2.extensions import cursor

 from fixtures.log_helper import log