test_runner: rerun all failed tests (#9917)

## Problem

Currently, we rerun only known flaky tests. This approach was chosen to
reduce the number of test failures that go unnoticed (by forcing people to
take a look at failed tests and rerun the job manually), but it has some
drawbacks:
- In PRs, people tend to push new changes without checking the failed tests
  (that's ok)
- On the main branch, failed jobs are simply restarted without anyone looking
  into the failures (understandable)
- Parametrised tests become flaky one by one, i.e. if `test[1]` is flaky,
  `test[2]` is not marked as flaky automatically (which may or may not
  actually be the case).

I suggest rerunning all failed tests to increase the stability of GitHub
jobs, and using the Grafana dashboard of flaky tests for deeper analysis.

## Summary of changes
- Rerun all failed tests, at most twice each (as illustrated below)
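
For illustration only (this snippet is not part of the change): with the
`pytest-rerunfailures` plugin, passing `--reruns 2` makes pytest retry any
failed test up to two more times, so a test is reported as failed only if
all three attempts fail.

```python
# Hypothetical test, used only to illustrate the `--reruns 2` semantics.
import random


def test_sometimes_fails():
    # Fails roughly half the time. With `pytest --reruns 2` this test gets
    # 1 original run + up to 2 reruns and is reported green as soon as one
    # attempt passes, i.e. it fails the job only if all 3 attempts fail.
    assert random.random() < 0.5
```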
Author: Alexander Bayandin
Date: 2024-11-28 19:02:57 +00:00
Committed by: GitHub
Parent: eb520a14ce
Commit: e04dd3be0b
9 changed files with 46 additions and 247 deletions


@@ -36,8 +36,8 @@ inputs:
description: 'Region name for real s3 tests'
required: false
default: ''
-rerun_flaky:
-description: 'Whether to rerun flaky tests'
+rerun_failed:
+description: 'Whether to rerun failed tests'
required: false
default: 'false'
pg_version:
@@ -108,7 +108,7 @@ runs:
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
-RERUN_FLAKY: ${{ inputs.rerun_flaky }}
+RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
shell: bash -euxo pipefail {0}
run: |
@@ -154,15 +154,8 @@ runs:
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
if [ "${RERUN_FLAKY}" == "true" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
--days 7 \
--output "$TEST_OUTPUT/flaky.json" \
--pg-version "${DEFAULT_PG_VERSION}" \
--build-type "${BUILD_TYPE}"
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
if [ "${RERUN_FAILED}" == "true" ]; then
EXTRA_PARAMS="--reruns 2 $EXTRA_PARAMS"
fi
# We use pytest-split plugin to run benchmarks in parallel on different CI runners

View File

@@ -293,7 +293,7 @@ jobs:
run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests
real_s3_region: eu-central-1
-rerun_flaky: true
+rerun_failed: true
pg_version: ${{ matrix.pg_version }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

poetry.lock (generated)

@@ -2563,18 +2563,18 @@ pytest = "*"
[[package]]
name = "pytest-rerunfailures"
version = "13.0"
version = "15.0"
description = "pytest plugin to re-run tests to eliminate flaky failures"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.9"
files = [
{file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"},
{file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"},
{file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"},
{file = "pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"},
]
[package.dependencies]
packaging = ">=17.1"
pytest = ">=7"
pytest = ">=7.4,<8.2.2 || >8.2.2"
[[package]]
name = "pytest-split"
@@ -3524,4 +3524,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486"
content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9"


@@ -33,7 +33,7 @@ types-psutil = "^5.9.5.12"
types-toml = "^0.10.8.6"
pytest-httpserver = "^1.0.8"
aiohttp = "3.10.11"
pytest-rerunfailures = "^13.0"
pytest-rerunfailures = "^15.0"
types-pytest-lazy-fixture = "^0.6.3.3"
pytest-split = "^0.8.1"
zstandard = "^0.21.0"


@@ -1,147 +0,0 @@
#! /usr/bin/env python3
from __future__ import annotations
import argparse
import json
import logging
import os
from collections import defaultdict
from typing import TYPE_CHECKING
import psycopg2
import psycopg2.extras
import toml
if TYPE_CHECKING:
from typing import Any
FLAKY_TESTS_QUERY = """
SELECT
DISTINCT parent_suite, suite, name
FROM results
WHERE
started_at > CURRENT_DATE - INTERVAL '%s' day
AND (
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
OR flaky
)
;
"""
def main(args: argparse.Namespace):
connstr = args.connstr
interval_days = args.days
output = args.output
build_type = args.build_type
pg_version = args.pg_version
res: defaultdict[str, defaultdict[str, dict[str, bool]]]
res = defaultdict(lambda: defaultdict(dict))
try:
logging.info("connecting to the database...")
with psycopg2.connect(connstr, connect_timeout=30) as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
logging.info("fetching flaky tests...")
cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
rows = cur.fetchall()
except psycopg2.OperationalError as exc:
logging.error("cannot fetch flaky tests from the DB due to an error", exc)
rows = []
# If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring),
# use it to parametrize test name along with build_type and pg_version
#
# See test_runner/fixtures/parametrize.py for details
if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
"",
"tokio-epoll-uring",
):
pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
else:
pageserver_virtual_file_io_engine_parameter = ""
# re-use existing records of flaky tests from before parametrization by compaction_algorithm
def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None:
"""Duplicated from parametrize.py"""
toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
if toml_table is None:
return None
v = toml.loads(toml_table)
assert isinstance(v, dict)
return v
pageserver_default_tenant_config_compaction_algorithm_parameter = ""
if (
explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
) is not None:
pageserver_default_tenant_config_compaction_algorithm_parameter = (
f"-{explicit_default['kind']}"
)
for row in rows:
# We don't want to automatically rerun tests in a performance suite
if row["parent_suite"] != "test_runner.regress":
continue
if row["name"].endswith("]"):
parametrized_test = row["name"].replace(
"[",
f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-",
)
else:
parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]"
res[row["parent_suite"]][row["suite"]][parametrized_test] = True
logging.info(
f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}"
)
logging.info(f"saving results to {output.name}")
json.dump(res, output, indent=2)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days")
parser.add_argument(
"--output",
type=argparse.FileType("w"),
default="flaky.json",
help="path to output json file (default: flaky.json)",
)
parser.add_argument(
"--days",
required=False,
default=10,
type=int,
help="how many days to look back for flaky tests (default: 10)",
)
parser.add_argument(
"--build-type",
required=True,
type=str,
help="for which build type to create list of flaky tests (debug or release)",
)
parser.add_argument(
"--pg-version",
required=True,
type=int,
help="for which Postgres version to create list of flaky tests (14, 15, etc.)",
)
parser.add_argument(
"connstr",
help="connection string to the test results database",
)
args = parser.parse_args()
level = logging.INFO
logging.basicConfig(
format="%(message)s",
level=level,
)
main(args)


@@ -13,5 +13,5 @@ pytest_plugins = (
"fixtures.pg_stats",
"fixtures.compare_fixtures",
"fixtures.slow",
"fixtures.flaky",
"fixtures.reruns",
)


@@ -1,78 +0,0 @@
from __future__ import annotations
import json
from collections.abc import MutableMapping
from pathlib import Path
from typing import TYPE_CHECKING, cast
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from allure_commons.types import LabelType
from allure_pytest.utils import allure_name, allure_suite_labels
from fixtures.log_helper import log
if TYPE_CHECKING:
from collections.abc import MutableMapping
from typing import Any
"""
The plugin reruns flaky tests.
It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py`
Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers
"""
def pytest_addoption(parser: Parser):
parser.addoption(
"--flaky-tests-json",
action="store",
type=Path,
help="Path to json file with flaky tests generated by scripts/flaky_tests.py",
)
def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]):
if not config.getoption("--flaky-tests-json"):
return
# Any error with getting flaky tests aren't critical, so just do not rerun any tests
flaky_json = config.getoption("--flaky-tests-json")
if not flaky_json.exists():
return
content = flaky_json.read_text()
try:
flaky_tests = json.loads(content)
except ValueError:
log.error(f"Can't parse {content} as json")
return
for item in items:
# Use the same logic for constructing test name as Allure does (we store allure-provided data in DB)
# Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100
allure_labels = dict(allure_suite_labels(item))
parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE))
suite = str(allure_labels.get(LabelType.SUITE))
params = item.callspec.params if hasattr(item, "callspec") else {}
name = allure_name(item, params)
if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False):
# Rerun 3 times = 1 original run + 2 reruns
log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
item.add_marker(pytest.mark.flaky(reruns=2))
# pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns),
# we can workaround it by setting `timeout_func_only` to True[1].
# Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2],
# but we still can do it using pytest marker.
#
# - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99
# - [2] https://github.com/pytest-dev/pytest-timeout/issues/142
timeout_marker = item.get_closest_marker("timeout")
if timeout_marker is not None:
kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs)
kwargs["func_only"] = True


@@ -30,7 +30,7 @@ def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | No
test_name = request.node.name
test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}"
-# We rerun flaky tests multiple times, use a separate directory for each run.
+# We rerun failed tests multiple times, use a separate directory for each run.
if (suffix := getattr(request.node, "execution_count", None)) is not None:
test_dir = test_dir.parent / f"{test_dir.name}-{suffix}"


@@ -0,0 +1,31 @@
from __future__ import annotations
from collections.abc import MutableMapping
from typing import TYPE_CHECKING, cast
import pytest
if TYPE_CHECKING:
from collections.abc import MutableMapping
from typing import Any
from _pytest.config import Config
def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]):
# pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns),
# we can workaround it by setting `timeout_func_only` to True[1].
# Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2],
# but we still can do it using pytest marker.
#
# - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99
# - [2] https://github.com/pytest-dev/pytest-timeout/issues/142
if not config.getoption("--reruns"):
return
for item in items:
timeout_marker = item.get_closest_marker("timeout")
if timeout_marker is not None:
kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs)
kwargs["func_only"] = True