From 118e13438df173b98c83bea853e346ebbe00eab3 Mon Sep 17 00:00:00 2001
From: Alexander Lakhin <alexander.lakhin@neon.tech>
Date: Mon, 16 Jun 2025 16:29:39 +0300
Subject: [PATCH] Add "Build and Test Fully" workflow (#11931)

## Problem

We don't test debug builds for v14..v16 in the regular "Build and Test"
runs to perform the testing faster, but it means we can't detect
assertion failures in those versions.
(See https://github.com/neondatabase/neon/issues/11891,
https://github.com/neondatabase/neon/issues/11997)

## Summary of changes
Add a new workflow to test all build types and all versions on all
architectures.
---
 .github/workflows/_build-and-test-locally.yml |  11 +-
 .../workflows/build_and_run_selected_test.yml |   1 +
 .github/workflows/build_and_test_fully.yml    | 151 ++++++++++++++++++
 test_runner/regress/test_compatibility.py     |   9 ++
 4 files changed, 169 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/build_and_test_fully.yml

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 663afa2c8b..4729aea4f6 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -38,6 +38,11 @@ on:
         required: false
         default: 1
         type: number
+      rerun_failed:
+        description: 'rerun failed tests to ignore flaky tests'
+        required: false
+        default: true
+        type: boolean
 
 defaults:
   run:
@@ -379,7 +384,7 @@ jobs:
       - name: Pytest regression tests
         continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
         uses: ./.github/actions/run-python-test-set
-        timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }}
+        timeout-minutes: ${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 75 || 180 }}
         with:
           build_type: ${{ inputs.build-type }}
           test_selection: regress
@@ -387,14 +392,14 @@ jobs:
           run_with_real_s3: true
           real_s3_bucket: neon-github-ci-tests
           real_s3_region: eu-central-1
-          rerun_failed: ${{ inputs.test-run-count == 1 }}
+          rerun_failed: ${{ inputs.rerun_failed }}
           pg_version: ${{ matrix.pg_version }}
           sanitizers: ${{ inputs.sanitizers }}
           aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
           # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
           # Attempt to stop tests gracefully to generate test reports
           # until they are forcibly stopped by the stricter `timeout-minutes` limit.
-          extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
+          extra_params: --session-timeout=${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
                         ${{ inputs.test-selection != '' && format('-k "{0}"', inputs.test-selection) || '' }}
         env:
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
diff --git a/.github/workflows/build_and_run_selected_test.yml b/.github/workflows/build_and_run_selected_test.yml
index 7f1eb991c4..0f53d44a13 100644
--- a/.github/workflows/build_and_run_selected_test.yml
+++ b/.github/workflows/build_and_run_selected_test.yml
@@ -58,6 +58,7 @@ jobs:
       test-cfg: ${{ inputs.pg-versions }}
       test-selection: ${{ inputs.test-selection }}
       test-run-count: ${{ fromJson(inputs.run-count) }}
+      rerun_failed: false
     secrets: inherit
 
   create-test-report:
diff --git a/.github/workflows/build_and_test_fully.yml b/.github/workflows/build_and_test_fully.yml
new file mode 100644
index 0000000000..7d6543ee26
--- /dev/null
+++ b/.github/workflows/build_and_test_fully.yml
@@ -0,0 +1,151 @@
+name: Build and Test Fully
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 3 * * *' # run once a day, timezone is utc
+  workflow_dispatch:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  tag:
+    runs-on: [ self-hosted, small ]
+    container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+
+    steps:
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Get build tag
+        run: |
+          echo run:$GITHUB_RUN_ID
+          echo ref:$GITHUB_REF_NAME
+          echo rev:$(git rev-list --count HEAD)
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
+            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
+          fi
+        shell: bash
+        id: build-tag
+
+  build-build-tools-image:
+    uses: ./.github/workflows/build-build-tools-image.yml
+    secrets: inherit
+
+  build-and-test-locally:
+    needs: [ tag, build-build-tools-image ]
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [ x64, arm64 ]
+        build-type: [ debug, release ]
+    uses: ./.github/workflows/_build-and-test-locally.yml
+    with:
+      arch: ${{ matrix.arch }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      build-tag: ${{ needs.tag.outputs.build-tag }}
+      build-type: ${{ matrix.build-type }}
+      rerun_failed: false
+      test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"},
+                  {"pg_version":"v15", "lfc_state": "with-lfc"},
+                  {"pg_version":"v16", "lfc_state": "with-lfc"},
+                  {"pg_version":"v17", "lfc_state": "with-lfc"},
+                  {"pg_version":"v14", "lfc_state": "without-lfc"},
+                  {"pg_version":"v15", "lfc_state": "without-lfc"},
+                  {"pg_version":"v16", "lfc_state": "without-lfc"},
+                  {"pg_version":"v17", "lfc_state": "withouts-lfc"}]'
+    secrets: inherit
+
+
+  create-test-report:
+    needs: [ build-and-test-locally, build-build-tools-image ]
+    if: ${{ !cancelled() }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
+    outputs:
+      report-url: ${{ steps.create-allure-report.outputs.report-url }}
+
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+      - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        if: ${{ !cancelled() }}
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }
+
+            const coverage = {}
+
+            const script = require("./scripts/comment-test-report.js")
+            await script({
+              github,
+              context,
+              fetch,
+              report,
+              coverage,
+            })
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 31e38144e3..bc9b534095 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -127,6 +127,12 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
     reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set",
 )
 
+skip_old_debug_versions = pytest.mark.skipif(
+    os.getenv("BUILD_TYPE", "debug") == "debug"
+    and os.getenv("DEFAULT_PG_VERSION") in [PgVersion.V14, PgVersion.V15, PgVersion.V16],
+    reason="compatibility snaphots not available for old versions of debug builds",
+)
+
 
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(before="test_forward_compatibility")
@@ -197,6 +203,7 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_
 
 
 @check_ondisk_data_compatibility_if_enabled
+@skip_old_debug_versions
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
 def test_backward_compatibility(
@@ -224,6 +231,7 @@ def test_backward_compatibility(
 
 
 @check_ondisk_data_compatibility_if_enabled
+@skip_old_debug_versions
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
 def test_forward_compatibility(
@@ -593,6 +601,7 @@ def test_historic_storage_formats(
 
 
 @check_ondisk_data_compatibility_if_enabled
+@skip_old_debug_versions
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.parametrize(
     **fixtures.utils.allpairs_versions(),