periodic pagebench on hetzner runners (#11963)

## Problem - The periodic pagebench benchmark had inconsistent results even when run with the same commit hash. The hypothesis is that this was due to running on a dedicated but virtualized EC2 instance with varying CPU frequency. - The dedicated instance type used for the benchmark is quite "old" and we increasingly get `An error occurred (InsufficientInstanceCapacity) when calling the StartInstances operation (reached max retries: 2): Insufficient capacity.` - Periodic pagebench uses a snapshot of pageserver timelines to have the same layer structure in each run and get consistent performance. Re-creating the snapshot was a painful manual process (see https://github.com/neondatabase/cloud/issues/27051 and https://github.com/neondatabase/cloud/issues/27653) ## Summary of changes - Run the periodic pagebench on a custom hetzner GitHub runner with a large nvme disk and the governor set to a defined perf profile - provide a manual dispatch option for the workflow that allows creating a new snapshot - keep the manual dispatch option to specify a commit hash, which is useful for bisecting regressions - always use the newest created snapshot (the S3 bucket uses a date suffix in the S3 key, for example `s3://neon-github-public-dev/performance/pagebench/shared-snapshots-2025-05-17/`) - `--ignore` `test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py` in the regular benchmarks run for each commit - improve the performance of copying the snapshot by using a `cp` subprocess instead of traversing the tree in python ## Example runs with code in this PR: - run which creates new snapshot https://github.com/neondatabase/neon/actions/runs/15083408849/job/42402986376#step:19:55 - run which uses latest snapshot - https://github.com/neondatabase/neon/actions/runs/15084907676/job/42406240745#step:11:65
2025-12-22 21:59:59 +00:00 · 2025-05-23 11:37:19 +02:00
parent 06ce704041
commit 87fc0a0374
5 changed files with 197 additions and 108 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -314,7 +314,8 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
+          # test_pageserver_max_throughput_getpage_at_latest_lsn is run in separate workflow periodic_pagebench.yml because it needs snapshots
+          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
          pg_version: v16
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -1,4 +1,4 @@
-name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
+name: Periodic pagebench performance test on unit-perf hetzner runner

 on:
  schedule:
@@ -8,7 +8,7 @@ on:
    #        │   │ ┌───────────── day of the month (1 - 31)
    #        │   │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    #        │   │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron: '0 */3 * * *' # Runs every 3 hours
+    - cron: '0 */4 * * *' # Runs every 4 hours
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
@@ -16,6 +16,11 @@ on:
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''
+      recreate_snapshots:
+        type: boolean
+        description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
+        required: false
+        default: false

 defaults:
  run:
@@ -29,13 +34,13 @@ permissions:
  contents: read

 jobs:
-  trigger_bench_on_ec2_machine_in_eu_central_1:
+  run_periodic_pagebench_test:
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, small ]
+    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
@@ -44,10 +49,13 @@ jobs:
      options: --init
    timeout-minutes: 360  # Set the timeout to 6 hours
    env:
-      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
-      AWS_DEFAULT_REGION : "eu-central-1"
-      AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
+      DEFAULT_PG_VERSION: 16
+      BUILD_TYPE: release
+      RUST_BACKTRACE: 1
+      # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
+      S3_BUCKET: neon-github-public-dev
+      PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
    steps:
    # we don't need the neon source code because we run everything remotely
    # however we still need the local github actions to run the allure step below
@@ -56,99 +64,194 @@ jobs:
      with:
        egress-policy: audit

-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
+      id: set-env
+      shell: bash -euxo pipefail {0}
+      run: |
+        {
+          echo "NEON_DIR=${RUNNER_TEMP}/neon"
+          echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
+          echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
+          echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
+          echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
+          echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
+          echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
+          echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
+          echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
+        } >> "$GITHUB_ENV"

-    - name: Show my own (github runner) external IP address - usefull for IP allowlisting
-      run: curl https://ifconfig.me
+        echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"

-    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
-      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+    - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
      with:
        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
-        role-duration-seconds: 3600
-
-    - name: Start EC2 instance and wait for the instance to boot up
-      run: |
-        aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
-        sleep 60 # sleep some time to allow cloudinit and our API server to start up
-
-    - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
-      run: |
-        public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
-        echo "Public IP of the EC2 instance: $public_ip"
-        echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
-
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built)
    - name: Determine commit hash
+      id: commit_hash  # the checkout step reads steps.commit_hash.outputs.commit_hash
+      shell: bash -euxo pipefail {0}
      env:
        INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
      run: |
-        if [ -z "$INPUT_COMMIT_HASH" ]; then
-          echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+        if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
+          COMMIT_HASH=$(curl -sSf https://api.github.com/repos/neondatabase/neon/commits/main | jq -er '.sha')  # -f/-e: fail the step on an HTTP error or a null sha instead of exporting "null"
+          echo "COMMIT_HASH=$COMMIT_HASH" >> "$GITHUB_ENV"
+          echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
          echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
        else
-          echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          COMMIT_HASH="${INPUT_COMMIT_HASH}"  # manually dispatched hash, used as given
+          echo "COMMIT_HASH=$COMMIT_HASH" >> "$GITHUB_ENV"
+          echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
          echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
        fi
+    - name: Checkout the neon repository at given commit hash
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        ref: ${{ steps.commit_hash.outputs.commit_hash }}

-    - name: Start Bench with run_id
+    # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
+    # example artifact
+    # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
+    - name: Determine artifact S3_KEY for given commit hash and download and extract artifact
+      id: artifact_prefix
+      shell: bash -euxo pipefail {0}
+      env:
+        ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
+        COMMIT_HASH: ${{ env.COMMIT_HASH }}
+        COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H 'Content-Type: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
+        attempt=0
+        max_attempts=24 # 5 minutes * 24 = 2 hours

-    - name: Poll Test Status
-      id: poll_step
-      run: |
-        status=""
-        while [[ "$status" != "failure" && "$status" != "success" ]]; do
-          response=$(curl -k -X 'GET' \
-          "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
-          -H 'accept: application/json' \
-          -H "Authorization: Bearer $API_KEY")
-          echo "Response: $response"
-          set +x
-          status=$(echo $response | jq -r '.status')
-          echo "Test status: $status"
-          if [[ "$status" == "failure" ]]; then
-            echo "Test failed"
-            exit 1 # Fail the job step if status is failure
-          elif [[ "$status" == "success" || "$status" == "null" ]]; then
+        while [[ $attempt -lt $max_attempts ]]; do
+          # the following command will fail until the artifacts are available ...
+          S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
+            | jq -r '.Contents[]?.Key' \
+            | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
+            | sort --version-sort \
+            | tail -1) || true # ... thus ignore errors from the command
+          if [[ -n "${S3_KEY}" ]]; then
+            echo "Artifact found: $S3_KEY"
+            echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
            break
-          elif [[ "$status" == "too_many_runs" ]]; then
-            echo "Too many runs already running"
-            echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
-            exit 1
          fi
-
-          sleep 60 # Poll every 60 seconds
+          
+          # Increment attempt counter and sleep for 5 minutes
+          attempt=$((attempt + 1))
+          echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
+          sleep 300 # Sleep for 5 minutes
        done

-    - name: Retrieve Test Logs
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        curl -k -X 'GET' \
-        "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
-        -H 'accept: application/gzip' \
-        -H "Authorization: Bearer $API_KEY" \
-        --output "test_log_${GITHUB_RUN_ID}.gz"
+        if [[ -z "${S3_KEY}" ]]; then
+          echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours" >&2
+          exit 1  # fail this step; later steps cannot work without the extracted artifact
+        fi
+        mkdir -p "$(dirname "${ARCHIVE}")"
+        time aws s3 cp --only-show-errors "s3://${S3_BUCKET}/${S3_KEY}" "${ARCHIVE}"
+        mkdir -p "${NEON_DIR}"
+        time tar -xf "${ARCHIVE}" -C "${NEON_DIR}"
+        rm -f "${ARCHIVE}"  # the archive is not needed once it has been extracted

-    - name: Unzip Test Log and Print it into this job's log
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
+    - name: Download snapshots from S3
+      if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
+      id: download_snapshots
+      shell: bash -euxo pipefail {0}
      run: |
-        gzip -d "test_log_${GITHUB_RUN_ID}.gz"
-        cat "test_log_${GITHUB_RUN_ID}"
+        # Download the newest snapshot from S3 (keys carry a YYYY-MM-DD suffix, so the last sorted key is the latest) and unpack it into $TEST_OUTPUT
+        mkdir -p "${TEST_OUTPUT}"
+        mkdir -p "${BACKUP_DIR}"
+        cd "${BACKUP_DIR}"
+        mkdir -p parts
+        cd parts
+        PART=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix performance/pagebench/ \
+          | jq -r '.Contents[]?.Key' \
+          | { grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' || true; } \
+          | sort \
+          | tail -1)  # grep's no-match status is tolerated so that the empty check below reports the problem
+        echo "Latest PART: $PART"
+        if [[ -z "$PART" ]]; then
+          echo "ERROR: No matching S3 key found" >&2
+          exit 1
+        fi
+        S3_KEY=$(dirname "$PART")  # strip the part file name to get the snapshot prefix
+        time aws s3 cp --only-show-errors --recursive "s3://${S3_BUCKET}/${S3_KEY}/" .
+        cd "${TEST_OUTPUT}"
+        time cat "${BACKUP_DIR}"/parts/* | zstdcat | tar --extract --preserve-permissions
+        rm -rf "${BACKUP_DIR}"
+
+    - name: Cache poetry deps
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -euxo pipefail {0}
+      run: ./scripts/pysync
+
+    # we need high number of open files for pagebench
+    - name: show ulimits
+      shell: bash -euxo pipefail {0}
+      run: |
+        ulimit -a
+
+    - name: Run pagebench testcase
+      shell: bash -euxo pipefail {0}
+      env:
+        CI: false  # need to override this env variable set by github to enforce using snapshots
+      run: |
+        export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
+        # report the commit hash of the neon repository in the revision of the test results
+        export GITHUB_SHA=${COMMIT_HASH}
+        rm -rf ${PERF_REPORT_DIR}
+        rm -rf ${ALLURE_RESULTS_DIR}
+        mkdir -p ${PERF_REPORT_DIR}
+        mkdir -p ${ALLURE_RESULTS_DIR}
+        PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
+        EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
+        # run only two selected tests
+        # environment set by parent:
+        # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
+        ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
+        ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}
+
+    - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results
+      shell: bash -euxo pipefail {0}
+      run: |
+        export REPORT_FROM="$PERF_REPORT_DIR"
+        export GITHUB_SHA=${COMMIT_HASH}
+        time ./scripts/generate_and_push_perf_report.sh
+
+    - name: Upload test results
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-store
+      with:
+        report-dir:  ${{ steps.set-env.outputs.allure_results_dir }}
+        unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
+        aws-oidc-role-arn:  ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Allure report
+      id: create-allure-report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

+    - name: Upload snapshots
+      if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
+      id: upload_snapshots
+      shell: bash -euxo pipefail {0}
+      run: |
+        mkdir -p $BACKUP_DIR
+        cd $TEST_OUTPUT
+        tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
+        cd $BACKUP_DIR
+        mkdir parts
+        split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
+        SNAPSHOT_DATE=$(date +%F)  # YYYY-MM-DD
+        cd parts
+        time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/
+
    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
@@ -157,26 +260,22 @@ jobs:
        slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
+        
    - name: Cleanup Test Resources
      if: always()
+      shell: bash -euxo pipefail {0}
+      env:
+        ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d ''
+        # Cleanup the test resources
+        if [[ -d "${BACKUP_DIR}" ]]; then
+          rm -rf ${BACKUP_DIR}
+        fi
+        if [[ -d "${TEST_OUTPUT}" ]]; then
+          rm -rf ${TEST_OUTPUT}
+        fi
+        if [[ -d "${NEON_DIR}" ]]; then
+          rm -rf ${NEON_DIR}
+        fi
+        rm -rf $(dirname $ARCHIVE)

-    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
-        role-duration-seconds: 3600
-
-    - name: Stop EC2 instance and wait for the instance to be stopped
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID