## Problem
We want to move some benchmarks from Hetzner runners to AWS Graviton runners.
## Summary of changes
- Adjust the runner labels for some workflows.
- Adjust the pagebench number of clients to match the latency knee at 8 cores of the new instance type.
- Add `--security-opt seccomp=unconfined` to the `docker run` command to bypass the io_uring EPERM error (a sketch of such an invocation follows below).
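
For illustration, a minimal sketch of the kind of invocation we mean, assuming Docker's default seccomp profile is what denies the io_uring syscalls with EPERM. The image and test path are taken from the workflow below; the mounts and the rest of the command line are hypothetical, not the runner's actual command:

```bash
# Sketch only: mounts and test selection are illustrative.
# Docker's default seccomp profile can deny io_uring syscalls with EPERM;
# seccomp=unconfined disables that filtering for this container.
docker run --rm --init \
  --security-opt seccomp=unconfined \
  -v "$PWD:/workspace" -w /workspace \
  ghcr.io/neondatabase/build-tools:pinned-bookworm \
  ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
```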
## New runners
https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Instances:instanceState=running;search=:github-unit-perf-runner-arm;v=3;$case=tags:true%5C,client:false;$regex=tags:false%5C,client:false;sort=tag:Name
## Important Notes
I added the `run-benchmarks` label to get this tested **before we merge it**; [see this run](https://github.com/neondatabase/neon/actions/runs/15974141360).
I also tested a pagebench run with the new setup from this branch, see
https://github.com/neondatabase/neon/actions/runs/15972523054
- Update: the benchmarking workflow had failures, [see](https://github.com/neondatabase/neon/actions/runs/15974141360/job/45055897591)
- Changed the `docker run` command to avoid the io_uring EPERM error; see the [new run](https://github.com/neondatabase/neon/actions/runs/15997965633/job/45125689920?pr=12393)
Update: the pagebench test run on the new runner [completed successfully](https://github.com/neondatabase/neon/actions/runs/15972523054/job/45046772556).

Update 2025-07-07: the latest runs with instance store ext4 have been successful and resolved the direct I/O issues we had been seeing in some earlier runs. Only one perf test case (shard split) failed, and it had been flaky before, so I think we can merge this now.
## Follow up
If this is merged and works successfully, we must create a separate issue to de-provision the Hetzner unit-perf runners defined
[here](91a41729af/ansible/inventory/hosts_metal (L111)).
The updated workflow file (282 lines, 12 KiB, YAML):
name: Periodic pagebench performance test on unit-perf-aws-arm runners

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '0 */4 * * *' # Runs every 4 hours
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
        type: string
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''
      recreate_snapshots:
        type: boolean
        description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
        required: false
        default: false

defaults:
  run:
    shell: bash -euo pipefail {0}

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  run_periodic_pagebench_test:
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
      pull-requests: write
    runs-on: [ self-hosted, unit-perf-aws-arm ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      options: --init
    timeout-minutes: 360 # Set the timeout to 6 hours
    env:
      RUN_ID: ${{ github.run_id }}
      DEFAULT_PG_VERSION: 16
      BUILD_TYPE: release
      RUST_BACKTRACE: 1
      # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
      S3_BUCKET: neon-github-public-dev
      PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
    steps:
      # we don't need the neon source code because we run everything remotely
      # however we still need the local github actions to run the allure step below
      - name: Harden the runner (Audit all outbound calls)
        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
        with:
          egress-policy: audit

      - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
        id: set-env
        shell: bash -euxo pipefail {0}
        run: |
          {
            echo "NEON_DIR=${RUNNER_TEMP}/neon"
            echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
            echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
            echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
            echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
            echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
            echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
            echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
            echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
          } >> "$GITHUB_ENV"

          echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"

      - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: eu-central-1
          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built)

      - name: Determine commit hash
        id: commit_hash
        shell: bash -euxo pipefail {0}
        env:
          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
        run: |
          if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
            COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
          else
            COMMIT_HASH="${INPUT_COMMIT_HASH}"
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
          fi

      - name: Checkout the neon repository at given commit hash
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ steps.commit_hash.outputs.commit_hash }}

      # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
      # example artifact
      # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
      - name: Determine artifact S3_KEY for given commit hash and download and extract artifact
        id: artifact_prefix
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
          COMMIT_HASH: ${{ env.COMMIT_HASH }}
          COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
        run: |
          attempt=0
          max_attempts=24 # 5 minutes * 24 = 2 hours

          while [[ $attempt -lt $max_attempts ]]; do
            # the following command will fail until the artifacts are available ...
            S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
              | jq -r '.Contents[]?.Key' \
              | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
              | sort --version-sort \
              | tail -1) || true # ... thus ignore errors from the command
            if [[ -n "${S3_KEY}" ]]; then
              echo "Artifact found: $S3_KEY"
              echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
              break
            fi

            # Increment attempt counter and sleep for 5 minutes
            attempt=$((attempt + 1))
            echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
            sleep 300 # Sleep for 5 minutes
          done

          if [[ -z "${S3_KEY}" ]]; then
            echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours"
          else
            mkdir -p $(dirname $ARCHIVE)
            time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE}
            mkdir -p ${NEON_DIR}
            time tar -xf ${ARCHIVE} -C ${NEON_DIR}
            rm -f ${ARCHIVE}
          fi
      - name: Download snapshots from S3
        if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
        id: download_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          # Download the snapshots from S3
          mkdir -p ${TEST_OUTPUT}
          mkdir -p $BACKUP_DIR
          cd $BACKUP_DIR
          mkdir parts
          cd parts
          PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \
            | jq -r '.Contents[]?.Key' \
            | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \
            | sort \
            | tail -1)
          echo "Latest PART: $PART"
          if [[ -z "$PART" ]]; then
            echo "ERROR: No matching S3 key found" >&2
            exit 1
          fi
          S3_KEY=$(dirname $PART)
          time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ .
          cd $TEST_OUTPUT
          time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions
          rm -rf ${BACKUP_DIR}

      - name: Cache poetry deps
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        shell: bash -euxo pipefail {0}
        run: ./scripts/pysync

      # we need high number of open files for pagebench
      - name: show ulimits
        shell: bash -euxo pipefail {0}
        run: |
          ulimit -a

      - name: Run pagebench testcase
        shell: bash -euxo pipefail {0}
        env:
          CI: false # need to override this env variable set by github to enforce using snapshots
        run: |
          export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
          # report the commit hash of the neon repository in the revision of the test results
          export GITHUB_SHA=${COMMIT_HASH}
          rm -rf ${PERF_REPORT_DIR}
          rm -rf ${ALLURE_RESULTS_DIR}
          mkdir -p ${PERF_REPORT_DIR}
          mkdir -p ${ALLURE_RESULTS_DIR}
          PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
          EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
          # run only two selected tests
          # environment set by parent:
          # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}

      - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results
        shell: bash -euxo pipefail {0}
        run: |
          export REPORT_FROM="$PERF_REPORT_DIR"
          export GITHUB_SHA=${COMMIT_HASH}
          time ./scripts/generate_and_push_perf_report.sh

      - name: Upload test results
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-store
        with:
          report-dir: ${{ steps.set-env.outputs.allure_results_dir }}
          unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create Allure report
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
        with:
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Upload snapshots
        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
        id: upload_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          mkdir -p $BACKUP_DIR
          cd $TEST_OUTPUT
          tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
          cd $BACKUP_DIR
          mkdir parts
          split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
          SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD
          cd parts
          time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
        with:
          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

      - name: Cleanup Test Resources
        if: always()
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
        run: |
          # Cleanup the test resources
          if [[ -d "${BACKUP_DIR}" ]]; then
            rm -rf ${BACKUP_DIR}
          fi
          if [[ -d "${TEST_OUTPUT}" ]]; then
            rm -rf ${TEST_OUTPUT}
          fi
          if [[ -d "${NEON_DIR}" ]]; then
            rm -rf ${NEON_DIR}
          fi
          rm -rf $(dirname $ARCHIVE)
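
As a usage note, manual runs against a specific commit go through the `workflow_dispatch` inputs above. A sketch with the GitHub CLI, referencing the workflow by its `name` (the exact file name is not shown here, so substitute it if you prefer dispatching by file name):

```bash
# Sketch: dispatch the workflow manually for a given commit (GitHub CLI assumed).
# Keep recreate_snapshots=false unless the existing snapshots are incompatible,
# otherwise benchmark results stop being comparable across runs.
gh workflow run "Periodic pagebench performance test on unit-perf-aws-arm runners" \
  --repo neondatabase/neon \
  -f commit_hash=<full-commit-sha> \
  -f recreate_snapshots=false
```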