name: Periodic pagebench performance test on unit-perf-aws-arm runners

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '0 */4 * * *' # Runs every 4 hours
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
        type: string
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''
      recreate_snapshots:
        type: boolean
        description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
        required: false
        default: false

defaults:
  run:
    shell: bash -euo pipefail {0}

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  run_periodic_pagebench_test:
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
      pull-requests: write
    runs-on: [ self-hosted, unit-perf-aws-arm ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      options: --init
    timeout-minutes: 360 # Set the timeout to 6 hours
    env:
      RUN_ID: ${{ github.run_id }}
      DEFAULT_PG_VERSION: 16
      BUILD_TYPE: release
      RUST_BACKTRACE: 1
      # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
      S3_BUCKET: neon-github-public-dev
      PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
    steps:
      # we don't need the neon source code because we run everything remotely,
      # but we still need the local GitHub actions to run the Allure steps below
      - name: Harden the runner (Audit all outbound calls)
        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
        with:
          egress-policy: audit

      - name: Set up the environment which depends on $RUNNER_TEMP on the NVMe drive
        id: set-env
        shell: bash -euxo pipefail {0}
        run: |
          {
            echo "NEON_DIR=${RUNNER_TEMP}/neon"
            echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
            echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
            echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
            echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
            echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
            echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
            echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
            echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
          } >> "$GITHUB_ENV"
          echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"

      - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: eu-central-1
          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          role-duration-seconds: 18000 # max 5 hours (needed in case the commit hash is still being built)

      - name: Determine commit hash
        id: commit_hash
        shell: bash -euxo pipefail {0}
        env:
          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
        run: |
          if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
            COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
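            # COMMIT_HASH_TYPE feeds into PLATFORM in the "Run pagebench testcase" step:
            # "latest" marks scheduled runs against the tip of main, "manual" marks
            # manually requested commits.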
          else
            COMMIT_HASH="${INPUT_COMMIT_HASH}"
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
          fi

      - name: Checkout the neon repository at given commit hash
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ steps.commit_hash.outputs.commit_hash }}

      # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
      # example artifact (S3 key and local destination):
      # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
      - name: Determine artifact S3_KEY for given commit hash, then download and extract the artifact
        id: artifact_prefix
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
          COMMIT_HASH: ${{ env.COMMIT_HASH }}
          COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
        run: |
          attempt=0
          max_attempts=24 # 5 minutes * 24 = 2 hours
          while [[ $attempt -lt $max_attempts ]]; do
            # the following command will fail until the artifacts are available ...
            S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
              | jq -r '.Contents[]?.Key' \
              | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
              | sort --version-sort \
              | tail -1) || true # ... thus ignore errors from the command
            if [[ -n "${S3_KEY}" ]]; then
              echo "Artifact found: $S3_KEY"
              echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
              break
            fi
            # Increment attempt counter and sleep for 5 minutes
            attempt=$((attempt + 1))
            echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
            sleep 300
          done
          if [[ -z "${S3_KEY}" ]]; then
            echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours"
            exit 1
          else
            mkdir -p $(dirname $ARCHIVE)
            time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE}
            mkdir -p ${NEON_DIR}
            time tar -xf ${ARCHIVE} -C ${NEON_DIR}
            rm -f ${ARCHIVE}
          fi

      - name: Download snapshots from S3
        if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
        id: download_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          # Download the snapshots from S3
          mkdir -p ${TEST_OUTPUT}
          mkdir -p $BACKUP_DIR
          cd $BACKUP_DIR
          mkdir parts
          cd parts
          PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \
            | jq -r '.Contents[]?.Key' \
            | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \
            | sort \
            | tail -1)
          echo "Latest PART: $PART"
          if [[ -z "$PART" ]]; then
            echo "ERROR: No matching S3 key found" >&2
            exit 1
          fi
          S3_KEY=$(dirname $PART)
          time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ .
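          # The snapshot set is stored as 1 GiB parts of a single zstd-compressed tar
          # (see the "Upload snapshots" step below), so reassemble and unpack it next.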
          cd $TEST_OUTPUT
          time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions
          rm -rf ${BACKUP_DIR}

      - name: Cache poetry deps
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        shell: bash -euxo pipefail {0}
        run: ./scripts/pysync

      # we need a high number of open files for pagebench
      - name: Show ulimits
        shell: bash -euxo pipefail {0}
        run: |
          ulimit -a

      - name: Run pagebench testcase
        shell: bash -euxo pipefail {0}
        env:
          CI: false # need to override this env variable set by GitHub to enforce using snapshots
        run: |
          export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
          # report the commit hash of the neon repository in the revision of the test results
          export GITHUB_SHA=${COMMIT_HASH}
          rm -rf ${PERF_REPORT_DIR}
          rm -rf ${ALLURE_RESULTS_DIR}
          mkdir -p ${PERF_REPORT_DIR}
          mkdir -p ${ALLURE_RESULTS_DIR}
          PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
          EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
          # run only two selected tests
          # environment set by parent:
          # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}

      - name: Upload performance metrics to the Neon performance database (used by Grafana dashboards to display the results)
        shell: bash -euxo pipefail {0}
        run: |
          export REPORT_FROM="$PERF_REPORT_DIR"
          export GITHUB_SHA=${COMMIT_HASH}
          time ./scripts/generate_and_push_perf_report.sh

      - name: Upload test results
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-store
        with:
          report-dir: ${{ steps.set-env.outputs.allure_results_dir }}
          unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create Allure report
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
        with:
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Upload snapshots
        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
        id: upload_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          mkdir -p $BACKUP_DIR
          cd $TEST_OUTPUT
          tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
          cd $BACKUP_DIR
          mkdir parts
          split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
          SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD
          cd parts
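          # Upload the parts under a date-stamped prefix; the "Download snapshots from S3"
          # step sorts on this YYYY-MM-DD suffix to pick the newest snapshot set.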
          time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
        with:
          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

      - name: Cleanup Test Resources
        if: always()
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
        run: |
          # Cleanup the test resources
          if [[ -d "${BACKUP_DIR}" ]]; then
            rm -rf ${BACKUP_DIR}
          fi
          if [[ -d "${TEST_OUTPUT}" ]]; then
            rm -rf ${TEST_OUTPUT}
          fi
          if [[ -d "${NEON_DIR}" ]]; then
            rm -rf ${NEON_DIR}
          fi
          rm -rf $(dirname $ARCHIVE)
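          # Note: the runner is self-hosted and reused across jobs, so test data must be
          # removed explicitly here instead of relying on an ephemeral VM being discarded.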