## Problem

- The periodic pagebench benchmark produced inconsistent results even when run with the same commit hash. The hypothesis is that this was caused by running on a dedicated but virtualized EC2 instance with varying CPU frequency.
- The dedicated instance type used for the benchmark is quite "old", and we increasingly get `An error occurred (InsufficientInstanceCapacity) when calling the StartInstances operation (reached max retries: 2): Insufficient capacity.`
- Periodic pagebench uses a snapshot of pageserver timelines so that every run sees the same layer structure and yields comparable performance numbers. Re-creating the snapshot was a painful manual process (see https://github.com/neondatabase/cloud/issues/27051 and https://github.com/neondatabase/cloud/issues/27653).

## Summary of changes

- Run the periodic pagebench on a custom hetzner GitHub runner with a large nvme disk and the CPU governor set to a defined performance profile.
- Provide a manual dispatch option for the workflow that creates a new snapshot.
- Keep the manual dispatch option to specify a commit hash, which is useful for bisecting regressions.
- Always use the newest snapshot (the S3 key carries a date suffix, e.g. `s3://neon-github-public-dev/performance/pagebench/shared-snapshots-2025-05-17/`).
- `--ignore` `test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py` in the regular benchmark run for each commit.
- Improve snapshot copy performance by invoking `cp` as a subprocess instead of traversing the tree in Python (see the sketch below).

## Example runs with code in this PR

- Run which creates a new snapshot: https://github.com/neondatabase/neon/actions/runs/15083408849/job/42402986376#step:19:55
- Run which uses the latest snapshot: https://github.com/neondatabase/neon/actions/runs/15084907676/job/42406240745#step:11:65
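The last bullet replaces a Python-level tree walk with a single `cp` child process. A minimal sketch of the idea, assuming a helper along these lines (the function name and signature are illustrative, not the PR's actual code):

```python
import subprocess
from pathlib import Path


def copy_snapshot(src: Path, dst: Path) -> None:
    """Copy a snapshot directory tree with a single `cp` subprocess.

    `cp --archive` copies recursively and preserves permissions and
    timestamps, and the traversal happens in C instead of Python, which
    is noticeably faster for layer-file trees with many large files than
    shutil.copytree.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["cp", "--archive", str(src), str(dst)],
        check=True,  # raise CalledProcessError if cp fails
    )
```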
name: Periodic pagebench performance test on unit-perf hetzner runner

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '0 */4 * * *' # Runs every 4 hours
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
        type: string
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''
      recreate_snapshots:
        type: boolean
        description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
        required: false
        default: false

defaults:
  run:
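    # {0} is the placeholder GitHub Actions replaces with the generated script path;
    # -euo pipefail makes every run step fail fast on errors, unset variables and broken pipes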
    shell: bash -euo pipefail {0}

concurrency:
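  # serialize runs of this workflow: a newly scheduled run queues up instead of
  # cancelling the benchmark that is already in flight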
  group: ${{ github.workflow }}
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  run_periodic_pagebench_test:
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
      pull-requests: write
    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      options: --init
    timeout-minutes: 360 # Set the timeout to 6 hours
    env:
      RUN_ID: ${{ github.run_id }}
      DEFAULT_PG_VERSION: 16
      BUILD_TYPE: release
      RUST_BACKTRACE: 1
      # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
      S3_BUCKET: neon-github-public-dev
      PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
    steps:
      # we don't need the neon source code because we run everything remotely
      # however we still need the local github actions to run the allure step below
      - name: Harden the runner (Audit all outbound calls)
        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
        with:
          egress-policy: audit

      - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
        id: set-env
        shell: bash -euxo pipefail {0}
        run: |
          {
            echo "NEON_DIR=${RUNNER_TEMP}/neon"
            echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
            echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
            echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
            echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
            echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
            echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
            echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
            echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
          } >> "$GITHUB_ENV"

          echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"

      - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: eu-central-1
          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built)

      - name: Determine commit hash
        id: commit_hash
        shell: bash -euxo pipefail {0}
        env:
          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
        run: |
          if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
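            # no commit hash given: benchmark the current tip of main, resolved via the GitHub API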
            COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
          else
            COMMIT_HASH="${INPUT_COMMIT_HASH}"
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
          fi

      - name: Checkout the neon repository at given commit hash
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ steps.commit_hash.outputs.commit_hash }}

      # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
      # example artifact
      # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
      - name: Determine artifact S3_KEY for given commit hash, then download and extract the artifact
        id: artifact_prefix
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
          COMMIT_HASH: ${{ env.COMMIT_HASH }}
          COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
        run: |
          attempt=0
          max_attempts=24 # 5 minutes * 24 = 2 hours

          while [[ $attempt -lt $max_attempts ]]; do
            # the following command will fail until the artifacts are available ...
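            # the S3 key embeds the CI run id, so a version sort puts the most recent upload last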
            S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
              | jq -r '.Contents[]?.Key' \
              | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
              | sort --version-sort \
              | tail -1) || true # ... thus ignore errors from the command
            if [[ -n "${S3_KEY}" ]]; then
              echo "Artifact found: $S3_KEY"
              echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
              break
            fi

            # Increment attempt counter and sleep for 5 minutes
            attempt=$((attempt + 1))
            echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
            sleep 300 # Sleep for 5 minutes
          done
          if [[ -z "${S3_KEY}" ]]; then
            echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours"
            exit 1
          else
            mkdir -p $(dirname $ARCHIVE)
            time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE}
            mkdir -p ${NEON_DIR}
            time tar -xf ${ARCHIVE} -C ${NEON_DIR}
            rm -f ${ARCHIVE}
          fi

      - name: Download snapshots from S3
        if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
        id: download_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          # Download the snapshots from S3
          mkdir -p ${TEST_OUTPUT}
          mkdir -p $BACKUP_DIR
          cd $BACKUP_DIR
          mkdir parts
          cd parts
          PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \
            | jq -r '.Contents[]?.Key' \
            | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \
            | sort \
            | tail -1)
          echo "Latest PART: $PART"
          if [[ -z "$PART" ]]; then
            echo "ERROR: No matching S3 key found" >&2
            exit 1
          fi
          S3_KEY=$(dirname $PART)
          time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ .
          cd $TEST_OUTPUT
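          # the snapshot was uploaded as 1 GiB parts; concatenating them restores the original zstd stream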
          time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions
          rm -rf ${BACKUP_DIR}

      - name: Cache poetry deps
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        shell: bash -euxo pipefail {0}
        run: ./scripts/pysync

      # we need a high number of open files for pagebench
      - name: Show ulimits
        shell: bash -euxo pipefail {0}
        run: |
          ulimit -a

      - name: Run pagebench testcase
        shell: bash -euxo pipefail {0}
        env:
          CI: false # need to override this env variable set by github to enforce using snapshots
        run: |
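          # tag results with the runner profile and the hash type (latest vs manual) so that
          # manual bisect runs can be separated from the scheduled trend in the perf database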
          export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
          # report the commit hash of the neon repository in the revision of the test results
          export GITHUB_SHA=${COMMIT_HASH}
          rm -rf ${PERF_REPORT_DIR}
          rm -rf ${ALLURE_RESULTS_DIR}
          mkdir -p ${PERF_REPORT_DIR}
          mkdir -p ${ALLURE_RESULTS_DIR}
          PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
          EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
          # run only two selected tests
          # environment set by parent:
          # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}

      - name: Upload the performance metrics to the Neon performance database (used by grafana dashboards to display the results)
        shell: bash -euxo pipefail {0}
        run: |
          export REPORT_FROM="$PERF_REPORT_DIR"
          export GITHUB_SHA=${COMMIT_HASH}
          time ./scripts/generate_and_push_perf_report.sh

      - name: Upload test results
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-store
        with:
          report-dir: ${{ steps.set-env.outputs.allure_results_dir }}
          unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create Allure report
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
        with:
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Upload snapshots
        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
        id: upload_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          mkdir -p $BACKUP_DIR
          cd $TEST_OUTPUT
          tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
          cd $BACKUP_DIR
          mkdir parts
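          # upload in 1 GiB parts; the download step reassembles them with `cat parts/*`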
          split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
          SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD
          cd parts
          time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
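        # only scheduled runs page the channel; failures of manual dispatches stay quiet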
        uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
        with:
          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

      - name: Cleanup Test Resources
        if: always()
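        # the hetzner runner is persistent, so free its nvme disk for the next run even if this one failed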
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
        run: |
          # Cleanup the test resources
          if [[ -d "${BACKUP_DIR}" ]]; then
            rm -rf ${BACKUP_DIR}
          fi
          if [[ -d "${TEST_OUTPUT}" ]]; then
            rm -rf ${TEST_OUTPUT}
          fi
          if [[ -d "${NEON_DIR}" ]]; then
            rm -rf ${NEON_DIR}
          fi
          rm -rf $(dirname $ARCHIVE)