## Problem
We want to move some benchmarks from Hetzner runners to AWS Graviton runners.
## Summary of changes
- Adjust the runner labels for some workflows.
- Adjust the pagebench number of clients to match the latency knee at 8 cores of the new instance type.
- Add `--security-opt seccomp=unconfined` to the `docker run` command to bypass the io_uring EPERM error (a sketch of such an invocation follows below).
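
For illustration, a minimal sketch of the kind of invocation we mean, assuming Docker's default seccomp profile is what denies the io_uring syscalls with EPERM. The image and test path are taken from the workflow below; the mounts and the rest of the command line are hypothetical, not the runner's actual command:

```bash
# Sketch only: mounts and test selection are illustrative.
# Docker's default seccomp profile can deny io_uring syscalls with EPERM;
# seccomp=unconfined disables that filtering for this container.
docker run --rm --init \
  --security-opt seccomp=unconfined \
  -v "$PWD:/workspace" -w /workspace \
  ghcr.io/neondatabase/build-tools:pinned-bookworm \
  ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
```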
## New runners
https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Instances:instanceState=running;search=:github-unit-perf-runner-arm;v=3;$case=tags:true%5C,client:false;$regex=tags:false%5C,client:false;sort=tag:Name
## Important Notes
I added the `run-benchmarks` label to get this tested **before we merge it**; [see this run](https://github.com/neondatabase/neon/actions/runs/15974141360).
I also tested a pagebench run with the new setup from this branch, see
https://github.com/neondatabase/neon/actions/runs/15972523054
- Update: the benchmarking workflow had failures, [see](https://github.com/neondatabase/neon/actions/runs/15974141360/job/45055897591)
- Changed the `docker run` command to avoid the io_uring EPERM error; see the [new run](https://github.com/neondatabase/neon/actions/runs/15997965633/job/45125689920?pr=12393)
Update: the pagebench test run on the new runner [completed successfully](https://github.com/neondatabase/neon/actions/runs/15972523054/job/45046772556).

Update 2025-07-07: the latest runs with instance store ext4 have been successful and resolved the direct I/O issues we had been seeing in some earlier runs. Only one perf test case (shard split) failed, and it had been flaky before, so I think we can merge this now.
## Follow up
If this is merged and works successfully, we must create a separate issue to de-provision the Hetzner unit-perf runners defined
[here](91a41729af/ansible/inventory/hosts_metal (L111)).
The updated workflow file (282 lines, 12 KiB, YAML):
name: Periodic pagebench performance test on unit-perf-aws-arm runners

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '0 */4 * * *' # Runs every 4 hours
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
        type: string
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''
      recreate_snapshots:
        type: boolean
        description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
        required: false
        default: false

defaults:
  run:
    shell: bash -euo pipefail {0}

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  run_periodic_pagebench_test:
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
      pull-requests: write
    runs-on: [ self-hosted, unit-perf-aws-arm ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      options: --init
    timeout-minutes: 360 # Set the timeout to 6 hours
    env:
      RUN_ID: ${{ github.run_id }}
      DEFAULT_PG_VERSION: 16
      BUILD_TYPE: release
      RUST_BACKTRACE: 1
      # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
      S3_BUCKET: neon-github-public-dev
      PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
    steps:
      # we don't need the neon source code because we run everything remotely
      # however we still need the local github actions to run the allure step below
      - name: Harden the runner (Audit all outbound calls)
        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
        with:
          egress-policy: audit

      - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
        id: set-env
        shell: bash -euxo pipefail {0}
        run: |
          {
            echo "NEON_DIR=${RUNNER_TEMP}/neon"
            echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
            echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
            echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
            echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
            echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
            echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
            echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
            echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
          } >> "$GITHUB_ENV"

          echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"

      - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: eu-central-1
          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built)

      - name: Determine commit hash
        id: commit_hash
        shell: bash -euxo pipefail {0}
        env:
          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
        run: |
          if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
            COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
          else
            COMMIT_HASH="${INPUT_COMMIT_HASH}"
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
          fi

      - name: Checkout the neon repository at given commit hash
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ steps.commit_hash.outputs.commit_hash }}

      # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
      # example artifact
      # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
      - name: Determine artifact S3_KEY for given commit hash and download and extract artifact
        id: artifact_prefix
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
          COMMIT_HASH: ${{ env.COMMIT_HASH }}
          COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
        run: |
          attempt=0
          max_attempts=24 # 5 minutes * 24 = 2 hours

          while [[ $attempt -lt $max_attempts ]]; do
            # the following command will fail until the artifacts are available ...
            S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
              | jq -r '.Contents[]?.Key' \
              | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
              | sort --version-sort \
              | tail -1) || true # ... thus ignore errors from the command
            if [[ -n "${S3_KEY}" ]]; then
              echo "Artifact found: $S3_KEY"
              echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
              break
            fi

            # Increment attempt counter and sleep for 5 minutes
            attempt=$((attempt + 1))
            echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
            sleep 300 # Sleep for 5 minutes
          done

          if [[ -z "${S3_KEY}" ]]; then
            echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours"
          else
            mkdir -p $(dirname $ARCHIVE)
            time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE}
            mkdir -p ${NEON_DIR}
            time tar -xf ${ARCHIVE} -C ${NEON_DIR}
            rm -f ${ARCHIVE}
          fi
      - name: Download snapshots from S3
        if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
        id: download_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          # Download the snapshots from S3
          mkdir -p ${TEST_OUTPUT}
          mkdir -p $BACKUP_DIR
          cd $BACKUP_DIR
          mkdir parts
          cd parts
          PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \
            | jq -r '.Contents[]?.Key' \
            | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \
            | sort \
            | tail -1)
          echo "Latest PART: $PART"
          if [[ -z "$PART" ]]; then
            echo "ERROR: No matching S3 key found" >&2
            exit 1
          fi
          S3_KEY=$(dirname $PART)
          time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ .
          cd $TEST_OUTPUT
          time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions
          rm -rf ${BACKUP_DIR}

      - name: Cache poetry deps
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        shell: bash -euxo pipefail {0}
        run: ./scripts/pysync

      # we need high number of open files for pagebench
      - name: show ulimits
        shell: bash -euxo pipefail {0}
        run: |
          ulimit -a

      - name: Run pagebench testcase
        shell: bash -euxo pipefail {0}
        env:
          CI: false # need to override this env variable set by github to enforce using snapshots
        run: |
          export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
          # report the commit hash of the neon repository in the revision of the test results
          export GITHUB_SHA=${COMMIT_HASH}
          rm -rf ${PERF_REPORT_DIR}
          rm -rf ${ALLURE_RESULTS_DIR}
          mkdir -p ${PERF_REPORT_DIR}
          mkdir -p ${ALLURE_RESULTS_DIR}
          PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
          EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
          # run only two selected tests
          # environment set by parent:
          # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
          ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}

      - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results
        shell: bash -euxo pipefail {0}
        run: |
          export REPORT_FROM="$PERF_REPORT_DIR"
          export GITHUB_SHA=${COMMIT_HASH}
          time ./scripts/generate_and_push_perf_report.sh

      - name: Upload test results
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-store
        with:
          report-dir: ${{ steps.set-env.outputs.allure_results_dir }}
          unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create Allure report
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
        with:
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Upload snapshots
        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
        id: upload_snapshots
        shell: bash -euxo pipefail {0}
        run: |
          mkdir -p $BACKUP_DIR
          cd $TEST_OUTPUT
          tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
          cd $BACKUP_DIR
          mkdir parts
          split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
          SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD
          cd parts
          time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
        with:
          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

      - name: Cleanup Test Resources
        if: always()
        shell: bash -euxo pipefail {0}
        env:
          ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
        run: |
          # Cleanup the test resources
          if [[ -d "${BACKUP_DIR}" ]]; then
            rm -rf ${BACKUP_DIR}
          fi
          if [[ -d "${TEST_OUTPUT}" ]]; then
            rm -rf ${TEST_OUTPUT}
          fi
          if [[ -d "${NEON_DIR}" ]]; then
            rm -rf ${NEON_DIR}
          fi
          rm -rf $(dirname $ARCHIVE)
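
As a usage note, manual runs against a specific commit go through the `workflow_dispatch` inputs above. A sketch with the GitHub CLI, referencing the workflow by its `name` (the exact file name is not shown here, so substitute it if you prefer dispatching by file name):

```bash
# Sketch: dispatch the workflow manually for a given commit (GitHub CLI assumed).
# Keep recreate_snapshots=false unless the existing snapshots are incompatible,
# otherwise benchmark results stop being comparable across runs.
gh workflow run "Periodic pagebench performance test on unit-perf-aws-arm runners" \
  --repo neondatabase/neon \
  -f commit_hash=<full-commit-sha> \
  -f recreate_snapshots=false
```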