mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
## Problem We want to keep the `#on-call-staging-stream` channel close to the prod one and redirect notifications from failing benchmarks to another channel for investigation. ## Summary of changes - Send notifications regarding failures in the `benchmarking` job to `#on-call-qa-staging-stream` - Send notifications regarding failures in the `periodic_pagebench` job to `#on-call-qa-staging-stream`
156 lines
6.2 KiB
YAML
156 lines
6.2 KiB
YAML
# Daily pagebench benchmark that is driven from this workflow but executed on a remote EC2 machine.
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region

on:
  schedule:
    # The cron expression contains `*`, a YAML indicator character, so it must stay quoted.
    #        ┌───────────── minute (0 - 59)
    #        │ ┌───────────── hour (0 - 23)
    #        │ │  ┌───────────── day of the month (1 - 31)
    #        │ │  │ ┌───────────── month (1 - 12 or JAN-DEC)
    #        │ │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: "0 18 * * *" # every day at 18:00 UTC
  # Manual trigger; optionally pins the commit that gets benchmarked.
  workflow_dispatch:
    inputs:
      commit_hash:
        description: "The long neon repo commit hash for the system under test (pageserver) to be tested."
        type: string
        required: false
        default: ""
|
|
|
|
# Every `run:` step uses bash in strict mode: exit on error (-e), error on
# unset variables (-u), and fail a pipeline when any of its stages fails.
defaults:
  run:
    shell: bash -euo pipefail {0}

# All runs of this workflow share a single concurrency group; starting a new
# run does not cancel one that is already in progress.
concurrency:
  group: "${{ github.workflow }}"
  cancel-in-progress: false
|
|
|
|
jobs:
  # The runner only orchestrates: it starts the EC2 instance, triggers the test
  # through the machine's HTTPS API, polls for the result and collects the logs.
  trigger_bench_on_ec2_machine_in_eu_central_1:
    runs-on:
      - self-hosted
      - small
    timeout-minutes: 360 # 6 hours
    container:
      image: neondatabase/build-tools:pinned
      options: --init
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
    env:
      # Bearer token sent with every request to the API server on the EC2 machine.
      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
      # Credentials, region and target instance for the `aws ec2` calls in the steps.
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
      AWS_DEFAULT_REGION: "eu-central-1"
      AWS_INSTANCE_ID: "i-02a59a3bf86bc7e74"
    steps:
      # Everything runs remotely, so the neon sources themselves are not needed here;
      # the checkout only provides the local GitHub actions used by the Allure step.
      - uses: actions/checkout@v4
|
|
|
|
# Prints the runner's public IP address into the job log.
- name: Show my own (github runner) external IP address - usefull for IP allowlisting
  run: |
    curl https://ifconfig.me
|
|
|
|
- name: Start EC2 instance and wait for the instance to boot up
  run: |
    # Request the start, then block until EC2 reports the instance as running.
    aws ec2 start-instances --instance-ids "${AWS_INSTANCE_ID}"
    aws ec2 wait instance-running --instance-ids "${AWS_INSTANCE_ID}"
    # "running" is not "ready": give cloud-init and our API server time to start up.
    sleep 60
|
|
|
|
- name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
  run: |
    public_ip=$(aws ec2 describe-instances --instance-ids "$AWS_INSTANCE_ID" --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
    echo "Public IP of the EC2 instance: $public_ip"
    # Fail here with a clear message instead of exporting a URL without a host.
    # The AWS CLI prints "None" in text output when the queried attribute is null.
    if [[ -z "$public_ip" || "$public_ip" == "None" ]]; then
      echo "EC2 instance $AWS_INSTANCE_ID has no public IP address" >&2
      exit 1
    fi
    # Exported through GITHUB_ENV so the later steps can reach the API on port 8443.
    echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> "$GITHUB_ENV"
|
|
|
|
# Chooses the commit to benchmark and exports it as COMMIT_HASH for the later steps.
- name: Determine commit hash
  env:
    # Passed through the environment rather than interpolated into the script text.
    INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
  run: |
    if [ -z "$INPUT_COMMIT_HASH" ]; then
      # No commit supplied (scheduled run or empty input): use the tip of main.
      # The lookup is assigned to a variable so that `set -e` / `pipefail` abort
      # the step when it fails. A command substitution nested inside an `echo`
      # argument would have its exit status ignored. `--fail` turns HTTP errors
      # (e.g. rate limiting) into a non-zero exit, and `jq -e` rejects a
      # response whose `.sha` is missing or null.
      commit_hash=$(curl --silent --show-error --fail https://api.github.com/repos/neondatabase/neon/commits/main | jq -er '.sha')
    else
      commit_hash="$INPUT_COMMIT_HASH"
    fi
    echo "COMMIT_HASH=${commit_hash}" >> "$GITHUB_ENV"
|
|
|
|
# POSTs to the machine's `start_test` endpoint, keyed by this workflow run id,
# with the commit to test in the JSON body.
- name: Start Bench with run_id
  run: |
    # --insecure (-k) skips TLS certificate verification for the machine's endpoint.
    # NOTE(review): without --fail an HTTP error response does not fail this step.
    curl --insecure --request 'POST' \
      "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
      --header 'accept: application/json' \
      --header 'Content-Type: application/json' \
      --header "Authorization: Bearer $API_KEY" \
      --data "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
|
|
|
|
- name: Poll Test Status
  id: poll_step
  run: |
    # Query the `test_status` endpoint once a minute until a terminal status appears.
    #   failure       -> fail the step
    #   success, null -> stop polling, the step succeeds
    #   too_many_runs -> set the `too_many_runs` step output and fail the step
    #   anything else -> keep polling
    # NOTE(review): "null" is what jq prints when the response has no `.status`.
    # Ending the poll successfully in that case is existing behaviour; confirm
    # that it is intended.
    status=""
    while [[ "$status" != "failure" && "$status" != "success" ]]; do
      response=$(curl -k -X 'GET' \
        "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
        -H 'accept: application/json' \
        -H "Authorization: Bearer $API_KEY")
      echo "Response: $response"
      # Feed the JSON to jq verbatim; an unquoted expansion would be subject to
      # word splitting and pathname expansion first.
      status=$(jq -r '.status' <<< "$response")
      echo "Test status: $status"
      case "$status" in
        failure)
          echo "Test failed"
          exit 1 # Fail the job step if status is failure
          ;;
        success | null)
          break
          ;;
        too_many_runs)
          echo "Too many runs already running"
          # Later steps read this output to skip log retrieval and the EC2 shutdown.
          echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
          exit 1
          ;;
      esac

      sleep 60 # Poll every 60 seconds
    done
|
|
|
|
# Runs even after earlier failures, except when the poll step reported too_many_runs.
- name: Retrieve Test Logs
  if: always() && steps.poll_step.outputs.too_many_runs != 'true'
  run: |
    # Download the gzip-compressed log for this run id into the current directory.
    curl --insecure --request 'GET' \
      "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
      --header 'accept: application/gzip' \
      --header "Authorization: Bearer $API_KEY" \
      --output "test_log_${GITHUB_RUN_ID}.gz"
|
|
|
|
# Same run condition as the log download: always, unless too_many_runs was reported.
- name: Unzip Test Log and Print it into this job's log
  if: always() && steps.poll_step.outputs.too_many_runs != 'true'
  run: |
    log_file="test_log_${GITHUB_RUN_ID}"
    # `gzip -d` replaces <file>.gz with the decompressed <file>.
    gzip -d "${log_file}.gz"
    cat "${log_file}"
|
|
|
|
# Runs unless the workflow was cancelled, so failed runs get a report as well.
- name: Create Allure report
  if: ${{ !cancelled() }}
  uses: ./.github/actions/allure-report-generate
  env:
    # Step-level AWS credentials; these apply to this step only.
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
|
|
|
# Notifies Slack only when a scheduled run has failed; manually dispatched runs
# carry no `github.event.schedule` and therefore never post.
- name: Post to a Slack channel
  if: ${{ github.event.schedule && failure() }}
  uses: slackapi/slack-github-action@v1
  env:
    SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
  with:
    channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
    # Double-quoted on purpose: the `\n` escape must become a real line break.
    slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
|
|
|
# Always runs, whatever the outcome of the earlier steps: POSTs an empty body
# to the machine's `cleanup_test` endpoint for this run id.
- name: Cleanup Test Resources
  if: always()
  run: |
    curl --insecure --request 'POST' \
      "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
      --header 'accept: application/json' \
      --header "Authorization: Bearer $API_KEY" \
      --data ''
|
|
|
|
# Runs regardless of earlier failures, but is skipped when the poll step
# reported too_many_runs. Presumably this avoids shutting down a machine that
# other runs are still using; confirm.
- name: Stop EC2 instance and wait for the instance to be stopped
  if: always() && steps.poll_step.outputs.too_many_runs != 'true'
  run: |
    # Request the stop, then block until EC2 reports the instance as stopped.
    aws ec2 stop-instances --instance-ids "${AWS_INSTANCE_ID}"
    aws ec2 wait instance-stopped --instance-ids "${AWS_INSTANCE_ID}"
|