# neon/.github/workflows/periodic_pagebench.yml

# Starts a dedicated EC2 instance, triggers a test run on it through an HTTPS
# API, polls until the run finishes, prints the test log, generates an Allure
# report and finally stops the instance again.
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region

on:
  # Scheduled run. Cron fields, left to right:
  #   minute (0 - 59), hour (0 - 23), day of the month (1 - 31),
  #   month (1 - 12 or JAN-DEC), day of the week (0 - 6 or SUN-SAT).
  # The expression is quoted because `*` is a special character in YAML.
  schedule:
    - cron: "0 18 * * *"  # 18:00 UTC, every day
  # Manual trigger. When no commit hash is supplied, the "Determine commit hash"
  # step falls back to the current head of main.
  workflow_dispatch:
    inputs:
      commit_hash:
        description: "The long neon repo commit hash for the system under test (pageserver) to be tested."
        type: string
        required: false
        default: ""

# Every `run:` step is executed by bash with the -e, -u and -o pipefail
# options, so failing commands and unset variables abort the step.
defaults:
  run:
    shell: "bash -euo pipefail {0}"

# All runs of this workflow share one concurrency group, and a newly triggered
# run does not cancel a run that is already in progress.
concurrency:
  cancel-in-progress: false
  group: "${{ github.workflow }}"

jobs:
  # Drives the benchmark remotely: the steps of this job only call the AWS CLI
  # and the HTTPS API server on the EC2 instance.
  trigger_bench_on_ec2_machine_in_eu_central_1:
    runs-on: [ self-hosted, small ]
    container:
      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    timeout-minutes: 360  # Set the timeout to 6 hours
    env:
      # Bearer token sent to the API server on the EC2 instance.
      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      # NOTE(review): the steps in this file use GITHUB_RUN_ID rather than
      # RUN_ID; confirm whether anything else still reads RUN_ID.
      RUN_ID: ${{ github.run_id }}
      # Credentials, region and target instance for the `aws ec2` calls.
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
      AWS_DEFAULT_REGION: "eu-central-1"
      AWS_INSTANCE_ID: "i-02a59a3bf86bc7e74"
    steps:
    # The neon source code itself is not needed here because everything runs
    # remotely on the EC2 instance. The checkout only provides the
    # repository-local action (./.github/actions/allure-report-generate) that
    # the "Create Allure report" step uses.
    - uses: actions/checkout@v4

    # Diagnostic only: the printed address helps with IP allowlisting. The
    # lookup is time-bounded and non-fatal so that a slow or unavailable
    # third-party service cannot abort the benchmark run.
    - name: Show my own (github runner) external IP address - usefull for IP allowlisting
      run: |
        curl --fail --silent --show-error --max-time 30 https://ifconfig.me \
          || echo "WARNING: could not determine the external IP address of the runner"
        # Terminate the output line (the original command printed no newline of its own).
        echo

    - name: Start EC2 instance and wait for the instance to boot up
      run: |
        # Request the start, then block until EC2 reports the instance as running.
        aws ec2 start-instances --instance-ids "${AWS_INSTANCE_ID}"
        aws ec2 wait instance-running --instance-ids "${AWS_INSTANCE_ID}"
        # Grace period to allow cloudinit and our API server to start up.
        sleep 60

    # Looks up the public IP address of the instance and exports the base URL
    # of its API server (HTTPS, port 8443) as EC2_MACHINE_URL_US for the
    # following steps.
    - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
      run: |
        public_ip=$(aws ec2 describe-instances --instance-ids "$AWS_INSTANCE_ID" --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
        echo "Public IP of the EC2 instance: $public_ip"
        # A missing address shows up as an empty string or as "None" in text
        # output; stop here instead of exporting an unusable URL.
        if [[ -z "$public_ip" || "$public_ip" == "None" ]]; then
          echo "ERROR: EC2 instance $AWS_INSTANCE_ID has no public IP address" >&2
          exit 1
        fi
        echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> "$GITHUB_ENV"

    # Chooses the commit to test and exports it as COMMIT_HASH: the
    # workflow_dispatch input when one was given, otherwise the current head
    # of the main branch of neondatabase/neon.
    - name: Determine commit hash
      env:
        INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
      run: |
        if [ -z "$INPUT_COMMIT_HASH" ]; then
          # Assign to a variable first: a failure inside `echo "...$(...)"` is
          # ignored by the shell and could export an empty or "null" hash.
          commit_hash=$(curl --fail --silent --show-error https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
        else
          commit_hash="$INPUT_COMMIT_HASH"
        fi
        # jq -r prints "null" when the response has no .sha field.
        if [[ -z "$commit_hash" || "$commit_hash" == "null" ]]; then
          echo "ERROR: could not determine the commit hash to test" >&2
          exit 1
        fi
        echo "COMMIT_HASH=$commit_hash" >> "$GITHUB_ENV"

    # Asks the API server on the EC2 instance to start a test for this
    # workflow run id, passing the selected commit hash as JSON.
    - name: Start Bench with run_id
      run: |
        curl --insecure --request 'POST' \
          --header 'accept: application/json' \
          --header 'Content-Type: application/json' \
          --header "Authorization: Bearer $API_KEY" \
          --data "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" \
          "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}"

    # Polls the test status once a minute until the API server reports a final
    # state. The step fails on "failure" and on "too_many_runs"; in the latter
    # case it also sets the step output `too_many_runs=true`, which later
    # steps check before fetching logs and before stopping the EC2 instance.
    - name: Poll Test Status
      id: poll_step
      run: |
        while true; do
          response=$(curl --insecure --silent --show-error --request 'GET' \
            --header 'accept: application/json' \
            --header "Authorization: Bearer $API_KEY" \
            "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}")
          echo "Response: $response"
          # Pass the response to jq quoted, so that the shell neither
          # word-splits it nor expands glob characters it may contain.
          status=$(jq -r '.status' <<< "$response")
          echo "Test status: $status"
          case "$status" in
            failure)
              echo "Test failed"
              exit 1 # Fail the job step if status is failure
              ;;
            success | null)
              # jq -r prints "null" when the response has no .status field.
              # NOTE(review): that case ends polling without failing the step,
              # exactly as before - confirm this is intended.
              break
              ;;
            too_many_runs)
              echo "Too many runs already running"
              echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
              exit 1
              ;;
          esac

          sleep 60 # Poll every 60 seconds
        done

    # Downloads the gzip-compressed test log of this run into the working
    # directory. Runs even after a failure, but not when the poll step
    # reported too many runs.
    - name: Retrieve Test Logs
      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
      run: |
        curl --insecure --request 'GET' \
          --header 'accept: application/gzip' \
          --header "Authorization: Bearer $API_KEY" \
          --output "test_log_${GITHUB_RUN_ID}.gz" \
          "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}"

    # Decompresses the downloaded log in place and prints it to the job log.
    - name: Unzip Test Log and Print it into this job's log
      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
      run: |
        log_file="test_log_${GITHUB_RUN_ID}"
        gzip --decompress "${log_file}.gz"
        cat "${log_file}"

    # Runs unless the workflow was cancelled. The AWS credentials set here
    # replace the job-level ones for this step only.
    - name: Create Allure report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      env:
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}

    # Sends a notification only for scheduled runs in which an earlier step
    # failed; the message contains the job status and a link to this run.
    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@v1
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
      with:
        slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        channel-id: "C06KHQVQ7U3"  # on-call-qa-staging-stream

    # Always asks the API server to clean up after this run id, regardless of
    # how the earlier steps ended. The request body is empty.
    - name: Cleanup Test Resources
      if: always()
      run: |
        curl --insecure --request 'POST' \
          --header 'accept: application/json' \
          --header "Authorization: Bearer $API_KEY" \
          --data '' \
          "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}"

    # Stops the instance even when earlier steps failed. Skipped when the poll
    # step reported too many runs - presumably because another run is still
    # using the instance (TODO confirm).
    - name: Stop EC2 instance and wait for the instance to be stopped
      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
      run: |
        # Request the stop, then block until EC2 reports the instance as stopped.
        aws ec2 stop-instances --instance-ids "${AWS_INSTANCE_ID}"
        aws ec2 wait instance-stopped --instance-ids "${AWS_INSTANCE_ID}"