From 08b33adfee2f4ab5d4dcbcee03120b427f53996c Mon Sep 17 00:00:00 2001
From: Peter Bendel
Date: Wed, 3 Jul 2024 18:22:33 +0200
Subject: [PATCH] add pagebench test cases for periodic pagebench on dedicated hardware (#8233)

We want to run some specific pagebench test cases on dedicated hardware to get
reproducible results.

run1: 1 client per tenant => characterize throughput with n tenants.
- 500 tenants
- scale 13 (200 MB database)
- 1 hour duration
- ca. 380 GB layer snapshot files

run2.singleclient: 1 client per tenant => characterize latencies.
run2.manyclient: N clients per tenant => characterize throughput scalability
within one tenant.
- 1 tenant with 1 client for latencies
- 1 tenant with 64 clients (for scalability), because for a high number of
  connections we typically recommend the connection pooler, which by default
  uses 64 connections
- scale 136 (2048 MB database)
- 20 minutes each
---
 .github/workflows/periodic_pagebench.yml      | 144 ++++++++++++++++++
 ...er_max_throughput_getpage_at_latest_lsn.py |  86 ++++++++---
 test_runner/performance/pageserver/util.py    |   2 +-
 3 files changed, 212 insertions(+), 20 deletions(-)
 create mode 100644 .github/workflows/periodic_pagebench.yml

diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml
new file mode 100644
index 0000000000..c0219599a2
--- /dev/null
+++ b/.github/workflows/periodic_pagebench.yml
@@ -0,0 +1,144 @@
+name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #        ┌───────────── minute (0 - 59)
+    #        │ ┌───────────── hour (0 - 23)
+    #        │ │ ┌───────────── day of the month (1 - 31)
+    #        │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #        │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron: '0 18 * * *' # Runs at 6 PM UTC every day
+  workflow_dispatch: # Allows manual triggering of the workflow
+    inputs:
+      commit_hash:
+        type: string
+        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
+        required: false
+        default: ''
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: false
+
+jobs:
+  trigger_bench_on_ec2_machine_in_eu_central_1:
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+    timeout-minutes: 360 # Set the timeout to 6 hours
+    env:
+      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
+      RUN_ID: ${{ github.run_id }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
+      AWS_DEFAULT_REGION: "eu-central-1"
+      AWS_INSTANCE_ID: "i-02a59a3bf86bc7e74"
+    steps:
+      - name: Show my own (github runner) external IP address - useful for IP allowlisting
+        run: curl https://ifconfig.me
+
+      - name: Start EC2 instance and wait for the instance to boot up
+        run: |
+          aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
+          aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
+          sleep 60 # sleep some time to allow cloudinit and our API server to start up
+
+      - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
+        run: |
+          public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
+          echo "Public IP of the EC2 instance: $public_ip"
+          echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
+
+      - name: Determine commit hash
+        env:
+          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
+        run: |
+          if [ -z "$INPUT_COMMIT_HASH" ]; then
+            echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+          else
+            echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          fi
+
+      - name: Start Bench with run_id
+        run: |
+          curl -k -X 'POST' \
+            "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
+            -H 'accept: application/json' \
+            -H 'Content-Type: application/json' \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
+
+      - name: Poll Test Status
+        id: poll_step
+        run: |
+          status=""
+          while [[ "$status" != "failure" && "$status" != "success" ]]; do
+            response=$(curl -k -X 'GET' \
+              "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
+              -H 'accept: application/json' \
+              -H "Authorization: Bearer $API_KEY")
+            echo "Response: $response"
+            set +x
+            status=$(echo $response | jq -r '.status')
+            echo "Test status: $status"
+            if [[ "$status" == "failure" || "$status" == "success" || "$status" == "null" ]]; then
+              break
+            fi
+            if [[ "$status" == "too_many_runs" ]]; then
+              echo "Too many runs already running"
+              echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
+              exit 1
+            fi
+
+            sleep 60 # Poll every 60 seconds
+          done
+
+      - name: Retrieve Test Logs
+        run: |
+          curl -k -X 'GET' \
+            "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
+            -H 'accept: application/gzip' \
+            -H "Authorization: Bearer $API_KEY" \
+            --output "test_log_${GITHUB_RUN_ID}.gz"
+
+      - name: Unzip Test Log and Print it into this job's log
+        run: |
+          gzip -d "test_log_${GITHUB_RUN_ID}.gz"
+          cat "test_log_${GITHUB_RUN_ID}"
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+
+      - name: Post to a Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: "C033QLM5P7D" # dev-staging-stream
+          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+      - name: Cleanup Test Resources
+        if: always()
+        run: |
+          curl -k -X 'POST' \
+            "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
+            -H 'accept: application/json' \
+            -H "Authorization: Bearer $API_KEY" \
+            -d ''
+
+      - name: Stop EC2 instance and wait for the instance to be stopped
+        if: always() && steps.poll_step.outputs.too_many_runs != 'true'
+        run: |
+          aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
+          aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 1d579214b0..a8f48fe675 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 from typing import Any, Dict, Tuple
 
@@ -17,30 +18,74 @@ from performance.pageserver.util import (
     setup_pageserver_with_tenants,
 )
 
+# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver.
+# Originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`,
+# so you still see some references to this name in the code.
+# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn`
+# for some files and metrics.
+
 # For reference, the space usage of the snapshots:
-# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
-# 137G    /instance_store/test_output/shared-snapshots
-# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
-# 1.8G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
-# 1.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
-# 8.5G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
-# 5.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
-# 76G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
-# 46G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
-@pytest.mark.parametrize("duration", [30])
-@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
-@pytest.mark.parametrize("n_tenants", [1, 10])
-@pytest.mark.timeout(
-    10000
-)  # TODO: this value is just "a really high number"; have this per instance type
-def test_pageserver_max_throughput_getpage_at_latest_lsn(
+# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
+# 416G    /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13
+@pytest.mark.parametrize("duration", [60 * 60])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
+@pytest.mark.parametrize("n_tenants", [500])
+@pytest.mark.timeout(10000)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="This test needs a lot of resources and should run on dedicated HW, not in GitHub Actions runners as part of CI",
+)
+def test_pageserver_characterize_throughput_with_n_tenants(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
     n_tenants: int,
     pgbench_scale: int,
     duration: int,
+):
+    setup_and_run_pagebench_benchmark(
+        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1
+    )
+
+
+# For reference, the space usage of the snapshots:
+# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
+# 19G    /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136
+@pytest.mark.parametrize("duration", [20 * 60])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
+# We use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability.
+# We use 64 clients because for a high number of connections we typically recommend the connection pooler,
+# which by default uses 64 connections.
+@pytest.mark.parametrize("n_clients", [1, 64])
+@pytest.mark.parametrize("n_tenants", [1])
+@pytest.mark.timeout(2400)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="This test needs a lot of resources and should run on dedicated HW, not in GitHub Actions runners as part of CI",
+)
+def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+    n_clients: int,
+):
+    setup_and_run_pagebench_benchmark(
+        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
+    )
+
+
+def setup_and_run_pagebench_benchmark(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+    n_clients: int,
 ):
     def record(metric, **kwargs):
         zenbenchmark.record(
@@ -55,6 +100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
             "n_tenants": (n_tenants, {"unit": ""}),
             "pgbench_scale": (pgbench_scale, {"unit": ""}),
             "duration": (duration, {"unit": "s"}),
+            "n_clients": (n_clients, {"unit": ""}),
         }
     )
 
@@ -96,7 +142,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
         r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
     )
 
-    run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
+    run_pagebench_benchmark(env, pg_bin, record, duration, n_clients)
 
 
 def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
@@ -157,8 +203,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
     return (template_tenant, template_timeline, config)
 
 
-def run_benchmark_max_throughput_latest_lsn(
-    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
+def run_pagebench_benchmark(
+    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int
 ):
     """
     Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
@@ -172,6 +218,8 @@ def run_benchmark_max_throughput_latest_lsn(
         ps_http.base_url,
         "--page-service-connstring",
         env.pageserver.connstr(password=None),
+        "--num-clients",
+        str(n_clients),
         "--runtime",
         f"{duration_secs}s",
         # don't specify the targets explicitly, let pagebench auto-discover them
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 92e05663ce..88296a7fbd 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
 
     log.info("wait for all tenants to become active")
     wait_until_all_tenants_state(
-        ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
+        ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False
    )
 
     # ensure all layers are resident for predictiable performance
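
For reference, the pgbench scale values quoted in the commit message (scale 13 for a 200 MB database, scale 136 for a 2048 MB database) line up with pgbench's rough 15 MB of data per scale unit. Below is a minimal sketch of that assumed relationship; the actual `get_scale_for_db` helper in `test_runner` may use a different constant or rounding:

    # Sketch only: assumes ~15 MB of pgbench data per scale unit; the real
    # get_scale_for_db() in test_runner may round differently.
    MB_PER_PGBENCH_SCALE_UNIT = 15

    def approx_scale_for_db(size_mb: int) -> int:
        """Approximate pgbench scale factor for a target database size in MB."""
        return size_mb // MB_PER_PGBENCH_SCALE_UNIT

    assert approx_scale_for_db(200) == 13    # run1: 200 MB database per tenant
    assert approx_scale_for_db(2048) == 136  # run2: 2048 MB database, single tenant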