add pagebench test cases for periodic pagebench on dedicated hardware (#8233)

We want to run some specific pagebench test cases on dedicated hardware to get
reproducible results.

run1: 1 client per tenant => characterize throughput with n tenants.
- 500 tenants
- scale 13 (200 MB database)
- 1 hour duration
- approx. 380 GB of layer snapshot files

run2.singleclient: 1 client per tenant => characterize latencies.
run2.manyclient: N clients per tenant => characterize throughput scalability
within one tenant.
- 1 tenant with 1 client to characterize latencies
- 1 tenant with 64 clients to characterize throughput/scalability; we use 64
  clients because for a high number of connections we typically recommend the
  connection pooler, which defaults to 64 connections
- scale 136 (2048 MB database)
- 20 minutes each
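
For orientation, the tests derive these pgbench scale factors from the target database sizes via the suite's `get_scale_for_db` helper. The rule of thumb in the sketch below (roughly 15 MB of pgbench data per scale-factor unit) is an assumption used only to make the numbers concrete; it is not the helper's actual formula.

# rough illustration only: ~15 MB of pgbench data per scale-factor unit is
# consistent with the figures above (200 MB -> scale 13, 2048 MB -> scale 136)
def approx_scale_for_db(size_mb: int) -> int:
    return size_mb // 15


assert approx_scale_for_db(200) == 13
assert approx_scale_for_db(2048) == 136
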
Author: Peter Bendel
Date: 2024-07-03 18:22:33 +02:00
Committed by: GitHub
Parent: e0891ec8c8
Commit: 392a58bdce
3 changed files with 212 additions and 20 deletions

.github/workflows/periodic_pagebench.yml (new file, 144 lines)

@@ -0,0 +1,144 @@
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '0 18 * * *' # Runs at 6 PM UTC every day
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
        type: string
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''

defaults:
  run:
    shell: bash -euo pipefail {0}

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

jobs:
  trigger_bench_on_ec2_machine_in_eu_central_1:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    timeout-minutes: 360 # Set the timeout to 6 hours
    env:
      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
      AWS_DEFAULT_REGION: "eu-central-1"
      AWS_INSTANCE_ID: "i-02a59a3bf86bc7e74"
    steps:
      - name: Show my own (github runner) external IP address - useful for IP allowlisting
        run: curl https://ifconfig.me

      - name: Start EC2 instance and wait for the instance to boot up
        run: |
          aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
          aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
          sleep 60 # sleep some time to allow cloud-init and our API server to start up

      - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
        run: |
          public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
          echo "Public IP of the EC2 instance: $public_ip"
          echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV

      - name: Determine commit hash
        env:
          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
        run: |
          if [ -z "$INPUT_COMMIT_HASH" ]; then
            echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
          else
            echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
          fi

      - name: Start Bench with run_id
        run: |
          curl -k -X 'POST' \
            "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
            -H 'accept: application/json' \
            -H 'Content-Type: application/json' \
            -H "Authorization: Bearer $API_KEY" \
            -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"

      - name: Poll Test Status
        id: poll_step
        run: |
          status=""
          while [[ "$status" != "failure" && "$status" != "success" ]]; do
            response=$(curl -k -X 'GET' \
              "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
              -H 'accept: application/json' \
              -H "Authorization: Bearer $API_KEY")
            echo "Response: $response"
            set +x
            status=$(echo $response | jq -r '.status')
            echo "Test status: $status"
            if [[ "$status" == "failure" || "$status" == "success" || "$status" == "null" ]]; then
              break
            fi
            if [[ "$status" == "too_many_runs" ]]; then
              echo "Too many runs already running"
              echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
              exit 1
            fi
            sleep 60 # Poll every 60 seconds
          done

      - name: Retrieve Test Logs
        run: |
          curl -k -X 'GET' \
            "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
            -H 'accept: application/gzip' \
            -H "Authorization: Bearer $API_KEY" \
            --output "test_log_${GITHUB_RUN_ID}.gz"

      - name: Unzip Test Log and Print it into this job's log
        run: |
          gzip -d "test_log_${GITHUB_RUN_ID}.gz"
          cat "test_log_${GITHUB_RUN_ID}"

      - name: Create Allure report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

      - name: Cleanup Test Resources
        if: always()
        run: |
          curl -k -X 'POST' \
            "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
            -H 'accept: application/json' \
            -H "Authorization: Bearer $API_KEY" \
            -d ''

      - name: Stop EC2 instance and wait for the instance to be stopped
        if: always() && steps.poll_step.outputs.too_many_runs != 'true'
        run: |
          aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
          aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
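
For reference, the steps above drive a small HTTP API exposed by the EC2 runner. The sketch below restates that protocol as a Python client. It is an illustration only: the four endpoints, the Bearer-token auth, the 60-second polling, and the gzipped log come from the workflow, while the `requests`-based helper itself and disabling certificate verification to mirror `curl -k` are assumptions.

import gzip
import time

import requests  # assumption: any HTTP client works; the workflow itself uses curl


def run_periodic_pagebench(base_url: str, api_key: str, run_id: str, commit_hash: str) -> str:
    """Drive the EC2 runner through one benchmark run: start, poll, fetch log, clean up."""
    headers = {"Authorization": f"Bearer {api_key}", "accept": "application/json"}
    # verify=False mirrors `curl -k` (the runner presents a self-signed certificate)
    requests.post(
        f"{base_url}/start_test/{run_id}",
        headers=headers,
        json={"neonRepoCommitHash": commit_hash},
        verify=False,
    )
    status = ""
    while status not in ("success", "failure", "too_many_runs") and status is not None:
        time.sleep(60)  # the workflow polls every 60 seconds
        resp = requests.get(f"{base_url}/test_status/{run_id}", headers=headers, verify=False)
        status = resp.json().get("status")
    # the log endpoint returns a gzipped text file; print it into the job log
    log = requests.get(f"{base_url}/test_log/{run_id}", headers=headers, verify=False)
    print(gzip.decompress(log.content).decode())
    # clean up unconditionally, mirroring the `if: always()` cleanup step
    requests.post(f"{base_url}/cleanup_test/{run_id}", headers=headers, verify=False)
    return status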


@@ -1,4 +1,5 @@
import json
import os
from pathlib import Path
from typing import Any, Dict, Tuple
@@ -17,30 +18,74 @@ from performance.pageserver.util import (
    setup_pageserver_with_tenants,
)

# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver.
# Originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`,
# so you still see some references to this name in the code.
# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn`
# for some files and metrics.
# For reference, the space usage of the snapshots:
# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
# 137G /instance_store/test_output/shared-snapshots
# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
@pytest.mark.parametrize("duration", [30])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
@pytest.mark.parametrize("n_tenants", [1, 10])
@pytest.mark.timeout(
    10000
)  # TODO: this value is just "a really high number"; have this per instance type
def test_pageserver_max_throughput_getpage_at_latest_lsn(
# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13
@pytest.mark.parametrize("duration", [60 * 60])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
@pytest.mark.parametrize("n_tenants", [500])
@pytest.mark.timeout(10000)
@pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
)
def test_pageserver_characterize_throughput_with_n_tenants(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
    n_tenants: int,
    pgbench_scale: int,
    duration: int,
):
    setup_and_run_pagebench_benchmark(
        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1
    )

# For reference, the space usage of the snapshots:
# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136
@pytest.mark.parametrize("duration", [20 * 60])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
# we use 64 clients because typically for a high number of connections we recommend the connection pooler
# which by default uses 64 connections
@pytest.mark.parametrize("n_clients", [1, 64])
@pytest.mark.parametrize("n_tenants", [1])
@pytest.mark.timeout(2400)
@pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
)
def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
    n_tenants: int,
    pgbench_scale: int,
    duration: int,
    n_clients: int,
):
    setup_and_run_pagebench_benchmark(
        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
    )
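
Because of the `skipif` markers above, both benchmarks are skipped whenever `CI=true`; on the dedicated machine they are selected by name. The sketch below shows one way to invoke them locally; the use of `pytest.main` and the `-k` expression are illustrative and not taken from this commit.

# minimal sketch, assuming the benchmark suite is on the pytest path and CI is unset locally
import os

import pytest

os.environ.pop("CI", None)  # the skipif markers above only check for CI == "true"
pytest.main(["-v", "-k", "test_pageserver_characterize"])  # matches both new test cases
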
def setup_and_run_pagebench_benchmark(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
    n_tenants: int,
    pgbench_scale: int,
    duration: int,
    n_clients: int,
):
    def record(metric, **kwargs):
        zenbenchmark.record(
@@ -55,6 +100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
"n_tenants": (n_tenants, {"unit": ""}),
"pgbench_scale": (pgbench_scale, {"unit": ""}),
"duration": (duration, {"unit": "s"}),
"n_clients": (n_clients, {"unit": ""}),
}
)
@@ -96,7 +142,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
)
run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
run_pagebench_benchmark(env, pg_bin, record, duration, n_clients)
def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
@@ -157,8 +203,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
    return (template_tenant, template_timeline, config)


def run_benchmark_max_throughput_latest_lsn(
    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
def run_pagebench_benchmark(
    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int
):
    """
    Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
@@ -172,6 +218,8 @@ def run_benchmark_max_throughput_latest_lsn(
        ps_http.base_url,
        "--page-service-connstring",
        env.pageserver.connstr(password=None),
        "--num-clients",
        str(n_clients),
        "--runtime",
        f"{duration_secs}s",
        # don't specify the targets explicitly, let pagebench auto-discover them
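
Pulled out of the test harness, the assembled arguments correspond to a standalone pagebench invocation roughly like the sketch below. Only `--page-service-connstring`, `--num-clients`, and `--runtime` are visible in the hunk above; the `get-page-latest-lsn` subcommand name, the `--mgmt-api-endpoint` flag, and the localhost addresses are assumptions.

# rough sketch of an equivalent standalone invocation for run2 with 64 clients;
# subcommand, --mgmt-api-endpoint, and the addresses are assumptions, not from this diff
import subprocess

cmd = [
    "pagebench",                                                  # assumption: binary from the neon build dir
    "get-page-latest-lsn",                                        # assumption: subcommand name
    "--mgmt-api-endpoint", "http://127.0.0.1:9898",               # assumption: pageserver mgmt API address
    "--page-service-connstring", "postgres://127.0.0.1:64000",    # assumption: page service address
    "--num-clients", "64",
    "--runtime", f"{20 * 60}s",
    # no explicit targets: pagebench auto-discovers tenants/timelines, as in the hunk above
]
subprocess.run(cmd, check=True)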


@@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
log.info("wait for all tenants to become active")
wait_until_all_tenants_state(
ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False
)
# ensure all layers are resident for predictiable performance
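
The bumped iteration count gives the activation wait a little headroom over the old one-iteration-per-tenant budget. A small illustrative calculation, assuming (as the call signature suggests) at most `iterations` polls spaced `period` seconds apart:

# illustrative only: activation wait budget for the 500-tenant run (run1 above)
n_tenants, period = 500, 1
old_budget = n_tenants * period          # 500 s before the wait gave up
new_budget = (10 + n_tenants) * period   # 510 s with the extra headroom
print(old_budget, new_budget)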