add pagebench test cases for periodic pagebench on dedicated hardware (#8233)
we want to run some specific pagebench test cases on dedicated hardware to get reproducible results

run1: 1 client per tenant => characterize throughput with n tenants.
- 500 tenants
- scale 13 (200 MB database)
- 1 hour duration
- ca 380 GB layer snapshot files

run2.singleclient: 1 client per tenant => characterize latencies
run2.manyclient: N clients per tenant => characterize throughput scalability within one tenant.
- 1 tenant with 1 client for latencies
- 1 tenant with 64 clients, because for a high number of connections we typically recommend the connection pooler, which by default uses 64 connections (for scalability)
- scale 136 (2048 MB database)
- 20 minutes each
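As a rough sanity check on those pairings, the scale factors map to database sizes through the test suite's get_scale_for_db helper. A minimal sketch, assuming pgbench's usual ~15 MB per scale unit (the real helper's rounding may differ):

    # Hypothetical re-derivation of the scale factors named in this commit message.
    def get_scale_for_db(size_mb: int) -> int:
        # pgbench data is roughly 15 MB per scale unit (assumption).
        return size_mb // 15

    assert get_scale_for_db(200) == 13    # run1: 200 MB database
    assert get_scale_for_db(2048) == 136  # run2: 2048 MB database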
.github/workflows/periodic_pagebench.yml (vendored, new file, 144 lines)
@@ -0,0 +1,144 @@
+name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    # ┌───────────── minute (0 - 59)
+    # │ ┌───────────── hour (0 - 23)
+    # │ │ ┌───────────── day of the month (1 - 31)
+    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron: '0 18 * * *' # Runs at 6 PM UTC every day
+  workflow_dispatch: # Allows manual triggering of the workflow
+    inputs:
+      commit_hash:
+        type: string
+        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
+        required: false
+        default: ''
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: false
+
+jobs:
+  trigger_bench_on_ec2_machine_in_eu_central_1:
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+    timeout-minutes: 360 # Set the timeout to 6 hours
+    env:
+      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
+      RUN_ID: ${{ github.run_id }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
+      AWS_DEFAULT_REGION: "eu-central-1"
+      AWS_INSTANCE_ID: "i-02a59a3bf86bc7e74"
+    steps:
+      - name: Show my own (github runner) external IP address - useful for IP allowlisting
+        run: curl https://ifconfig.me
+
+      - name: Start EC2 instance and wait for the instance to boot up
+        run: |
+          aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
+          aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
+          sleep 60 # sleep some time to allow cloud-init and our API server to start up
+
+      - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
+        run: |
+          public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
+          echo "Public IP of the EC2 instance: $public_ip"
+          echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
+
+      - name: Determine commit hash
+        env:
+          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
+        run: |
+          if [ -z "$INPUT_COMMIT_HASH" ]; then
+            echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+          else
+            echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          fi
+
+      - name: Start Bench with run_id
+        run: |
+          curl -k -X 'POST' \
+            "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
+            -H 'accept: application/json' \
+            -H 'Content-Type: application/json' \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
+
+      - name: Poll Test Status
+        id: poll_step
+        run: |
+          status=""
+          while [[ "$status" != "failure" && "$status" != "success" ]]; do
+            response=$(curl -k -X 'GET' \
+              "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
+              -H 'accept: application/json' \
+              -H "Authorization: Bearer $API_KEY")
+            echo "Response: $response"
+            set +x
+            status=$(echo $response | jq -r '.status')
+            echo "Test status: $status"
+            if [[ "$status" == "failure" || "$status" == "success" || "$status" == "null" ]]; then
+              break
+            fi
+            if [[ "$status" == "too_many_runs" ]]; then
+              echo "Too many runs already running"
+              echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
+              exit 1
+            fi
+
+            sleep 60 # Poll every 60 seconds
+          done
+
+      - name: Retrieve Test Logs
+        run: |
+          curl -k -X 'GET' \
+            "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
+            -H 'accept: application/gzip' \
+            -H "Authorization: Bearer $API_KEY" \
+            --output "test_log_${GITHUB_RUN_ID}.gz"
+
+      - name: Unzip Test Log and Print it into this job's log
+        run: |
+          gzip -d "test_log_${GITHUB_RUN_ID}.gz"
+          cat "test_log_${GITHUB_RUN_ID}"
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+
+      - name: Post to a Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: "C033QLM5P7D" # dev-staging-stream
+          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+      - name: Cleanup Test Resources
+        if: always()
+        run: |
+          curl -k -X 'POST' \
+            "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
+            -H 'accept: application/json' \
+            -H "Authorization: Bearer $API_KEY" \
+            -d ''
+
+      - name: Stop EC2 instance and wait for the instance to be stopped
+        if: always() && steps.poll_step.outputs.too_many_runs != 'true'
+        run: |
+          aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
+          aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
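The steps above drive a small HTTPS API on the EC2 machine (self-signed certificate, hence curl -k). The same start/poll/fetch protocol can be sketched in Python for local debugging; the endpoint paths, Bearer auth scheme, and status values below are taken from the curl calls in the workflow, while the host, run id, and commit hash are placeholders:

    import gzip
    import time

    import requests

    BASE_URL = "https://203.0.113.10:8443"  # placeholder: the instance's public IP
    RUN_ID = "1234567890"                   # any unique id; CI uses github.run_id
    HEADERS = {"Authorization": "Bearer <PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY>"}

    # Kick off a bench run for a given neon commit (POST /start_test/{run_id}).
    requests.post(
        f"{BASE_URL}/start_test/{RUN_ID}",
        headers=HEADERS,
        json={"neonRepoCommitHash": "<commit sha under test>"},
        verify=False,  # self-signed certificate, equivalent to curl -k
    )

    # Poll GET /test_status/{run_id} until a terminal state, like the workflow's loop.
    while True:
        status = requests.get(
            f"{BASE_URL}/test_status/{RUN_ID}", headers=HEADERS, verify=False
        ).json().get("status")
        if status is None or status in ("failure", "success", "too_many_runs"):
            break
        time.sleep(60)  # poll every 60 seconds

    # Fetch the gzipped test log and print it, as the retrieve/unzip steps do.
    log = requests.get(f"{BASE_URL}/test_log/{RUN_ID}", headers=HEADERS, verify=False)
    print(gzip.decompress(log.content).decode())

A real harness would then POST /cleanup_test/{run_id} and stop the instance, as the workflow's always() steps do.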
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 from typing import Any, Dict, Tuple
 
@@ -17,30 +18,74 @@ from performance.pageserver.util import (
     setup_pageserver_with_tenants,
 )
 
+# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver.
+# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`,
+# so you still see some references to this name in the code.
+# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn`
+# for some files and metrics.
+
+
-# For reference, the space usage of the snapshots:
-# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
-# 137G  /instance_store/test_output/shared-snapshots
-# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
-# 1.8G  /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
-# 1.1G  /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
-# 8.5G  /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
-# 5.1G  /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
-# 76G   /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
-# 46G   /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
-@pytest.mark.parametrize("duration", [30])
-@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
-@pytest.mark.parametrize("n_tenants", [1, 10])
-@pytest.mark.timeout(
-    10000
-) # TODO: this value is just "a really high number"; have this per instance type
-def test_pageserver_max_throughput_getpage_at_latest_lsn(
+# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
+# 416G  /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13
+@pytest.mark.parametrize("duration", [60 * 60])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
+@pytest.mark.parametrize("n_tenants", [500])
+@pytest.mark.timeout(10000)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="This test needs a lot of resources and should run on dedicated HW, not in github action runners as part of CI",
+)
+def test_pageserver_characterize_throughput_with_n_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    setup_and_run_pagebench_benchmark(
+        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1
+    )
+
+
+# For reference, the space usage of the snapshots:
+# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
+# 19G  /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136
+@pytest.mark.parametrize("duration", [20 * 60])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
+# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
+# we use 64 clients because typically for a high number of connections we recommend the connection pooler,
+# which by default uses 64 connections
+@pytest.mark.parametrize("n_clients", [1, 64])
+@pytest.mark.parametrize("n_tenants", [1])
+@pytest.mark.timeout(2400)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="This test needs a lot of resources and should run on dedicated HW, not in github action runners as part of CI",
+)
+def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+    n_clients: int,
+):
+    setup_and_run_pagebench_benchmark(
+        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
+    )
+
+
+def setup_and_run_pagebench_benchmark(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
     n_tenants: int,
     pgbench_scale: int,
     duration: int,
+    n_clients: int,
 ):
     def record(metric, **kwargs):
         zenbenchmark.record(
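The shared snapshot directories in the du listings above encode this parametrization in their names, which is why the old max_throughput_latest_lsn prefix is kept after the rename. A sketch of the naming scheme as inferred from the listings (the helper that actually builds these paths is not shown in this diff):

    def shared_snapshot_dir_name(n_tenants: int, pgbench_scale: int) -> str:
        # e.g. max_throughput_latest_lsn-500-13 => 500 tenants at pgbench scale 13,
        #      max_throughput_latest_lsn-1-136  => 1 tenant at pgbench scale 136
        return f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}"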
@@ -55,6 +100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
             "n_tenants": (n_tenants, {"unit": ""}),
             "pgbench_scale": (pgbench_scale, {"unit": ""}),
             "duration": (duration, {"unit": "s"}),
+            "n_clients": (n_clients, {"unit": ""}),
         }
     )
 
@@ -96,7 +142,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
         r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
     )
 
-    run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
+    run_pagebench_benchmark(env, pg_bin, record, duration, n_clients)
 
 
 def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
@@ -157,8 +203,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
     return (template_tenant, template_timeline, config)
 
 
-def run_benchmark_max_throughput_latest_lsn(
-    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
+def run_pagebench_benchmark(
+    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int
 ):
     """
     Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
@@ -172,6 +218,8 @@ def run_benchmark_max_throughput_latest_lsn(
         ps_http.base_url,
         "--page-service-connstring",
         env.pageserver.connstr(password=None),
+        "--num-clients",
+        str(n_clients),
         "--runtime",
         f"{duration_secs}s",
         # don't specify the targets explicitly, let pagebench auto-discover them
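For orientation, the argument list in this hunk assembles into a pagebench invocation roughly like the sketch below. The flags --page-service-connstring, --num-clients, and --runtime appear verbatim above; the subcommand name, the management API flag, and the endpoint values are assumptions for illustration:

    import subprocess

    duration_secs, n_clients = 20 * 60, 64  # example values from the test matrix

    cmd = [
        "pagebench",
        "get-page-latest-lsn",            # assumed subcommand name
        "--mgmt-api-endpoint",            # assumed flag carrying ps_http.base_url
        "http://127.0.0.1:9898",          # placeholder pageserver mgmt API
        "--page-service-connstring",
        "postgres://127.0.0.1:64000",     # placeholder for env.pageserver.connstr(...)
        "--num-clients",
        str(n_clients),
        "--runtime",
        f"{duration_secs}s",
        # no explicit targets: pagebench auto-discovers tenants/timelines
    ]
    subprocess.run(cmd, check=True)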
@@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
 
     log.info("wait for all tenants to become active")
     wait_until_all_tenants_state(
-        ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
+        ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False
     )
 
     # ensure all layers are resident for predictable performance
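A note on the iterations bump above: the helper polls once per period, so with period=1 the iterations argument is effectively a timeout in seconds, and iterations=n_tenants left no headroom for runs with very few tenants. Adding 10 gives every run a fixed floor. A minimal sketch of that polling shape (an assumed simplification, not the fixture's actual code):

    import time
    from typing import Callable

    def wait_until(condition: Callable[[], bool], iterations: int, period: float) -> None:
        # Total wait is bounded by roughly iterations * period seconds.
        for _ in range(iterations):
            if condition():
                return
            time.sleep(period)
        raise TimeoutError(f"condition not met within ~{iterations * period:.0f}s")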