mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-09 06:22:57 +00:00
add pagebench test cases for periodic pagebench on dedicated hardware (#8233)
we want to run some specific pagebench test cases on dedicated hardware to get reproducible results. run1: 1 client per tenant => characterize throughput with n tenants. - 500 tenants - scale 13 (200 MB database) - 1 hour duration - ca. 380 GB layer snapshot files. run2.singleclient: 1 client per tenant => characterize latencies. run2.manyclient: N clients per tenant => characterize throughput scalability within one tenant. - 1 tenant with 1 client for latencies - 1 tenant with 64 clients, because typically for a high number of connections we recommend the connection pooler, which by default uses 64 connections (for scalability) - scale 136 (2048 MB database) - 20 minutes each
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
@@ -17,30 +18,74 @@ from performance.pageserver.util import (
|
||||
setup_pageserver_with_tenants,
|
||||
)
|
||||
|
||||
# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver.
|
||||
# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn``
|
||||
# so you still see some references to this name in the code.
|
||||
# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn`
|
||||
# for some files and metrics.
|
||||
|
||||
|
||||
# For reference, the space usage of the snapshots:
|
||||
# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
|
||||
# 137G /instance_store/test_output/shared-snapshots
|
||||
# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
|
||||
# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
|
||||
# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
|
||||
# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
|
||||
# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
|
||||
# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
|
||||
# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
|
||||
@pytest.mark.parametrize("duration", [30])
|
||||
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
|
||||
@pytest.mark.parametrize("n_tenants", [1, 10])
|
||||
@pytest.mark.timeout(
|
||||
10000
|
||||
) # TODO: this value is just "a really high number"; have this per instance type
|
||||
def test_pageserver_max_throughput_getpage_at_latest_lsn(
|
||||
# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
|
||||
# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13
|
||||
@pytest.mark.parametrize("duration", [60 * 60])
|
||||
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
|
||||
@pytest.mark.parametrize("n_tenants", [500])
|
||||
@pytest.mark.timeout(10000)
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("CI", "false") == "true",
|
||||
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
|
||||
)
|
||||
def test_pageserver_characterize_throughput_with_n_tenants(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
pg_bin: PgBin,
|
||||
n_tenants: int,
|
||||
pgbench_scale: int,
|
||||
duration: int,
|
||||
):
|
||||
setup_and_run_pagebench_benchmark(
|
||||
neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1
|
||||
)
|
||||
|
||||
|
||||
# For reference, the space usage of the snapshots:
|
||||
# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
|
||||
# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136
|
||||
@pytest.mark.parametrize("duration", [20 * 60])
|
||||
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
|
||||
# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
|
||||
# we use 64 clients because typically for a high number of connections we recommend the connection pooler
|
||||
# which by default uses 64 connections
|
||||
@pytest.mark.parametrize("n_clients", [1, 64])
|
||||
@pytest.mark.parametrize("n_tenants", [1])
|
||||
@pytest.mark.timeout(2400)
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("CI", "false") == "true",
|
||||
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
|
||||
)
|
||||
def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
pg_bin: PgBin,
|
||||
n_tenants: int,
|
||||
pgbench_scale: int,
|
||||
duration: int,
|
||||
n_clients: int,
|
||||
):
|
||||
setup_and_run_pagebench_benchmark(
|
||||
neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
|
||||
)
|
||||
|
||||
|
||||
def setup_and_run_pagebench_benchmark(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
pg_bin: PgBin,
|
||||
n_tenants: int,
|
||||
pgbench_scale: int,
|
||||
duration: int,
|
||||
n_clients: int,
|
||||
):
|
||||
def record(metric, **kwargs):
|
||||
zenbenchmark.record(
|
||||
@@ -55,6 +100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
|
||||
"n_tenants": (n_tenants, {"unit": ""}),
|
||||
"pgbench_scale": (pgbench_scale, {"unit": ""}),
|
||||
"duration": (duration, {"unit": "s"}),
|
||||
"n_clients": (n_clients, {"unit": ""}),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -96,7 +142,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
|
||||
r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
|
||||
)
|
||||
|
||||
run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
|
||||
run_pagebench_benchmark(env, pg_bin, record, duration, n_clients)
|
||||
|
||||
|
||||
def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
|
||||
@@ -157,8 +203,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
|
||||
return (template_tenant, template_timeline, config)
|
||||
|
||||
|
||||
def run_benchmark_max_throughput_latest_lsn(
|
||||
env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
|
||||
def run_pagebench_benchmark(
|
||||
env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int
|
||||
):
|
||||
"""
|
||||
Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
|
||||
@@ -172,6 +218,8 @@ def run_benchmark_max_throughput_latest_lsn(
|
||||
ps_http.base_url,
|
||||
"--page-service-connstring",
|
||||
env.pageserver.connstr(password=None),
|
||||
"--num-clients",
|
||||
str(n_clients),
|
||||
"--runtime",
|
||||
f"{duration_secs}s",
|
||||
# don't specify the targets explicitly, let pagebench auto-discover them
|
||||
|
||||
@@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
|
||||
|
||||
log.info("wait for all tenants to become active")
|
||||
wait_until_all_tenants_state(
|
||||
ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
|
||||
ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False
|
||||
)
|
||||
|
||||
# ensure all layers are resident for predictable performance
|
||||
|
||||
Reference in New Issue
Block a user