diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
new file mode 100644
index 0000000000..1a7e54a652
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -0,0 +1,196 @@
+import asyncio
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import pytest
+
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    wait_for_last_flush_lsn,
+)
+from fixtures.utils import get_scale_for_db, humantime_to_ms
+from performance.pageserver.util import setup_pageserver_with_tenants
+
+
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
+@pytest.mark.parametrize("n_tenants", [10])
+@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
+@pytest.mark.timeout(1000)
+def test_basebackup_with_high_slru_count(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    get_vectored_impl: str,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    def record(metric, **kwargs):
+        zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs)
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            "n_tenants": (n_tenants, {"unit": ""}),
+            "pgbench_scale": (pgbench_scale, {"unit": ""}),
+            "duration": (duration, {"unit": "s"}),
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
+        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
+    )
+    params.update(
+        {
+            # page_cache_size is configured in units of 8 KiB pages; record it in bytes
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+
+    n_txns = 100000
+
+    def setup_wrapper(env: NeonEnv):
+        return setup_tenant_template(env, n_txns)
+
+    env = setup_pageserver_with_tenants(
+        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+    )
+    run_benchmark(env, pg_bin, record, duration)
+
+
+def setup_tenant_template(env: NeonEnv, n_txns: int):
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+
+    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(template_tenant, config)
+
+    ps_http = env.pageserver.http_client()
+
+    with env.endpoints.create_start(
+        "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"]
+    ) as ep:
+        rels = 10  # number of concurrent update workers; each creates its own table
+
+        asyncio.run(run_updates(ep, n_txns, rels))
+
+        wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+        ps_http.timeline_checkpoint(template_tenant, template_timeline)
+        ps_http.timeline_compact(template_tenant, template_timeline)
+
+    return (template_tenant, template_timeline, config)
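+
+
+# Why this workload inflates the SLRU count (a hedged sketch; nothing in it is
+# asserted by the test): every iteration of the worker procedure below COMMITs,
+# so each iteration consumes one transaction ID and one commit-status entry in
+# the transaction-status SLRUs (pg_xact and friends). Because a Neon basebackup
+# ships SLRU segments to compute, many small transactions translate directly
+# into a larger basebackup for pagebench to measure.
+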
+# Takes about 5 minutes and produces tenants with around 300 SLRU blocks
+# of 8 KiB each.
+async def run_updates(ep: Endpoint, n_txns: int, workers_count: int):
+    workers = []
+    for i in range(workers_count):
+        workers.append(asyncio.create_task(run_update_loop_worker(ep, n_txns, i)))
+
+    await asyncio.gather(*workers)
+
+
+async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int):
+    table = f"t_{idx}"
+    conn = await ep.connect_async()
+    try:
+        await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)")
+        await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)")
+        await conn.execute(f"INSERT INTO {table} VALUES (1, 0)")
+        # each loop iteration commits a separate transaction
+        await conn.execute(
+            """
+            CREATE PROCEDURE updating{0}() AS
+            $$
+            DECLARE
+                i integer;
+            BEGIN
+                FOR i IN 1..{1} LOOP
+                    UPDATE {0} SET x = x + 1 WHERE pk = 1;
+                    COMMIT;
+                END LOOP;
+            END
+            $$ LANGUAGE plpgsql
+            """.format(table, n_txns)
+        )
+        await conn.execute("SET statement_timeout=0")
+        await conn.execute(f"CALL updating{table}()")
+    finally:
+        await conn.close()
+
+
+def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int):
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "basebackup",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--gzip-probability",
+        "1",
+        "--runtime",
+        f"{duration_secs}s",
+        # don't specify the targets explicitly; let pagebench auto-discover them
+    ]
+
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+    metric = "request_count"
+    record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "latency_mean"
+    record(
+        metric,
+        metric_value=humantime_to_ms(total[metric]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "latency_percentiles"
+    for k, v in total[metric].items():
+        record(
+            f"{metric}.{k}",
+            metric_value=humantime_to_ms(v),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
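For readers of the parsing code above: the exact JSON that pagebench emits is not
part of this diff. The shape below is a hedged sketch inferred purely from the keys
that run_benchmark() reads ("total", "request_count", "latency_mean",
"latency_percentiles") and from the fact that latencies pass through
humantime_to_ms(), i.e. they arrive as humantime-style strings. The percentile key
names are assumptions for illustration only:

    # hypothetical example of the parsed results, not pagebench's documented schema
    results = {
        "total": {
            "request_count": 12345,
            "latency_mean": "1.234ms",
            "latency_percentiles": {
                "p90": "2.5ms",   # assumed key name
                "p99": "7.8ms",   # assumed key name
            },
        }
    }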
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 1ed7e577b9..f8125920c8 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -14,7 +14,10 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.utils import get_scale_for_db, humantime_to_ms
 
-from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+from performance.pageserver.util import (
+    ensure_pageserver_ready_for_benchmarking,
+    setup_pageserver_with_tenants,
+)
 
 
 # For reference, the space usage of the snapshots:
@@ -75,10 +78,72 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
     for param, (value, kwargs) in params.items():
         record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
 
-    env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
+
+    def setup_wrapper(env: NeonEnv):
+        return setup_tenant_template(env, pg_bin, pgbench_scale)
+
+    env = setup_pageserver_with_tenants(
+        neon_env_builder,
+        f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
+        n_tenants,
+        setup_wrapper,
+    )
     run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
 
 
+def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
+    # use a config that makes production of on-disk state timing-insensitive
+    # as we ingest data into the tenant.
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(template_tenant, config)
+    ps_http = env.pageserver.http_client()
+    with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
+        wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+        ps_http.timeline_checkpoint(template_tenant, template_timeline)
+        ps_http.timeline_compact(template_tenant, template_timeline)
+        for _ in range(
+            0, 17
+        ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
+            # the L0s produced by this appear to have size ~5MiB
+            num_txns = 10_000
+            pg_bin.run_capture(
+                ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
+            )
+            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+            ps_http.timeline_checkpoint(template_tenant, template_timeline)
+            ps_http.timeline_compact(template_tenant, template_timeline)
+    # for reference, the output at scale=6 looked like so (306M total)
+    # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
+    # total 306M
+    # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
+    # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
+    # 33M  000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
+    # 36M  000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
+    # 16M  000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
+    # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
+    # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
+    # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
+    # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
+    # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
+
+    return (template_tenant, template_timeline, config)
+
+
 def run_benchmark_max_throughput_latest_lsn(
     env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
 ):
@@ -133,78 +198,3 @@ def run_benchmark_max_throughput_latest_lsn(
         unit="ms",
         report=MetricReport.LOWER_IS_BETTER,
     )
-
-
-def setup_pageserver_with_pgbench_tenants(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-    n_tenants: int,
-    scale: int,
-) -> NeonEnv:
-    """
-    Utility function to set up a pageserver with a given number of identical tenants.
-    Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards
-    with a repeat application of (pgbench simple-update workload, checkpoint, compact).
-    """
-
-    def setup_template(env: NeonEnv):
-        # use a config that makes production of on-disk state timing-insensitive
-        # as we ingest data into the tenant.
-        config = {
-            "gc_period": "0s",  # disable periodic gc
-            "checkpoint_timeout": "10 years",
-            "compaction_period": "0s",  # disable periodic compaction
-            "compaction_threshold": 10,
-            "compaction_target_size": 134217728,
-            "checkpoint_distance": 268435456,
-            "image_creation_threshold": 3,
-        }
-        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
-        env.pageserver.tenant_detach(template_tenant)
-        env.pageserver.allowed_errors.append(
-            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
-            ".*Dropped remote consistent LSN updates.*",
-        )
-        env.pageserver.tenant_attach(template_tenant, config)
-        ps_http = env.pageserver.http_client()
-        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
-            pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
-            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-            ps_http.timeline_checkpoint(template_tenant, template_timeline)
-            ps_http.timeline_compact(template_tenant, template_timeline)
-            for _ in range(
-                0, 17
-            ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
-                # the L0s produced by this appear to have size ~5MiB
-                num_txns = 10_000
-                pg_bin.run_capture(
-                    ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
-                )
-                wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-                ps_http.timeline_checkpoint(template_tenant, template_timeline)
-                ps_http.timeline_compact(template_tenant, template_timeline)
-        # for reference, the output at scale=6 looked like so (306M total)
-        # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
-        # total 306M
-        # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
-        # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
-        # 33M  000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
-        # 36M  000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
-        # 16M  000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
-        # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
-        # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
-        # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
-        # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
-        # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
-
-        return (template_tenant, template_timeline, config)
-
-    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
-        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
-
-    env = neon_env_builder.build_and_use_snapshot(
-        f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
-    )
-    env.start()
-    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
-    return env
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 45eb652362..590f759ffc 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -2,9 +2,17 @@
 Utilities used by all code in this sub-directory
 """
 
+from typing import Any, Callable, Dict, Tuple
+
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
+
+import fixtures.pageserver.many_tenants as many_tenants
 from fixtures.pageserver.utils import wait_until_all_tenants_state
+from fixtures.types import TenantId, TimelineId
 
 
 def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
@@ -27,3 +35,24 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
         assert not layer.remote
 
     log.info("ready")
+
+
+def setup_pageserver_with_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    name: str,
+    n_tenants: int,
+    setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+) -> NeonEnv:
+    """
+    Utility function to set up a pageserver with a given number of identical tenants.
+    The `setup` callable creates a template tenant and returns its tenant ID,
+    timeline ID, and tenant config; the template is then duplicated until there
+    are `n_tenants` copies. The resulting state is built via a shared snapshot
+    named `name`, so it is reused across runs instead of being rebuilt.
+    """
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(name, doit)
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env
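
As a usage note on the new helper: both call sites in this patch follow the same
pattern, which any future pagebench benchmark would repeat. The sketch below is
illustrative only; `my_benchmark_template` is a hypothetical setup function, and
the tuple it returns must match the Callable[[NeonEnv], Tuple[TenantId,
TimelineId, Dict[str, Any]]] signature that setup_pageserver_with_tenants expects:

    # hypothetical caller, mirroring the two tests in this patch
    def my_benchmark_template(env: NeonEnv):
        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
        config = {"gc_period": "0s"}  # plus whatever makes ingest deterministic
        # ... detach, attach with config, ingest data, checkpoint, compact ...
        return (template_tenant, template_timeline, config)

    env = setup_pageserver_with_tenants(
        neon_env_builder,
        "my_benchmark-snapshot-name",  # snapshot key; include params that affect the data
        n_tenants,
        my_benchmark_template,
    )

The design choice here is that the expensive template construction runs once and
is snapshotted under `name`, so benchmarks pay the setup cost only on the first run.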