diff --git a/results.txt b/results.txt new file mode 100644 index 0000000000..5bfc22575c --- /dev/null +++ b/results.txt @@ -0,0 +1,36 @@ +run on i3en.3xlarge + +admin@ip-172-31-13-23:[~/neon-main]: du -hs /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1000-6/snapshot/local_fs_remote_storage/pageserver/tenants +225G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1000-6/snapshot/local_fs_remote_storage/pageserver/tenants + +=> ~2.25x main memory + +admin@ip-172-31-13-23:[~/neon-main]: NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py + +--------------------------------------------------------------------------------- Benchmark results --------------------------------------------------------------------------------- +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.n_tenants: 1000 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.pgbench_scale: 6 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.duration: 30 s +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.pageserver_config_override.page_cache_size: 134217728 byte +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.pageserver_config_override.max_file_descriptors: 500000 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.pageserver_config.override.virtual_file_io_engine: IoEngine.STD_FS 
+test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.request_count: 2321 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.latency_mean: 8,785.440 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p95: 20,234.239 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p99: 20,234.239 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p99.9: 20,234.239 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-std-fs].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p99.99: 20,234.239 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.n_tenants: 1000 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.pgbench_scale: 6 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.duration: 30 s +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.pageserver_config_override.page_cache_size: 134217728 byte +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.pageserver_config_override.max_file_descriptors: 500000 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.pageserver_config.override.virtual_file_io_engine: IoEngine.TOKIO_EPOLL_URING 
+test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.request_count: 2200 +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.latency_mean: 9,046.271 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p95: 16,457.727 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p99: 16,457.727 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p99.9: 16,457.727 ms +test_pageserver_max_throughput_getpage_at_latest_lsn[1000-6-30-tokio-epoll-uring].pageserver_max_throughput_getpage_at_latest_lsn.latency_percentiles.p99.99: 16,457.727 ms + +=========================================================================== 2 passed in 142.33s (0:02:22) =========================================================================== diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store index 4cca3a9857..3d80c7b830 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -20,10 +20,10 @@ fi # do all the on-disk initialization work now instead of a background kernel thread # so that we're ready for benchmarking right after this line -sudo mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/nvme1n1 +#sudo mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/nvme1n1 MOUNTPOINT=/instance_store -sudo mkdir "$MOUNTPOINT" +sudo mkdir -p "$MOUNTPOINT" # idempotent: succeeds whether or not the mountpoint already exists sudo mount /dev/nvme1n1 "$MOUNTPOINT" sudo chown -R "$(id -u)":"$(id -g)" "$MOUNTPOINT" @@ -40,7 +40,7 @@ To run your local neon.git build on the instance store volume, run the following commands 
from the top of the neon.git checkout # raise file descriptor limit of your shell and its child processes - sudo prlimit -p $$ --nofile=800000:800000 + sudo prlimit -p \$\$ --nofile=800000:800000 # test suite run export TEST_OUTPUT="$TEST_OUTPUT" diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1ed7e577b9..ea9924844a 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,7 +1,10 @@ +import enum import json from pathlib import Path from typing import Any, Dict, Tuple +import toml + import fixtures.pageserver.many_tenants as many_tenants import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker @@ -17,6 +20,10 @@ from fixtures.utils import get_scale_for_db, humantime_to_ms from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking +class IoEngine(str, enum.Enum): + STD_FS = "std-fs" + TOKIO_EPOLL_URING = "tokio-epoll-uring" + # For reference, the space usage of the snapshots: # admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots # 137G /instance_store/test_output/shared-snapshots @@ -27,9 +34,10 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking # 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 # 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 # 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 +@pytest.mark.parametrize("ioengine", [IoEngine.STD_FS, IoEngine.TOKIO_EPOLL_URING]) @pytest.mark.parametrize("duration", [30]) -@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) 
-@pytest.mark.parametrize("n_tenants", [1, 10]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100]]) +@pytest.mark.parametrize("n_tenants", [1000]) @pytest.mark.timeout( 10000 ) # TODO: this value is just "a really high number"; have this per instance type @@ -40,6 +48,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( n_tenants: int, pgbench_scale: int, duration: int, + ioengine: IoEngine, ): def record(metric, **kwargs): zenbenchmark.record( @@ -60,9 +69,12 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( # configure cache sizes like in prod page_cache_size = 16384 max_file_descriptors = 500000 - neon_env_builder.pageserver_config_override = ( - f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" - ) + pageserver_config_override = { + "page_cache_size": f"{page_cache_size}", + "max_file_descriptors": f"{max_file_descriptors}", + "virtual_file_io_engine": f"\"{ioengine.value}\"", # .value: f-string rendering of a str-mixin Enum member is Python-version dependent + } + neon_env_builder.pageserver_config_override = ";".join([f"{k}={v}" for k, v in pageserver_config_override.items()]) params.update( { "pageserver_config_override.page_cache_size": ( @@ -70,6 +82,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( {"unit": "byte"}, ), "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + "pageserver_config_override.virtual_file_io_engine": (ioengine, {"unit": ""}), } )