diff --git a/test_runner/performance/test_compute_startup.py b/test_runner/performance/test_compute_startup.py new file mode 100644 index 0000000000..d002d2e221 --- /dev/null +++ b/test_runner/performance/test_compute_startup.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import pytest +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin + + +# Just start and measure duration. +# +# This test runs pretty quickly and can be informative when used in combination +# with emulated network delay. Some useful delay commands: +# +# 1. Add 2msec delay to all localhost traffic +# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` +# +# 2. Test that it works (you should see 4ms ping) +# `ping localhost` +# +# 3. Revert back to normal +# `sudo tc qdisc del dev lo root netem` +# +# NOTE this test might not represent the real startup time because the basebackup +# for a large database might be larger if there's a lot of transaction metadata, +# or safekeepers might need more syncing, or there might be more operations to +# apply during config step, like more users, databases, or extensions. By default +# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this +# test we only load neon. +def test_compute_startup_simple( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.create_branch("test_startup") + + endpoint = None + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. 
+ for i in range(2): + # Start + with zenbenchmark.record_duration(f"{i}_start_and_select"): + if endpoint: + endpoint.start() + else: + endpoint = env.endpoints.create( + "test_startup", + # Shared buffers need to be allocated during startup, so they + # impact startup time. This is the default value we use for + # 1CPU pods (maybe different for VMs). + # + # TODO extensions also contribute to shared memory allocation, + # and this test doesn't include all default extensions we + # load. + config_lines=["shared_buffers=262144"], + ) + # Do not skip pg_catalog updates at first start, i.e. + # imitate 'the first start after project creation'. + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + endpoint.safe_psql("select 1;") + + # Get metrics + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() + durations = { + "wait_for_spec_ms": f"{i}_wait_for_spec", + "sync_safekeepers_ms": f"{i}_sync_safekeepers", + "sync_sk_check_ms": f"{i}_sync_sk_check", + "basebackup_ms": f"{i}_basebackup", + "start_postgres_ms": f"{i}_start_postgres", + "config_ms": f"{i}_config", + "total_startup_ms": f"{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + # Check basebackup size makes sense + basebackup_bytes = metrics["basebackup_bytes"] + if i > 0: + assert basebackup_bytes < 100 * 1024 + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) + + +# Start and measure duration with huge SLRU segments. +# This test is similar to test_compute_startup_simple, but it creates a huge number of transactions +# and records containing these XIDs. Autovacuum is disabled for the table to prevent CLOG truncation. +# TODO: this is a very suspicious test, I doubt that it does what it's supposed to do, +# e.g. 
these two starts do not make much sense. Looks like it's just copy-paste. +# To be fixed within https://github.com/neondatabase/cloud/issues/8673 +@pytest.mark.timeout(1800) +@pytest.mark.parametrize("slru", ["lazy", "eager"]) +def test_compute_ondemand_slru_startup( + slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + lazy_slru_download = "true" if slru == "lazy" else "false" + tenant, _ = env.create_tenant( + conf={ + "lazy_slru_download": lazy_slru_download, + } + ) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant) + with endpoint.cursor() as cur: + cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") + cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)") + cur.execute("INSERT INTO t VALUES (1, 0)") + cur.execute( + """ + CREATE PROCEDURE updating() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..1000000 LOOP + UPDATE t SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + cur.execute("SET statement_timeout=0") + cur.execute("call updating()") + + endpoint.stop() + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. 
+ for i in range(2): + # Start + with zenbenchmark.record_duration(f"{slru}_{i}_start"): + endpoint.start() + + with zenbenchmark.record_duration(f"{slru}_{i}_select"): + total = endpoint.safe_psql("select sum(x) from t")[0][0] + assert total == 1000000 + + # Get metrics + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() + durations = { + "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", + "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", + "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", + "basebackup_ms": f"{slru}_{i}_basebackup", + "start_postgres_ms": f"{slru}_{i}_start_postgres", + "config_ms": f"{slru}_{i}_config", + "total_startup_ms": f"{slru}_{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + basebackup_bytes = metrics["basebackup_bytes"] + zenbenchmark.record( + f"{slru}_{i}_basebackup_bytes", + basebackup_bytes, + "bytes", + report=MetricReport.LOWER_IS_BETTER, + ) + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) + + +@pytest.mark.timeout(240) +def test_compute_startup_latency( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + zenbenchmark: NeonBenchmarker, +): + """ + Do NUM_STARTS 'optimized' starts, i.e. with pg_catalog updates skipped, + and measure the duration of each step. Report p50, p90, p99 latencies. 
+ """ + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s4", endpoint.connstr()]) + endpoint.stop() + + NUM_STARTS = 100 + + durations: dict[str, list[int]] = { + "sync_sk_check_ms": [], + "sync_safekeepers_ms": [], + "basebackup_ms": [], + "start_postgres_ms": [], + "total_startup_ms": [], + } + + for _i in range(NUM_STARTS): + endpoint.start() + client = endpoint.http_client() + metrics = client.metrics_json() + for key in durations: + value = metrics[key] + durations[key].append(value) + endpoint.stop() + + for key in durations: + durations[key] = sorted(durations[key]) + zenbenchmark.record( + f"{key}_p50", + durations[key][len(durations[key]) // 2], + "ms", + report=MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{key}_p90", + durations[key][len(durations[key]) * 9 // 10], + "ms", + report=MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{key}_p99", + durations[key][len(durations[key]) * 99 // 100], + "ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py deleted file mode 100644 index 3bf3ef890f..0000000000 --- a/test_runner/performance/test_lazy_startup.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -import pytest -import requests -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.neon_fixtures import NeonEnvBuilder - - -# Start and measure duration with huge SLRU segments. -# This test is similar to test_startup_simple, but it creates huge number of transactions -# and records containing this XIDs. Autovacuum is disable for the table to prevent CLOG truncation. -# -# This test runs pretty quickly and can be informative when used in combination -# with emulated network delay. Some useful delay commands: -# -# 1. 
Add 2msec delay to all localhost traffic -# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` -# -# 2. Test that it works (you should see 4ms ping) -# `ping localhost` -# -# 3. Revert back to normal -# `sudo tc qdisc del dev lo root netem` -# -# NOTE this test might not represent the real startup time because the basebackup -# for a large database might be larger if there's a lof of transaction metadata, -# or safekeepers might need more syncing, or there might be more operations to -# apply during config step, like more users, databases, or extensions. By default -# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this -# test we only load neon. -@pytest.mark.timeout(1800) -@pytest.mark.parametrize("slru", ["lazy", "eager"]) -def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - lazy_slru_download = "true" if slru == "lazy" else "false" - tenant, _ = env.create_tenant( - conf={ - "lazy_slru_download": lazy_slru_download, - } - ) - - endpoint = env.endpoints.create_start("main", tenant_id=tenant) - with endpoint.cursor() as cur: - cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") - cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)") - cur.execute("INSERT INTO t VALUES (1, 0)") - cur.execute( - """ - CREATE PROCEDURE updating() as - $$ - DECLARE - i integer; - BEGIN - FOR i IN 1..1000000 LOOP - UPDATE t SET x = x + 1 WHERE pk=1; - COMMIT; - END LOOP; - END - $$ LANGUAGE plpgsql - """ - ) - cur.execute("SET statement_timeout=0") - cur.execute("call updating()") - - endpoint.stop() - - # We do two iterations so we can see if the second startup is faster. It should - # be because the compute node should already be configured with roles, databases, - # extensions, etc from the first run. 
- for i in range(2): - # Start - with zenbenchmark.record_duration(f"{slru}_{i}_start"): - endpoint.start() - - with zenbenchmark.record_duration(f"{slru}_{i}_select"): - sum = endpoint.safe_psql("select sum(x) from t")[0][0] - assert sum == 1000000 - - # Get metrics - metrics = requests.get( - f"http://localhost:{endpoint.external_http_port}/metrics.json" - ).json() - durations = { - "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", - "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", - "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", - "basebackup_ms": f"{slru}_{i}_basebackup", - "start_postgres_ms": f"{slru}_{i}_start_postgres", - "config_ms": f"{slru}_{i}_config", - "total_startup_ms": f"{slru}_{i}_total_startup", - } - for key, name in durations.items(): - value = metrics[key] - zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) - - basebackup_bytes = metrics["basebackup_bytes"] - zenbenchmark.record( - f"{slru}_{i}_basebackup_bytes", - basebackup_bytes, - "bytes", - report=MetricReport.LOWER_IS_BETTER, - ) - - # Stop so we can restart - endpoint.stop() - - # Imitate optimizations that console would do for the second start - endpoint.respec(skip_pg_catalog_updates=True) diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py deleted file mode 100644 index 60d8b5be30..0000000000 --- a/test_runner/performance/test_startup.py +++ /dev/null @@ -1,84 +0,0 @@ -from __future__ import annotations - -import requests -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.neon_fixtures import NeonEnvBuilder - - -# Just start and measure duration. -# -# This test runs pretty quickly and can be informative when used in combination -# with emulated network delay. Some useful delay commands: -# -# 1. Add 2msec delay to all localhost traffic -# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` -# -# 2. 
Test that it works (you should see 4ms ping) -# `ping localhost` -# -# 3. Revert back to normal -# `sudo tc qdisc del dev lo root netem` -# -# NOTE this test might not represent the real startup time because the basebackup -# for a large database might be larger if there's a lof of transaction metadata, -# or safekeepers might need more syncing, or there might be more operations to -# apply during config step, like more users, databases, or extensions. By default -# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this -# test we only load neon. -def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.create_branch("test_startup") - - endpoint = None - - # We do two iterations so we can see if the second startup is faster. It should - # be because the compute node should already be configured with roles, databases, - # extensions, etc from the first run. - for i in range(2): - # Start - with zenbenchmark.record_duration(f"{i}_start_and_select"): - if endpoint: - endpoint.start() - else: - endpoint = env.endpoints.create_start( - "test_startup", - # Shared buffers need to be allocated during startup, so they - # impact startup time. This is the default value we use for - # 1CPU pods (maybe different for VMs). - # - # TODO extensions also contribute to shared memory allocation, - # and this test doesn't include all default extensions we - # load. 
- config_lines=["shared_buffers=262144"], - ) - endpoint.safe_psql("select 1;") - - # Get metrics - metrics = requests.get( - f"http://localhost:{endpoint.external_http_port}/metrics.json" - ).json() - durations = { - "wait_for_spec_ms": f"{i}_wait_for_spec", - "sync_safekeepers_ms": f"{i}_sync_safekeepers", - "sync_sk_check_ms": f"{i}_sync_sk_check", - "basebackup_ms": f"{i}_basebackup", - "start_postgres_ms": f"{i}_start_postgres", - "config_ms": f"{i}_config", - "total_startup_ms": f"{i}_total_startup", - } - for key, name in durations.items(): - value = metrics[key] - zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) - - # Check basebackup size makes sense - basebackup_bytes = metrics["basebackup_bytes"] - if i > 0: - assert basebackup_bytes < 100 * 1024 - - # Stop so we can restart - endpoint.stop() - - # Imitate optimizations that console would do for the second start - endpoint.respec(skip_pg_catalog_updates=True)