neon/test_runner/performance/test_compute_startup.py
John Spray d4c059a884 tests: use endpoint http wrapper to get auth (#11628)
## Problem

`test_compute_startup_simple` and `test_compute_ondemand_slru_startup`
are failing.

These tests implicitly assert that the metrics.json endpoint succeeds and
returns all expected metrics, but they don't make it easy to see what went
wrong when it doesn't (e.g. in this failure:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/14513210240/index.html#suites/13d8e764c394daadbad415a08454c04e/b0f92a86b2ed309f/)

In this case, the request was failing because of a missing auth token: the
test was using `requests` directly instead of the endpoint HTTP client type.

## Summary of changes

- Use endpoint http wrapper to get raise_for_status & auth token
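
Roughly the shape of the change, as a sketch rather than the exact diff (the raw-requests version, its port variable, and the helper names are illustrative; the real call is `endpoint.http_client().metrics_json()`, visible in the test below):

```python
import requests


# Before (illustrative): a raw requests call. No auth header is attached and a
# non-2xx response is not surfaced, so the failure shows up later as missing metrics.
def get_metrics_raw(http_port: int) -> dict:
    return requests.get(f"http://localhost:{http_port}/metrics.json").json()


# After: go through the endpoint's HTTP client wrapper, which supplies the auth
# token and calls raise_for_status() before decoding the JSON body.
def get_metrics(endpoint) -> dict:
    return endpoint.http_client().metrics_json()
```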
2025-04-17 15:03:23 +00:00


from __future__ import annotations

from typing import TYPE_CHECKING

import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder, PgBin


# Just start and measure duration.
#
# This test runs pretty quickly and can be informative when used in combination
# with emulated network delay. Some useful delay commands:
#
# 1. Add 2msec delay to all localhost traffic
# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
#
# 2. Test that it works (you should see 4ms ping)
# `ping localhost`
#
# 3. Revert back to normal
# `sudo tc qdisc del dev lo root netem`
#
# NOTE this test might not represent the real startup time because the basebackup
# for a large database might be larger if there's a lot of transaction metadata,
# or safekeepers might need more syncing, or there might be more operations to
# apply during config step, like more users, databases, or extensions. By default
# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
# test we only load neon.
def test_compute_startup_simple(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    env.create_branch("test_startup")

    endpoint = None

    # We do two iterations so we can see if the second startup is faster. It should
    # be because the compute node should already be configured with roles, databases,
    # extensions, etc from the first run.
    for i in range(2):
        # Start
        with zenbenchmark.record_duration(f"{i}_start_and_select"):
            if endpoint:
                endpoint.start()
            else:
                endpoint = env.endpoints.create(
                    "test_startup",
                    # Shared buffers need to be allocated during startup, so they
                    # impact startup time. This is the default value we use for
                    # 1CPU pods (maybe different for VMs).
                    #
                    # TODO extensions also contribute to shared memory allocation,
                    # and this test doesn't include all default extensions we
                    # load.
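                    # (With the default 8 kB block size, 262144 buffers = 2 GiB.)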
                    config_lines=["shared_buffers=262144"],
                )
                # Do not skip pg_catalog updates at first start, i.e.
                # imitate 'the first start after project creation'.
                endpoint.respec(skip_pg_catalog_updates=False)
                endpoint.start()

            endpoint.safe_psql("select 1;")

        # Get metrics
        metrics = endpoint.http_client().metrics_json()

        durations = {
            "wait_for_spec_ms": f"{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{i}_sync_safekeepers",
            "sync_sk_check_ms": f"{i}_sync_sk_check",
            "basebackup_ms": f"{i}_basebackup",
            "start_postgres_ms": f"{i}_start_postgres",
            "config_ms": f"{i}_config",
            "total_startup_ms": f"{i}_total_startup",
        }
        for key, name in durations.items():
            value = metrics[key]
            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)

        # Check basebackup size makes sense
        basebackup_bytes = metrics["basebackup_bytes"]
        if i > 0:
            assert basebackup_bytes < 100 * 1024

        # Stop so we can restart
        endpoint.stop()

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)


# Start and measure duration with huge SLRU segments.
#
# This test is similar to test_compute_startup_simple, but it creates a huge number of
# transactions and records containing these XIDs. Autovacuum is disabled for the table
# to prevent CLOG truncation.
#
# TODO: this is a very suspicious test, I doubt that it does what it's supposed to do,
# e.g. these two starts do not make much sense. Looks like it's just copy-paste.
# To be fixed within https://github.com/neondatabase/cloud/issues/8673
@pytest.mark.timeout(1800)
@pytest.mark.parametrize("slru", ["lazy", "eager"])
def test_compute_ondemand_slru_startup(
    slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    lazy_slru_download = "true" if slru == "lazy" else "false"
    tenant, _ = env.create_tenant(
        conf={
            "lazy_slru_download": lazy_slru_download,
        }
    )

    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
    with endpoint.cursor() as cur:
        cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
        cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)")
        cur.execute("INSERT INTO t VALUES (1, 0)")
        cur.execute(
            """
            CREATE PROCEDURE updating() as
            $$
            DECLARE
               i integer;
            BEGIN
               FOR i IN 1..1000000 LOOP
                   UPDATE t SET x = x + 1 WHERE pk=1;
                   COMMIT;
               END LOOP;
            END
            $$ LANGUAGE plpgsql
            """
        )
        cur.execute("SET statement_timeout=0")
        cur.execute("call updating()")

    endpoint.stop()

    # We do two iterations so we can see if the second startup is faster. It should
    # be because the compute node should already be configured with roles, databases,
    # extensions, etc from the first run.
    for i in range(2):
        # Start
        with zenbenchmark.record_duration(f"{slru}_{i}_start"):
            endpoint.start()

        with zenbenchmark.record_duration(f"{slru}_{i}_select"):
            sum = endpoint.safe_psql("select sum(x) from t")[0][0]
            assert sum == 1000000

        # Get metrics
        metrics = endpoint.http_client().metrics_json()

        durations = {
            "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",
            "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check",
            "basebackup_ms": f"{slru}_{i}_basebackup",
            "start_postgres_ms": f"{slru}_{i}_start_postgres",
            "config_ms": f"{slru}_{i}_config",
            "total_startup_ms": f"{slru}_{i}_total_startup",
        }
        for key, name in durations.items():
            value = metrics[key]
            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)

        basebackup_bytes = metrics["basebackup_bytes"]
        zenbenchmark.record(
            f"{slru}_{i}_basebackup_bytes",
            basebackup_bytes,
            "bytes",
            report=MetricReport.LOWER_IS_BETTER,
        )

        # Stop so we can restart
        endpoint.stop()

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)


@pytest.mark.timeout(240)
def test_compute_startup_latency(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
    zenbenchmark: NeonBenchmarker,
):
    """
    Do NUM_STARTS 'optimized' starts, i.e. with pg_catalog updates skipped,
    and measure the duration of each step. Report p50, p90, p99 latencies.
    """
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    endpoint = env.endpoints.create_start("main")
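    # Seed some data: pgbench -I dtGvp drops and recreates the pgbench tables,
    # generates data server-side, vacuums, and adds primary keys; -s4 is scale
    # factor 4 (~400k rows in pgbench_accounts).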
    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s4", endpoint.connstr()])

    endpoint.stop()

    NUM_STARTS = 100

    durations: dict[str, list[int]] = {
        "sync_sk_check_ms": [],
        "sync_safekeepers_ms": [],
        "basebackup_ms": [],
        "start_postgres_ms": [],
        "total_startup_ms": [],
    }

    for _i in range(NUM_STARTS):
        endpoint.start()

        client = endpoint.http_client()
        metrics = client.metrics_json()
        for key in durations.keys():
            value = metrics[key]
            durations[key].append(value)

        endpoint.stop()
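    # Report index-based p50/p90/p99 over the sorted samples; with
    # NUM_STARTS == 100 these are the values at indices 50, 90 and 99.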
    for key in durations.keys():
        durations[key] = sorted(durations[key])
        zenbenchmark.record(
            f"{key}_p50",
            durations[key][len(durations[key]) // 2],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
        zenbenchmark.record(
            f"{key}_p90",
            durations[key][len(durations[key]) * 9 // 10],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
        zenbenchmark.record(
            f"{key}_p99",
            durations[key][len(durations[key]) * 99 // 100],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )