## Problem

`test_compute_startup_simple` and `test_compute_ondemand_slru_startup` are failing. These tests implicitly assert that the `metrics.json` endpoint succeeds and returns all expected metrics, but they don't make it easy to see what went wrong when it doesn't (e.g. in this failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/14513210240/index.html#suites/13d8e764c394daadbad415a08454c04e/b0f92a86b2ed309f/). In this case, the request was failing because of a missing auth token: the tests were using `requests` directly instead of the endpoint HTTP client type.

## Summary of changes

- Use the endpoint HTTP wrapper to get `raise_for_status` and the auth token.
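The core of the change, roughly sketched below. The "before" line is an illustrative sketch of the failure mode, not the literal old code (`http_port` is a placeholder name); `endpoint` is the compute endpoint fixture used throughout these tests.

```python
# Before (illustrative sketch): a bare GET carried no compute auth token, so a
# 401/403 only surfaced later as missing keys in `metrics`, which is hard to debug.
#
#     metrics = requests.get(f"http://localhost:{http_port}/metrics.json").json()

# After: go through the endpoint HTTP client wrapper, which attaches the auth
# token and calls raise_for_status(), so a failed request errors out right here.
metrics = endpoint.http_client().metrics_json()
```

The benchmark assertions on individual metrics stay the same; an auth misconfiguration now shows up as an immediate, readable HTTP error instead of a confusing missing-metric assertion.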
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder, PgBin


# Just start and measure duration.
#
# This test runs pretty quickly and can be informative when used in combination
# with emulated network delay. Some useful delay commands:
#
# 1. Add 2msec delay to all localhost traffic
# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
#
# 2. Test that it works (you should see 4ms ping)
# `ping localhost`
#
# 3. Revert back to normal
# `sudo tc qdisc del dev lo root netem`
#
# NOTE this test might not represent the real startup time because the basebackup
# for a large database might be larger if there's a lot of transaction metadata,
# or safekeepers might need more syncing, or there might be more operations to
# apply during config step, like more users, databases, or extensions. By default
# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
# test we only load neon.
def test_compute_startup_simple(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    env.create_branch("test_startup")

    endpoint = None

    # We do two iterations so we can see if the second startup is faster. It should
    # be because the compute node should already be configured with roles, databases,
    # extensions, etc from the first run.
    for i in range(2):
        # Start
        with zenbenchmark.record_duration(f"{i}_start_and_select"):
            if endpoint:
                endpoint.start()
            else:
                endpoint = env.endpoints.create(
                    "test_startup",
                    # Shared buffers need to be allocated during startup, so they
                    # impact startup time. This is the default value we use for
                    # 1CPU pods (maybe different for VMs).
                    #
                    # TODO extensions also contribute to shared memory allocation,
                    # and this test doesn't include all default extensions we
                    # load.
                    config_lines=["shared_buffers=262144"],
                )
                # Do not skip pg_catalog updates at first start, i.e.
                # imitate 'the first start after project creation'.
                endpoint.respec(skip_pg_catalog_updates=False)
                endpoint.start()
            endpoint.safe_psql("select 1;")

        # Get metrics via the endpoint HTTP client, which supplies the auth token
        # and raises on HTTP errors.
        metrics = endpoint.http_client().metrics_json()
        durations = {
            "wait_for_spec_ms": f"{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{i}_sync_safekeepers",
            "sync_sk_check_ms": f"{i}_sync_sk_check",
            "basebackup_ms": f"{i}_basebackup",
            "start_postgres_ms": f"{i}_start_postgres",
            "config_ms": f"{i}_config",
            "total_startup_ms": f"{i}_total_startup",
        }
        for key, name in durations.items():
            value = metrics[key]
            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)

        # Check basebackup size makes sense
        basebackup_bytes = metrics["basebackup_bytes"]
        if i > 0:
            assert basebackup_bytes < 100 * 1024

        # Stop so we can restart
        endpoint.stop()

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)


# Start and measure duration with huge SLRU segments.
# This test is similar to test_compute_startup_simple, but it creates a huge number of
# transactions and records containing these XIDs. Autovacuum is disabled for the table
# to prevent CLOG truncation.
# TODO: this is a very suspicious test; I doubt that it does what it's supposed to do,
# e.g. these two starts do not make much sense. It looks like it's just copy-paste.
# To be fixed within https://github.com/neondatabase/cloud/issues/8673
@pytest.mark.timeout(1800)
@pytest.mark.parametrize("slru", ["lazy", "eager"])
def test_compute_ondemand_slru_startup(
    slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    lazy_slru_download = "true" if slru == "lazy" else "false"
    tenant, _ = env.create_tenant(
        conf={
            "lazy_slru_download": lazy_slru_download,
        }
    )

    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
    with endpoint.cursor() as cur:
        cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
        cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)")
        cur.execute("INSERT INTO t VALUES (1, 0)")
        cur.execute(
            """
            CREATE PROCEDURE updating() as
            $$
            DECLARE
                i integer;
            BEGIN
                FOR i IN 1..1000000 LOOP
                    UPDATE t SET x = x + 1 WHERE pk=1;
                    COMMIT;
                END LOOP;
            END
            $$ LANGUAGE plpgsql
            """
        )
        cur.execute("SET statement_timeout=0")
        cur.execute("call updating()")

    endpoint.stop()

    # We do two iterations so we can see if the second startup is faster. It should
    # be because the compute node should already be configured with roles, databases,
    # extensions, etc from the first run.
    for i in range(2):
        # Start
        with zenbenchmark.record_duration(f"{slru}_{i}_start"):
            endpoint.start()

        with zenbenchmark.record_duration(f"{slru}_{i}_select"):
            sum = endpoint.safe_psql("select sum(x) from t")[0][0]
            assert sum == 1000000

        # Get metrics
        metrics = endpoint.http_client().metrics_json()
        durations = {
            "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",
            "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check",
            "basebackup_ms": f"{slru}_{i}_basebackup",
            "start_postgres_ms": f"{slru}_{i}_start_postgres",
            "config_ms": f"{slru}_{i}_config",
            "total_startup_ms": f"{slru}_{i}_total_startup",
        }
        for key, name in durations.items():
            value = metrics[key]
            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)

        basebackup_bytes = metrics["basebackup_bytes"]
        zenbenchmark.record(
            f"{slru}_{i}_basebackup_bytes",
            basebackup_bytes,
            "bytes",
            report=MetricReport.LOWER_IS_BETTER,
        )

        # Stop so we can restart
        endpoint.stop()

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)


@pytest.mark.timeout(240)
def test_compute_startup_latency(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
    zenbenchmark: NeonBenchmarker,
):
    """
    Do NUM_STARTS 'optimized' starts, i.e. with pg_catalog updates skipped,
    and measure the duration of each step. Report p50, p90, p99 latencies.
    """
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    endpoint = env.endpoints.create_start("main")
    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s4", endpoint.connstr()])
    endpoint.stop()

    NUM_STARTS = 100

    durations: dict[str, list[int]] = {
        "sync_sk_check_ms": [],
        "sync_safekeepers_ms": [],
        "basebackup_ms": [],
        "start_postgres_ms": [],
        "total_startup_ms": [],
    }

    for _i in range(NUM_STARTS):
        endpoint.start()
        client = endpoint.http_client()
        metrics = client.metrics_json()
        for key in durations.keys():
            value = metrics[key]
            durations[key].append(value)
        endpoint.stop()

    for key in durations.keys():
        durations[key] = sorted(durations[key])
        zenbenchmark.record(
            f"{key}_p50",
            durations[key][len(durations[key]) // 2],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
        zenbenchmark.record(
            f"{key}_p90",
            durations[key][len(durations[key]) * 9 // 10],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
        zenbenchmark.record(
            f"{key}_p99",
            durations[key][len(durations[key]) * 99 // 100],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )