neon/test_runner/performance/test_lazy_startup.py
Tristan Partin da9c101939 Implement a second HTTP server within compute_ctl (#10574)
The compute_ctl HTTP server has the following purposes:

- Allow management via the control plane
- Provide an endpoint for scraping metrics
- Provide APIs for compute internal clients
  - Neon Postgres extension for installing remote extensions
  - local_proxy for installing extensions and adding grants

The first two purposes require the HTTP server to be available outside
the compute.

The Neon threat model assumes a bad actor within our internal network, so
we need to reduce the attack surface. Every unnecessary unauthenticated
HTTP endpoint exposed to the internal network increases that surface. For
the endpoints described in the third bullet point, we can instead run an
extra HTTP server bound only to the loopback interface, since all
consumers of those endpoints live within the compute; the sketch below
illustrates the split.
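
A minimal sketch of that split, written in Python purely for illustration
(compute_ctl's actual implementation and its port numbers are not shown
here; the ports and handler below are hypothetical):

    import threading
    from http.server import BaseHTTPRequestHandler, HTTPServer

    class Handler(BaseHTTPRequestHandler):
        def do_GET(self):
            # Stand-in for real routing; always answer 200 OK.
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write(b"ok\n")

    # External server: reachable from the internal network
    # (control plane management, metrics scraping).
    external = HTTPServer(("0.0.0.0", 3080), Handler)

    # Internal server: bound to loopback only, so compute-local clients
    # (the Neon Postgres extension, local_proxy) can reach it but nothing
    # outside the compute can.
    internal = HTTPServer(("127.0.0.1", 3081), Handler)

    for srv in (external, internal):
        threading.Thread(target=srv.serve_forever, daemon=True).start()

    # Keep the process alive to serve requests.
    threading.Event().wait()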
2025-02-11 18:02:22 +00:00


from __future__ import annotations

import pytest
import requests

from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.neon_fixtures import NeonEnvBuilder

# Start and measure startup duration with huge SLRU segments.
# This test is similar to test_startup_simple, but it creates a huge number of transactions
# and records containing these XIDs. Autovacuum is disabled for the table to prevent CLOG truncation.
#
# This test runs pretty quickly and can be informative when used in combination
# with emulated network delay. Some useful delay commands:
#
# 1. Add 2msec delay to all localhost traffic
# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
#
# 2. Test that it works (you should see 4ms ping)
# `ping localhost`
#
# 3. Revert back to normal
# `sudo tc qdisc del dev lo root netem`
#
# NOTE this test might not represent the real startup time because the basebackup
# for a large database might be larger if there's a lot of transaction metadata,
# or safekeepers might need more syncing, or there might be more operations to
# apply during config step, like more users, databases, or extensions. By default
# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
# test we only load neon.
@pytest.mark.timeout(1800)
@pytest.mark.parametrize("slru", ["lazy", "eager"])
def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    lazy_slru_download = "true" if slru == "lazy" else "false"
    tenant, _ = env.create_tenant(
        conf={
            "lazy_slru_download": lazy_slru_download,
        }
    )

    endpoint = env.endpoints.create_start("main", tenant_id=tenant)

    with endpoint.cursor() as cur:
        cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
        cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)")
        cur.execute("INSERT INTO t VALUES (1, 0)")
        cur.execute(
            """
            CREATE PROCEDURE updating() as
            $$
                DECLARE
                    i integer;
                BEGIN
                    FOR i IN 1..1000000 LOOP
                        UPDATE t SET x = x + 1 WHERE pk=1;
                        COMMIT;
                    END LOOP;
                END
            $$ LANGUAGE plpgsql
            """
        )
        cur.execute("SET statement_timeout=0")
        cur.execute("call updating()")

    endpoint.stop()

    # We do two iterations so we can see if the second startup is faster. It should
    # be because the compute node should already be configured with roles, databases,
    # extensions, etc from the first run.
    for i in range(2):
        # Start
        with zenbenchmark.record_duration(f"{slru}_{i}_start"):
            endpoint.start()

        with zenbenchmark.record_duration(f"{slru}_{i}_select"):
            sum = endpoint.safe_psql("select sum(x) from t")[0][0]
        assert sum == 1000000

        # Get metrics
        metrics = requests.get(
            f"http://localhost:{endpoint.external_http_port}/metrics.json"
        ).json()

        durations = {
            "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",
            "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check",
            "basebackup_ms": f"{slru}_{i}_basebackup",
            "start_postgres_ms": f"{slru}_{i}_start_postgres",
            "config_ms": f"{slru}_{i}_config",
            "total_startup_ms": f"{slru}_{i}_total_startup",
        }
        for key, name in durations.items():
            value = metrics[key]
            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)

        basebackup_bytes = metrics["basebackup_bytes"]
        zenbenchmark.record(
            f"{slru}_{i}_basebackup_bytes",
            basebackup_bytes,
            "bytes",
            report=MetricReport.LOWER_IS_BETTER,
        )

        # Stop so we can restart
        endpoint.stop()

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)
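
For reference, the metrics.json document consumed by the test can also be
fetched by hand from a running endpoint. A minimal sketch, assuming an
external compute_ctl HTTP port of 3080 (a placeholder; the test reads the
real value from endpoint.external_http_port):

    import requests

    # Port 3080 is a placeholder; use the endpoint's actual external HTTP port.
    metrics = requests.get("http://localhost:3080/metrics.json").json()
    print(metrics["total_startup_ms"], "ms total startup")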