Add cold start test (#4436)

bojanserafimov
2023-06-08 18:11:33 -04:00
committed by GitHub
parent c82d19d8d6
commit 6bac770811

@@ -1,10 +1,63 @@
from contextlib import closing

import pytest
import requests
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.neon_fixtures import NeonEnvBuilder


# Just start and measure duration.
#
# This test runs pretty quickly and can be informative when used in combination
# with emulated network delay (see also the wrapper sketch after this test).
# Some useful delay commands:
#
# 1. Add 2msec delay to all localhost traffic
# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
#
# 2. Test that it works (you should see 4ms ping)
# `ping localhost`
#
# 3. Revert back to normal
# `sudo tc qdisc del dev lo root netem`
#
# NOTE this test might not represent the real startup time because the basebackup
# for a large database might be larger if there's a lot of transaction metadata,
# or safekeepers might need more syncing, or there might be more operations to
# apply during config step, like more users, databases, or extensions. By default
# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
# test we only load neon.
def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()
    env.neon_cli.create_branch("test_startup")

    # We do two iterations so we can see if the second startup is faster. It should
    # be because the compute node should already be configured with roles, databases,
    # extensions, etc from the first run.
    for i in range(2):
        # Start
        with zenbenchmark.record_duration(f"{i}_start_and_select"):
            endpoint = env.endpoints.create_start("test_startup")
            endpoint.safe_psql("select 1;")

        # Get metrics
        metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
        durations = {
            "wait_for_spec_ms": f"{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{i}_sync_safekeepers",
            "basebackup_ms": f"{i}_basebackup",
            "config_ms": f"{i}_config",
            "total_startup_ms": f"{i}_total_startup",
        }
        for key, name in durations.items():
            value = metrics[key]
            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)

        # Stop so we can restart
        endpoint.stop()
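

# The tc commands listed in the comment above have to be run by hand. Purely as
# an illustration (not part of this commit), the same add/del commands could be
# wrapped in a context manager for local experiments. This sketch assumes Linux,
# an available `tc` binary, and passwordless sudo; the helper name is hypothetical.
import subprocess
from contextlib import contextmanager


@contextmanager
def emulated_localhost_delay(delay: str = "2msec"):
    # Add netem delay to the loopback device for the duration of the block,
    # then revert, using the same commands as in the comment above.
    subprocess.run(
        ["sudo", "tc", "qdisc", "add", "dev", "lo", "root", "handle", "1:0", "netem", "delay", delay],
        check=True,
    )
    try:
        yield
    finally:
        subprocess.run(["sudo", "tc", "qdisc", "del", "dev", "lo", "root", "netem"], check=True)


# Hypothetical usage for a local run:
#     with emulated_localhost_delay("2msec"):
#         ...start the endpoint and record durations as above...

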
# This test sometimes runs for longer than the global 5 minute timeout.
@pytest.mark.timeout(600)
def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):