LFC prewarm perftest fixes: use existing staging project (#12651)

https://github.com/neondatabase/cloud/issues/19011

- Prewarm config changes are not publicly available, so the test now uses a
  pre-filled 50 GB project on staging instead of creating its own project
- Create the neon extension with schema neon to fix the read performance tests
  on staging (a sketch of the fix follows this list); example failure:
  https://neon-github-public-dev.s3.amazonaws.com/reports/main/16483462789/index.html#suites/3d632da6dda4a70f5b4bd24904ab444c/919841e331089fc4/
- Don't create an extra endpoint in the LFC prewarm performance tests
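
A minimal sketch of the extension fix, assuming an endpoint object that exposes the safe_psql helper used by the tests (the variable name is illustrative):

    # Install the neon extension into its own schema rather than public,
    # matching what the updated tests now run on every endpoint.
    ep.safe_psql("CREATE SCHEMA neon; CREATE EXTENSION neon WITH SCHEMA neon")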
Author: Mikhail
Date: 2025-07-25 17:56:41 +01:00
Committed by: GitHub
Parent: 33b400beae
Commit: 6689d6fd89
6 changed files with 145 additions and 97 deletions

View File

@@ -31,7 +31,7 @@ config-variables:
   - NEON_PROD_AWS_ACCOUNT_ID
   - PGREGRESS_PG16_PROJECT_ID
   - PGREGRESS_PG17_PROJECT_ID
-  - PREWARM_PGBENCH_SIZE
+  - PREWARM_PROJECT_ID
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_CICD_CHANNEL_ID

View File

@@ -418,7 +418,7 @@ jobs:
       statuses: write
       id-token: write # aws-actions/configure-aws-credentials
     env:
-      PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
+      PROJECT_ID: ${{ vars.PREWARM_PROJECT_ID }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 17
      TEST_OUTPUT: /tmp/test_output

View File

@@ -16,6 +16,7 @@ from typing_extensions import override
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    Endpoint,
     NeonEnv,
     PgBin,
     PgProtocol,
@@ -129,6 +130,10 @@ class NeonCompare(PgCompare):
         # Start pg
         self._pg = self.env.endpoints.create_start("main", "main", self.tenant)
 
+    @property
+    def endpoint(self) -> Endpoint:
+        return self._pg
+
     @property
     @override
     def pg(self) -> PgProtocol:

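The new endpoint property lets a comparison test reuse the "main" endpoint the fixture already started instead of creating another one; a brief usage sketch (the test name is made up):

    def test_example(neon_compare: NeonCompare):
        ep = neon_compare.endpoint  # the already-running "main" endpoint
        ep.safe_psql("SELECT 1")    # no extra endpoint is created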
View File

@@ -79,18 +79,28 @@ class EndpointHttpClient(requests.Session):
         return json
 
     def prewarm_lfc(self, from_endpoint_id: str | None = None):
+        """
+        Prewarm LFC cache from given endpoint and wait till it finishes or errors
+        """
         params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
         self.post(self.prewarm_url, params=params).raise_for_status()
         self.prewarm_lfc_wait()
 
     def prewarm_lfc_wait(self):
+        """
+        Wait till LFC prewarm returns with error or success.
+        If prewarm was not requested before calling this function, it will error
+        """
+        statuses = "failed", "completed", "skipped"
+
         def prewarmed():
             json = self.prewarm_lfc_status()
             status, err = json["status"], json.get("error")
-            assert status in ["failed", "completed", "skipped"], f"{status}, {err=}"
+            assert status in statuses, f"{status}, {err=}"
 
         wait_until(prewarmed, timeout=60)
-        assert self.prewarm_lfc_status()["status"] != "failed"
+        res = self.prewarm_lfc_status()
+        assert res["status"] != "failed", res
 
     def offload_lfc_status(self) -> dict[str, str]:
         res = self.get(self.offload_url)
@@ -99,17 +109,26 @@ class EndpointHttpClient(requests.Session):
         return json
 
     def offload_lfc(self):
+        """
+        Offload LFC cache to endpoint storage and wait till offload finishes or errors
+        """
         self.post(self.offload_url).raise_for_status()
         self.offload_lfc_wait()
 
     def offload_lfc_wait(self):
+        """
+        Wait till LFC offload returns with error or success.
+        If offload was not requested before calling this function, it will error
+        """
+
         def offloaded():
             json = self.offload_lfc_status()
             status, err = json["status"], json.get("error")
             assert status in ["failed", "completed"], f"{status}, {err=}"
 
-        wait_until(offloaded)
-        assert self.offload_lfc_status()["status"] != "failed"
+        wait_until(offloaded, timeout=60)
+        res = self.offload_lfc_status()
+        assert res["status"] != "failed", res
 
     def promote(self, promote_spec: dict[str, Any], disconnect: bool = False):
         url = f"http://localhost:{self.external_port}/promote"

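Together these helpers implement the offload-then-prewarm cycle the performance tests below rely on; a sketch assuming an endpoint started by the test fixtures (variable names are illustrative):

    client = endpoint.http_client()
    client.offload_lfc()              # persist the current LFC state and wait for completion
    endpoint.stop()
    endpoint.start(autoprewarm=True)  # restart so the compute prewarms from the offloaded state
    client.prewarm_lfc_wait()         # block until prewarm reports "completed" or "skipped"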
View File

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 import time
 from typing import TYPE_CHECKING, cast, final
@@ -13,6 +14,17 @@ if TYPE_CHECKING:
     from fixtures.pg_version import PgVersion
 
 
+def connstr_to_env(connstr: str) -> dict[str, str]:
+    # postgresql://neondb_owner:npg_kuv6Rqi1cB@ep-old-silence-w26pxsvz-pooler.us-east-2.aws.neon.build/neondb?sslmode=require&channel_binding=...
+    parts = re.split(r":|@|\/|\?", connstr.removeprefix("postgresql://"))
+    return {
+        "PGUSER": parts[0],
+        "PGPASSWORD": parts[1],
+        "PGHOST": parts[2],
+        "PGDATABASE": parts[3],
+    }
+
+
 def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:
     return {
         "PGHOST": params["host"],

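For illustration, connstr_to_env splits a libpq URI on ":", "@", "/" and "?" after stripping the postgresql:// prefix; a sketch with a made-up connection string:

    env = connstr_to_env(
        "postgresql://neondb_owner:secret@ep-example-pooler.us-east-2.aws.neon.build/neondb?sslmode=require"
    )
    # env == {
    #     "PGUSER": "neondb_owner",
    #     "PGPASSWORD": "secret",
    #     "PGHOST": "ep-example-pooler.us-east-2.aws.neon.build",
    #     "PGDATABASE": "neondb",
    # }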
View File

@@ -2,45 +2,48 @@ from __future__ import annotations
 
 import os
 import timeit
-import traceback
-from concurrent.futures import ThreadPoolExecutor as Exec
 from pathlib import Path
+from threading import Thread
 from time import sleep
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, cast
 
 import pytest
 from fixtures.benchmark_fixture import NeonBenchmarker, PgBenchRunResult
 from fixtures.log_helper import log
-from fixtures.neon_api import NeonAPI, connection_parameters_to_env
-from performance.test_perf_pgbench import utc_now_timestamp
+from fixtures.neon_api import NeonAPI, connstr_to_env
 
 if TYPE_CHECKING:
     from fixtures.compare_fixtures import NeonCompare
     from fixtures.neon_fixtures import Endpoint, PgBin
     from fixtures.pg_version import PgVersion
 
+from performance.test_perf_pgbench import utc_now_timestamp
+
 
 # These tests compare performance for a write-heavy and read-heavy workloads of an ordinary endpoint
-# compared to the endpoint which saves its LFC and prewarms using it on startup.
+# compared to the endpoint which saves its LFC and prewarms using it on startup
 def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
     env = neon_compare.env
-    env.create_branch("normal")
     env.create_branch("prewarmed")
     pg_bin = neon_compare.pg_bin
-    ep_normal: Endpoint = env.endpoints.create_start("normal")
-    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
-    for ep in [ep_normal, ep_prewarmed]:
+    ep_ordinary: Endpoint = neon_compare.endpoint
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed")
+    for ep in [ep_ordinary, ep_prewarmed]:
         connstr: str = ep.connstr()
         pg_bin.run(["pgbench", "-i", "-I", "dtGvp", connstr, "-s100"])
-        ep.safe_psql("CREATE EXTENSION neon")
-        client = ep.http_client()
-        client.offload_lfc()
-        ep.stop()
-        ep.start()
-        client.prewarm_lfc_wait()
+        ep.safe_psql("CREATE SCHEMA neon; CREATE EXTENSION neon WITH SCHEMA neon")
+        if ep == ep_prewarmed:
+            client = ep.http_client()
+            client.offload_lfc()
+            ep.stop()
+            ep.start(autoprewarm=True)
+            client.prewarm_lfc_wait()
+        else:
+            ep.stop()
+            ep.start()
 
         run_start_timestamp = utc_now_timestamp()
         t0 = timeit.default_timer()
@@ -59,6 +62,36 @@ def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
         neon_compare.zenbenchmark.record_pg_bench_result(name, res)
 
 
+def test_compare_prewarmed_read_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("prewarmed")
+    ep_ordinary: Endpoint = neon_compare.endpoint
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed")
+
+    sql = [
+        "CREATE SCHEMA neon",
+        "CREATE EXTENSION neon WITH SCHEMA neon",
+        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
+        "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
+    ]
+    sql_check = "SELECT count(*) from foo"
+
+    ep_ordinary.safe_psql_many(sql)
+    ep_ordinary.stop()
+    ep_ordinary.start()
+    with neon_compare.record_duration("ordinary_run_duration"):
+        ep_ordinary.safe_psql(sql_check)
+
+    ep_prewarmed.safe_psql_many(sql)
+    client = ep_prewarmed.http_client()
+    client.offload_lfc()
+    ep_prewarmed.stop()
+    ep_prewarmed.start(autoprewarm=True)
+    client.prewarm_lfc_wait()
+    with neon_compare.record_duration("prewarmed_run_duration"):
+        ep_prewarmed.safe_psql(sql_check)
+
+
 @pytest.mark.remote_cluster
 @pytest.mark.timeout(2 * 60 * 60)
 def test_compare_prewarmed_pgbench_perf_benchmark(
@@ -67,67 +100,66 @@ def test_compare_prewarmed_pgbench_perf_benchmark(
     pg_version: PgVersion,
     zenbenchmark: NeonBenchmarker,
 ):
-    name = f"Test prewarmed pgbench performance, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
-    project = neon_api.create_project(pg_version, name)
-    project_id = project["project"]["id"]
-    neon_api.wait_for_operation_to_finish(project_id)
-    err = False
-    try:
-        benchmark_impl(pg_bin, neon_api, project, zenbenchmark)
-    except Exception as e:
-        err = True
-        log.error(f"Caught exception: {e}")
-        log.error(traceback.format_exc())
-    finally:
-        assert not err
-        neon_api.delete_project(project_id)
-
-
-def benchmark_impl(
-    pg_bin: PgBin, neon_api: NeonAPI, project: dict[str, Any], zenbenchmark: NeonBenchmarker
-):
-    pgbench_size = int(os.getenv("PGBENCH_SIZE") or "3424")  # 50GB
+    """
+    Prewarm API is not public, so this test relies on a pre-created project
+    with pgbench size of 3424, pgbench -i -IdtGvp -s3424. Sleeping and
+    offloading constants are hardcoded to this size as well
+    """
+    project_id = os.getenv("PROJECT_ID")
+    assert project_id
+
+    ordinary_branch_id = ""
+    prewarmed_branch_id = ""
+    for branch in neon_api.get_branches(project_id)["branches"]:
+        if branch["name"] == "ordinary":
+            ordinary_branch_id = branch["id"]
+        if branch["name"] == "prewarmed":
+            prewarmed_branch_id = branch["id"]
+    assert len(ordinary_branch_id) > 0
+    assert len(prewarmed_branch_id) > 0
+
+    ep_ordinary = None
+    ep_prewarmed = None
+    for ep in neon_api.get_endpoints(project_id)["endpoints"]:
+        if ep["branch_id"] == ordinary_branch_id:
+            ep_ordinary = ep
+        if ep["branch_id"] == prewarmed_branch_id:
+            ep_prewarmed = ep
+    assert ep_ordinary
+    assert ep_prewarmed
+    ordinary_id = ep_ordinary["id"]
+    prewarmed_id = ep_prewarmed["id"]
+
     offload_secs = 20
-    test_duration_min = 5
+    test_duration_min = 3
     pgbench_duration = f"-T{test_duration_min * 60}"
-    # prewarm API is not publicly exposed. In order to test performance of a
-    # fully prewarmed endpoint, wait after it restarts.
-    # The number here is empirical, based on manual runs on staging
+    pgbench_init_cmd = ["pgbench", "-P10", "-n", "-c10", pgbench_duration, "-Mprepared"]
+    pgbench_perf_cmd = pgbench_init_cmd + ["-S"]
     prewarmed_sleep_secs = 180
-    branch_id = project["branch"]["id"]
-    project_id = project["project"]["id"]
-    normal_env = connection_parameters_to_env(
-        project["connection_uris"][0]["connection_parameters"]
-    )
-    normal_id = project["endpoints"][0]["id"]
-    prewarmed_branch_id = neon_api.create_branch(
-        project_id, "prewarmed", parent_id=branch_id, add_endpoint=False
-    )["branch"]["id"]
-    neon_api.wait_for_operation_to_finish(project_id)
-    ep_prewarmed = neon_api.create_endpoint(
-        project_id,
-        prewarmed_branch_id,
-        endpoint_type="read_write",
-        settings={"autoprewarm": True, "offload_lfc_interval_seconds": offload_secs},
-    )
-    neon_api.wait_for_operation_to_finish(project_id)
-    prewarmed_env = normal_env.copy()
-    prewarmed_env["PGHOST"] = ep_prewarmed["endpoint"]["host"]
-    prewarmed_id = ep_prewarmed["endpoint"]["id"]
+
+    ordinary_uri = neon_api.get_connection_uri(project_id, ordinary_branch_id, ordinary_id)["uri"]
+    prewarmed_uri = neon_api.get_connection_uri(project_id, prewarmed_branch_id, prewarmed_id)[
+        "uri"
+    ]
 
     def bench(endpoint_name, endpoint_id, env):
-        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{pgbench_size}"], env)
-        sleep(offload_secs * 2)  # ensure LFC is offloaded after pgbench finishes
-        neon_api.restart_endpoint(project_id, endpoint_id)
-        sleep(prewarmed_sleep_secs)
-        log.info(f"Initialized {endpoint_name}")
+        log.info(f"Running pgbench for {pgbench_duration}s to warm up the cache")
+        pg_bin.run_capture(pgbench_init_cmd, env)  # capture useful for debugging
+        if endpoint_name == "prewarmed":
+            log.info(f"sleeping {offload_secs * 2} to ensure LFC is offloaded")
+            sleep(offload_secs * 2)
+            neon_api.restart_endpoint(project_id, endpoint_id)
+            log.info(f"sleeping {prewarmed_sleep_secs} to ensure LFC is prewarmed")
+            sleep(prewarmed_sleep_secs)
+        else:
+            neon_api.restart_endpoint(project_id, endpoint_id)
+        log.info(f"Starting benchmark for {endpoint_name}")
 
         run_start_timestamp = utc_now_timestamp()
         t0 = timeit.default_timer()
-        out = pg_bin.run_capture(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env)
+        out = pg_bin.run_capture(pgbench_perf_cmd, env)
         run_duration = timeit.default_timer() - t0
         run_end_timestamp = utc_now_timestamp()
@@ -140,29 +172,9 @@ def benchmark_impl(
         )
         zenbenchmark.record_pg_bench_result(endpoint_name, res)
 
-    with Exec(max_workers=2) as exe:
-        exe.submit(bench, "normal", normal_id, normal_env)
-        exe.submit(bench, "prewarmed", prewarmed_id, prewarmed_env)
-
-
-def test_compare_prewarmed_read_perf(neon_compare: NeonCompare):
-    env = neon_compare.env
-    env.create_branch("normal")
-    env.create_branch("prewarmed")
-    ep_normal: Endpoint = env.endpoints.create_start("normal")
-    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
-    sql = [
-        "CREATE EXTENSION neon",
-        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
-        "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
-    ]
-    for ep in [ep_normal, ep_prewarmed]:
-        ep.safe_psql_many(sql)
-        client = ep.http_client()
-        client.offload_lfc()
-        ep.stop()
-        ep.start()
-        client.prewarm_lfc_wait()
-        with neon_compare.record_duration(f"{ep.branch_name}_run_duration"):
-            ep.safe_psql("SELECT count(*) from foo")
+    prewarmed_args = ("prewarmed", prewarmed_id, connstr_to_env(prewarmed_uri))
+    prewarmed_thread = Thread(target=bench, args=prewarmed_args)
+    prewarmed_thread.start()
+    bench("ordinary", ordinary_id, connstr_to_env(ordinary_uri))
+    prewarmed_thread.join()
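
A self-contained sketch of the concurrency pattern the remote benchmark now uses: the prewarmed endpoint is exercised on a background thread while the ordinary endpoint runs on the main thread, so both pgbench workloads hit the project at the same time (the bench body is a placeholder for the real pg_bin.run_capture call):

    from threading import Thread

    def bench(endpoint_name: str) -> None:
        # placeholder for restarting the endpoint and running pgbench against it
        print(f"benchmarking {endpoint_name}")

    prewarmed_thread = Thread(target=bench, args=("prewarmed",))
    prewarmed_thread.start()
    bench("ordinary")  # runs concurrently on the main thread
    prewarmed_thread.join()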