From 2288efae662e41fcd2cf7369e3b4b9dc95d25e95 Mon Sep 17 00:00:00 2001
From: Mikhail
Date: Mon, 14 Jul 2025 14:41:31 +0100
Subject: [PATCH] Performance test for LFC prewarm (#12524)

https://github.com/neondatabase/cloud/issues/19011

Measure the relative performance of prewarmed and non-prewarmed endpoints.
Add a test that runs on every commit, and a performance test that runs
against a remote cluster.
---
 .github/actionlint.yml                      |   1 +
 .github/workflows/benchmarking.yml          |  72 +++++++++
 test_runner/fixtures/neon_api.py            |   4 +
 test_runner/performance/test_lfc_prewarm.py | 167 ++++++++++++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 test_runner/performance/test_lfc_prewarm.py

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 3142a36fa0..25b2fc702a 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -31,6 +31,7 @@ config-variables:
   - NEON_PROD_AWS_ACCOUNT_ID
   - PGREGRESS_PG16_PROJECT_ID
   - PGREGRESS_PG17_PROJECT_ID
+  - PREWARM_PGBENCH_SIZE
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_CICD_CHANNEL_ID
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 79371ec704..df80bad579 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -219,6 +219,7 @@ jobs:
           --ignore test_runner/performance/test_cumulative_statistics_persistence.py
           --ignore test_runner/performance/test_perf_many_relations.py
           --ignore test_runner/performance/test_perf_oltp_large_tenant.py
+          --ignore test_runner/performance/test_lfc_prewarm.py
         env:
           BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,6 +411,77 @@ jobs:
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  prewarm-test:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 17
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+      - name: Run prewarm benchmark
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_lfc_prewarm.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 5400
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+      - name: Create Allure report
+        id: create-allure-report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
   generate-matrices:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
index e0f16abe77..bb618325e0 100644
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -314,6 +314,10 @@ class NeonAPI:
         if endpoint_type:
             data["endpoint"]["type"] = endpoint_type
         if settings:
+            # Otherwise the control plane returns 400 "settings must not be nil".
+            # TODO(myrrc): fix on cplane side
+            if "pg_settings" not in settings:
+                settings["pg_settings"] = {}
             data["endpoint"]["settings"] = settings
 
         resp = self.__request(
diff --git a/test_runner/performance/test_lfc_prewarm.py b/test_runner/performance/test_lfc_prewarm.py
new file mode 100644
index 0000000000..ad2c759a63
--- /dev/null
+++ b/test_runner/performance/test_lfc_prewarm.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import os
+import timeit
+import traceback
+from concurrent.futures import ThreadPoolExecutor as Exec
+from pathlib import Path
+from time import sleep
+from typing import TYPE_CHECKING, Any, cast
+
+import pytest
+from fixtures.benchmark_fixture import NeonBenchmarker, PgBenchRunResult
+from fixtures.log_helper import log
+from fixtures.neon_api import NeonAPI, connection_parameters_to_env
+
+if TYPE_CHECKING:
+    from fixtures.compare_fixtures import NeonCompare
+    from fixtures.neon_fixtures import Endpoint, PgBin
+    from fixtures.pg_version import PgVersion
+
+from performance.test_perf_pgbench import utc_now_timestamp
+
+# These tests compare write-heavy and read-heavy workload performance of an ordinary
+# endpoint against an endpoint that saves its LFC and prewarms from it on startup.
+
+
+def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    pg_bin = neon_compare.pg_bin
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    for ep in [ep_normal, ep_prewarmed]:
+        connstr: str = ep.connstr()
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", connstr, "-s100"])
+        ep.safe_psql("CREATE EXTENSION neon")
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr])
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        name: str = cast("str", ep.branch_name)
+        neon_compare.zenbenchmark.record_pg_bench_result(name, res)
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(30 * 60)
+def test_compare_prewarmed_pgbench_perf_benchmark(
+    pg_bin: PgBin,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    name = f"Test prewarmed pgbench performance, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
+    project = neon_api.create_project(pg_version, name)
+    project_id = project["project"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+    err = False
+    try:
+        benchmark_impl(pg_bin, neon_api, project, zenbenchmark)
+    except Exception as e:
+        err = True
+        log.error(f"Caught exception: {e}")
+        log.error(traceback.format_exc())
+    finally:
+        neon_api.delete_project(project_id)  # clean up even when the benchmark fails
+    assert not err
+
+
+def benchmark_impl(
+    pg_bin: PgBin, neon_api: NeonAPI, project: dict[str, Any], zenbenchmark: NeonBenchmarker
+):
+    pgbench_size = int(os.getenv("PGBENCH_SIZE") or "3424")  # scale 3424 is roughly 50GB
+    offload_secs = 20
+    test_duration_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60}"
+    # The prewarm API is not publicly exposed. To measure the performance of a
+    # fully prewarmed endpoint, wait for a fixed interval after the endpoint restarts.
+    prewarmed_sleep_secs = 30
+
+    branch_id = project["branch"]["id"]
+    project_id = project["project"]["id"]
+    normal_env = connection_parameters_to_env(
+        project["connection_uris"][0]["connection_parameters"]
+    )
+    normal_id = project["endpoints"][0]["id"]
+
+    prewarmed_branch_id = neon_api.create_branch(
+        project_id, "prewarmed", parent_id=branch_id, add_endpoint=False
+    )["branch"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    ep_prewarmed = neon_api.create_endpoint(
+        project_id,
+        prewarmed_branch_id,
+        endpoint_type="read_write",
+        settings={"autoprewarm": True, "offload_lfc_interval_seconds": offload_secs},
+    )
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    prewarmed_env = normal_env.copy()
+    prewarmed_env["PGHOST"] = ep_prewarmed["endpoint"]["host"]
+    prewarmed_id = ep_prewarmed["endpoint"]["id"]
+
+    def bench(endpoint_name, endpoint_id, env):
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{pgbench_size}"], env)
+        sleep(offload_secs * 2)  # ensure the LFC is offloaded after pgbench finishes
+        neon_api.restart_endpoint(project_id, endpoint_id)
+        sleep(prewarmed_sleep_secs)
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env)
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        zenbenchmark.record_pg_bench_result(endpoint_name, res)
+
+    with Exec(max_workers=2) as exe:
+        exe.submit(bench, "normal", normal_id, normal_env)
+        exe.submit(bench, "prewarmed", prewarmed_id, prewarmed_env)
+
+
+def test_compare_prewarmed_read_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    sql = [
+        "CREATE EXTENSION neon",
+        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
+        "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
+    ]
+    for ep in [ep_normal, ep_prewarmed]:
+        ep.safe_psql_many(sql)
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+        with neon_compare.record_duration(f"{ep.branch_name}_run_duration"):
+            ep.safe_psql("SELECT count(*) from foo")
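
The tests above record each pgbench run in the benchmark database via
record_pg_bench_result and leave the prewarmed-vs-normal comparison to later
analysis. For a quick local look at the same numbers, a helper along these
lines can compute the relative speedup straight from pgbench's summary output.
This is a minimal sketch, separate from the patch: parse_tps and
relative_speedup are illustrative names, not fixtures from this repository;
the only assumption is pgbench's standard "tps = ..." summary line.

    import re

    def parse_tps(pgbench_stdout: str) -> float:
        """Extract the 'tps = ...' figure from pgbench's summary output."""
        match = re.search(r"tps = ([\d.]+)", pgbench_stdout)
        if match is None:
            raise ValueError("no tps line found in pgbench output")
        return float(match.group(1))

    def relative_speedup(normal_stdout: str, prewarmed_stdout: str) -> float:
        """Return prewarmed throughput as a multiple of the non-prewarmed run."""
        return parse_tps(prewarmed_stdout) / parse_tps(normal_stdout)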
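
On sizing: the default PGBENCH_SIZE of 3424 matches the "roughly 50GB" note in
benchmark_impl because one pgbench scale unit comes to roughly 15 MB of table
data. A back-of-the-envelope check, with the 15 MB per scale unit figure being
an approximation rather than a pgbench guarantee:

    scale = 3424            # default PGBENCH_SIZE in benchmark_impl above
    mb_per_scale_unit = 15  # approximate on-disk size of one pgbench scale unit
    print(f"~{scale * mb_per_scale_unit / 1024:.0f} GB")  # prints "~50 GB"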