Files
neon/scripts/benchmark_durations.py
Peter Bendel 87fc0a0374 periodic pagebench on hetzner runners (#11963)
## Problem

- The periodic pagebench benchmark produced inconsistent results
even when run with the same commit hash.
The hypothesis is that this was due to running on a dedicated but virtualized EC2
instance with varying CPU frequency.

- the dedicated instance type used for the benchmark is quite "old" and
we increasingly get `An error occurred (InsufficientInstanceCapacity)
when calling the StartInstances operation (reached max retries: 2):
Insufficient capacity.`

- periodic pagebench uses a snapshot of pageserver timelines to have the
same layer structure in each run and get consistent performance.
Re-creating the snapshot was a painful manual process (see
https://github.com/neondatabase/cloud/issues/27051 and
https://github.com/neondatabase/cloud/issues/27653)

## Summary of changes

- Run the periodic pagebench on a custom hetzner GitHub runner with
large nvme disk and governor set to defined perf profile
- provide a manual dispatch option for the workflow that allows
creating a new snapshot
- keep the manual dispatch option to specify a commit hash, which is useful for
bisecting regressions
- always use the newest created snapshot (the S3 bucket uses a date suffix in
the S3 key, for example
`s3://neon-github-public-dev/performance/pagebench/shared-snapshots-2025-05-17/`)
- `--ignore`
`test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py`
in regular benchmarks run for each commit
- improve the performance of copying the snapshot by using a `cp` subprocess instead of
traversing the tree in Python


## Example runs with code in this PR:
- run which creates new snapshot
https://github.com/neondatabase/neon/actions/runs/15083408849/job/42402986376#step:19:55
- run which uses latest snapshot
https://github.com/neondatabase/neon/actions/runs/15084907676/job/42406240745#step:11:65
2025-05-23 09:37:19 +00:00

168 lines
9.2 KiB
Python
Executable File

#! /usr/bin/env python3
from __future__ import annotations
import argparse
import json
import logging
import psycopg2
import psycopg2.extras
"""
The script fetches the durations of benchmarks from the database and stores them in a file compatible with the pytest-split plugin.
"""
# SQL executed by main() to compute one duration percentile per benchmark.
#
# Rows of `results` are grouped by (parent_suite, suite, name) and, for each group,
# PERCENTILE_DISC(<fraction>) over `duration` is returned under the alias
# `percentile_ms`. Only rows that satisfy all of the following are considered:
#   - started_at is later than CURRENT_DATE minus <N> days,
#   - parent_suite starts with 'test_runner.performance',
#   - status is 'passed'.
#
# Placeholders, in order: (1) the percentile fraction, (2) the number of days.
# NOTE(review): the second %s sits inside a quoted interval literal; this appears to
# rely on psycopg2 rendering an int parameter as bare digits -- confirm before
# passing anything other than an int.
# NOTE(review): the alias suggests `duration` is in milliseconds (main() divides the
# value by 1000) -- confirm against the `results` schema.
BENCHMARKS_DURATION_QUERY = """
SELECT
DISTINCT parent_suite, suite, name,
PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration) as percentile_ms
FROM results
WHERE
started_at > CURRENT_DATE - INTERVAL '%s' day
AND starts_with(parent_suite, 'test_runner.performance')
AND status = 'passed'
GROUP BY
parent_suite, suite, name
;
"""
# For our benchmarks the default distribution for 4 workers produces pretty uneven chunks,
# the total duration varies from 8 to 40 minutes.
# We use some pre-collected durations as a fallback to have a better distribution.
#
# Keys are pytest node ids ("<path to test file>::<test name>[<params>]"); values are
# the pre-collected durations for those tests.
# NOTE(review): values are presumably in seconds, matching the `percentile_ms / 1000`
# values main() stores under the same kind of key -- confirm.
# main() writes this mapping to the output file when the database query fails with
# psycopg2.OperationalError.
FALLBACK_DURATION = {
"test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073,
"test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759,
"test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885,
"test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28,
"test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353,
"test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487,
"test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142,
"test_runner/performance/test_compaction.py::test_compaction": 110.715,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434,
"test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
"test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849,
"test_runner/performance/test_layer_map.py::test_layer_map": 39.378,
"test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938,
"test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582,
"test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737,
"test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35,
"test_runner/performance/test_startup.py::test_startup_simple": 13.043,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083,
"test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016,
"test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028,
}
def main(args: argparse.Namespace) -> None:
    """Fetch benchmark duration percentiles and write them out as JSON.

    Reads ``connstr``, ``days``, ``percentile`` and ``output`` from ``args``, runs
    ``BENCHMARKS_DURATION_QUERY`` with ``(percentile, days)`` as parameters and dumps
    a ``{pytest node id: duration}`` mapping to ``args.output``.

    Each fetched row is keyed as ``<parent_suite with '.' replaced by '/'>/<suite>.py::<name>``
    and its ``percentile_ms`` value is divided by 1000 before being stored.

    If connecting or querying fails with ``psycopg2.OperationalError`` the error is
    logged and a copy of ``FALLBACK_DURATION`` is written instead. Any other exception
    propagates to the caller.
    """
    connstr = args.connstr
    interval_days = args.days
    output = args.output
    percentile = args.percentile

    res: dict[str, float] = {}

    try:
        logging.info("connecting to the database...")
        with psycopg2.connect(connstr, connect_timeout=30) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                logging.info("fetching benchmarks...")
                cur.execute(BENCHMARKS_DURATION_QUERY, (percentile, interval_days))
                rows = cur.fetchall()
    except psycopg2.OperationalError as exc:
        # The message needs a %s placeholder for `exc`; without one the logging
        # module fails to format the record and the intended message is never emitted.
        logging.error("cannot fetch benchmarks duration from the DB due to an error: %s", exc)
        rows = []
        # Copy so the module-level fallback table is never aliased by the result dict.
        res = dict(FALLBACK_DURATION)

    for row in rows:
        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['name']}"
        duration = row["percentile_ms"] / 1000
        logging.info(f"\t{pytest_name}: {duration}")
        res[pytest_name] = duration

    logging.info(f"saving results to {output.name}")
    json.dump(res, output, indent=2)
if __name__ == "__main__":
    # Command-line entry point: build the parser from a table of argument specs,
    # parse the command line, configure message-only INFO logging, then run main().
    cli = argparse.ArgumentParser(
        description="Get <percentile> of benchmarks duration for the last <N> days"
    )

    # (flags, keyword options) pairs, registered in this order.
    argument_specs = (
        (
            ("--output",),
            {
                "type": argparse.FileType("w"),
                "default": ".test_durations",
                "help": "path to output json file (default: .test_durations)",
            },
        ),
        (
            ("--percentile",),
            {
                "type": float,
                "default": "0.99",
                "help": "percentile (default: 0.99)",
            },
        ),
        (
            ("--days",),
            {
                "required": False,
                "default": 10,
                "type": int,
                "help": "how many days to look back for (default: 10)",
            },
        ),
        (
            ("connstr",),
            {
                "help": "connection string to the test results database",
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    args = cli.parse_args()

    logging.basicConfig(format="%(message)s", level=logging.INFO)

    main(args)