mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-16 09:52:54 +00:00
introduce new runners: unit-perf and use them for benchmark jobs (#11409)
## Problem
Benchmarks results are inconsistent on existing small-metal runners
## Summary of changes
Introduce new `unit-perf` runners and run the benchmarks on them.
The new hardware has a slower, but consistent, CPU frequency when run with
the default governor, `schedutil`.
Thus we needed to adjust some testcases' timeouts and add some retry
steps where hard-coded timeouts couldn't be increased without changing
the system under test.
-
[wait_for_last_record_lsn](6592d69a67/test_runner/fixtures/pageserver/utils.py (L193))
1000s -> 2000s
-
[test_branch_creation_many](https://github.com/neondatabase/neon/pull/11409/files#diff-2ebfe76f89004d563c7e53e3ca82462e1d85e92e6d5588e8e8f598bbe119e927)
1000s
-
[test_ingest_insert_bulk](https://github.com/neondatabase/neon/pull/11409/files#diff-e90e685be4a87053bc264a68740969e6a8872c8897b8b748d0e8c5f683a68d9f)
- with backpressure throttling disabled, compute becomes unresponsive for more
than 60 seconds (PG hard-coded client authentication connection timeout)
-
[test_sharded_ingest](https://github.com/neondatabase/neon/pull/11409/files#diff-e8d870165bd44acb9a6d8350f8640b301c1385a4108430b8d6d659b697e4a3f1)
600s -> 1200s
Right now there are only 2 runners of that class. If we decide to go
with them, we have to check how many runners of that type we need, so that
jobs do not get stuck waiting for a runner of that type to become available.
However, we have now decided to run those runners with the `performance`
governor instead of `schedutil`.
This achieves almost the same performance as the previous runners while still
producing consistent results for the same commit.
Related issue to activate performance governor on these runners
https://github.com/neondatabase/runner/pull/138
## Verification that it helps
### analyze runtimes on new runner for same commit
Table of runtimes for the same commit on different runners in
[run](https://github.com/neondatabase/neon/actions/runs/14417589789)
| Run | Benchmarks (1) | Benchmarks (2) | Benchmarks (3) | Benchmarks (4) | Benchmarks (5) |
|--------|--------|---------|---------|---------|---------|
| 1 | 1950.37s | 6374.55s | 3646.15s | 4149.48s | 2330.22s |
| 2 | - | 6369.27s | 3666.65s | 4162.42s | 2329.23s |
| Delta % | - | 0,07 % | 0,5 % | 0,3 % | 0,04 % |
| with governor performance | 1519.57s | 4131.62s | - | - | - |
| second run gov. perf. | 1513.62s | 4134.67s | - | - | - |
| Delta % | 0,3 % | 0,07 % | - | - | - |
| speedup gov. performance | 22 % | 35 % | - | - | - |
| current desktop class hetzner runners (main) | 1487.10s | 3699.67s | - | - | - |
| slower than desktop class | 2 % | 12 % | - | - | - |
In summary, the runtimes for the same commit on this hardware vary by
less than 1 %.
---------
Co-authored-by: BodoBolero <peterbendel@neon.tech>
This commit is contained in:
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -6,6 +6,7 @@ self-hosted-runner:
|
||||
- small
|
||||
- small-metal
|
||||
- small-arm64
|
||||
- unit-perf
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AWS_ECR_REGION
|
||||
|
||||
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -284,7 +284,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, small-metal ]
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
|
||||
0
explained_queries.sql
Normal file
0
explained_queries.sql
Normal file
@@ -199,7 +199,7 @@ def wait_for_last_record_lsn(
|
||||
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
|
||||
|
||||
current_lsn = Lsn(0)
|
||||
for i in range(1000):
|
||||
for i in range(2000):
|
||||
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
return current_lsn
|
||||
|
||||
@@ -97,6 +97,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
|
||||
_record_branch_creation_durations(neon_compare, branch_creation_durations)
|
||||
|
||||
|
||||
@pytest.mark.timeout(1000)
|
||||
@pytest.mark.parametrize("n_branches", [500, 1024])
|
||||
@pytest.mark.parametrize("shape", ["one_ancestor", "random"])
|
||||
def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str):
|
||||
@@ -205,7 +206,7 @@ def wait_and_record_startup_metrics(
|
||||
assert len(matching) == len(expected_labels)
|
||||
return matching
|
||||
|
||||
samples = wait_until(metrics_are_filled)
|
||||
samples = wait_until(metrics_are_filled, timeout=60)
|
||||
|
||||
for sample in samples:
|
||||
phase = sample.labels["phase"]
|
||||
|
||||
@@ -52,6 +52,8 @@ def test_ingest_insert_bulk(
|
||||
# would compete with Pageserver for bandwidth.
|
||||
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
|
||||
|
||||
neon_env_builder.pageserver_config_override = "wait_lsn_timeout='600 s'"
|
||||
|
||||
neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -92,7 +94,18 @@ def test_ingest_insert_bulk(
|
||||
worker_rows = rows / CONCURRENCY
|
||||
pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)
|
||||
|
||||
end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
|
||||
for attempt in range(5):
|
||||
try:
|
||||
end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
|
||||
break
|
||||
except Exception as e:
|
||||
# if we disable backpressure, postgres can become unresponsive for longer than a minute
|
||||
# and new connection attempts time out in postgres after 1 minute
|
||||
# so if this happens we retry new connection
|
||||
log.error(f"Attempt {attempt + 1}/5: Failed to select current wal lsn: {e}")
|
||||
if attempt == 4:
|
||||
log.error("Exceeded maximum retry attempts for selecting current wal lsn")
|
||||
raise
|
||||
|
||||
# Wait for pageserver to ingest the WAL.
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
@@ -13,7 +13,7 @@ from fixtures.neon_fixtures import (
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.timeout(600)
|
||||
@pytest.mark.timeout(1200)
|
||||
@pytest.mark.parametrize("shard_count", [1, 8, 32])
|
||||
@pytest.mark.parametrize(
|
||||
"wal_receiver_protocol",
|
||||
|
||||
Reference in New Issue
Block a user