From 9a6ace9bde2b46dd1b46ea2fe28f01c0b62a6745 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 15 Apr 2025 10:21:44 +0200 Subject: [PATCH] introduce new runners: unit-perf and use them for benchmark jobs (#11409) ## Problem Benchmark results are inconsistent on the existing small-metal runners. ## Summary of changes Introduce new `unit-perf` runners, and let's run benchmarks on them. The new hardware has a slower, but consistent, CPU frequency - if run with the default governor schedutil. Thus we needed to adjust some testcases' timeouts and add some retry steps where hard-coded timeouts couldn't be increased without changing the system under test. - [wait_for_last_record_lsn](https://github.com/neondatabase/neon/blob/6592d69a6700a2bd2e9f60c22af138ea0dafbdd0/test_runner/fixtures/pageserver/utils.py#L193) 1000s -> 2000s - [test_branch_creation_many](https://github.com/neondatabase/neon/pull/11409/files#diff-2ebfe76f89004d563c7e53e3ca82462e1d85e92e6d5588e8e8f598bbe119e927) 1000s - [test_ingest_insert_bulk](https://github.com/neondatabase/neon/pull/11409/files#diff-e90e685be4a87053bc264a68740969e6a8872c8897b8b748d0e8c5f683a68d9f) - with backpressure throttling disabled, the compute becomes unresponsive for more than 60 seconds (PG's hard-coded client authentication connection timeout) - [test_sharded_ingest](https://github.com/neondatabase/neon/pull/11409/files#diff-e8d870165bd44acb9a6d8350f8640b301c1385a4108430b8d6d659b697e4a3f1) 600s -> 1200s Right now there are only 2 runners of that class, and if we decide to go with them, we have to check how many runners of that type we need, so that jobs don't get stuck waiting for a runner of that type to become available. However, we have now decided to run those runners with the performance governor instead of schedutil. 
This achieves almost the same performance as the previous runners while still producing consistent results for the same commit. Related issue to activate the performance governor on these runners: https://github.com/neondatabase/runner/pull/138 ## Verification that it helps ### Analyze runtimes on the new runner for the same commit Table of runtimes for the same commit on different runners in [run](https://github.com/neondatabase/neon/actions/runs/14417589789) | Run | Benchmarks (1) | Benchmarks (2) | Benchmarks (3) | Benchmarks (4) | Benchmarks (5) | |--------|--------|---------|---------|---------|---------| | 1 | 1950.37s | 6374.55s | 3646.15s | 4149.48s | 2330.22s | | 2 | - | 6369.27s | 3666.65s | 4162.42s | 2329.23s | | Delta % | - | 0,07 % | 0,5 % | 0,3 % | 0,04 % | | with governor performance | 1519.57s | 4131.62s | - | - | - | | second run gov. perf. | 1513.62s | 4134.67s | - | - | - | | Delta % | 0,3 % | 0,07 % | - | - | - | | speedup gov. performance | 22 % | 35 % | - | - | - | | current desktop class hetzner runners (main) | 1487.10s | 3699.67s | - | - | - | | slower than desktop class | 2 % | 12 % | - | - | - | In summary, the runtimes for the same commit on this hardware vary by less than 1 %. 
--------- Co-authored-by: BodoBolero --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 2 +- explained_queries.sql | 0 test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/performance/test_branch_creation.py | 3 ++- .../performance/test_ingest_insert_bulk.py | 15 ++++++++++++++- test_runner/performance/test_sharded_ingest.py | 2 +- 7 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 explained_queries.sql diff --git a/.github/actionlint.yml b/.github/actionlint.yml index edc456d611..1d1b50e458 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -6,6 +6,7 @@ self-hosted-runner: - small - small-metal - small-arm64 + - unit-perf - us-east-2 config-variables: - AWS_ECR_REGION diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 46c8cd6fc9..0e67a22bfc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -284,7 +284,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small-metal ] + runs-on: [ self-hosted, unit-perf ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: diff --git a/explained_queries.sql b/explained_queries.sql new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index bc5076758d..8f5234a2fa 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -199,7 +199,7 @@ def wait_for_last_record_lsn( """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" current_lsn = Lsn(0) - for i in range(1000): + for i in range(2000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index b2bd94fae7..a3ee30cda2 100644 
--- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -97,6 +97,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) _record_branch_creation_durations(neon_compare, branch_creation_durations) +@pytest.mark.timeout(1000) @pytest.mark.parametrize("n_branches", [500, 1024]) @pytest.mark.parametrize("shape", ["one_ancestor", "random"]) def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): @@ -205,7 +206,7 @@ def wait_and_record_startup_metrics( assert len(matching) == len(expected_labels) return matching - samples = wait_until(metrics_are_filled) + samples = wait_until(metrics_are_filled, timeout=60) for sample in samples: phase = sample.labels["phase"] diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py index 01836b82e9..ed0a6c70bd 100644 --- a/test_runner/performance/test_ingest_insert_bulk.py +++ b/test_runner/performance/test_ingest_insert_bulk.py @@ -52,6 +52,8 @@ def test_ingest_insert_bulk( # would compete with Pageserver for bandwidth. 
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + neon_env_builder.pageserver_config_override = "wait_lsn_timeout='600 s'" + neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers env = neon_env_builder.init_start() @@ -92,7 +94,18 @@ def test_ingest_insert_bulk( worker_rows = rows / CONCURRENCY pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value) - end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + for attempt in range(5): + try: + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + break + except Exception as e: + # if we disable backpressure, postgres can become unresponsive for longer than a minute + # and new connection attempts time out in postgres after 1 minute + # so if this happens we retry new connection + log.error(f"Attempt {attempt + 1}/5: Failed to select current wal lsn: {e}") + if attempt == 4: + log.error("Exceeded maximum retry attempts for selecting current wal lsn") + raise # Wait for pageserver to ingest the WAL. client = env.pageserver.http_client() diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 94fd54bade..293026d40a 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -13,7 +13,7 @@ from fixtures.neon_fixtures import ( ) -@pytest.mark.timeout(600) +@pytest.mark.timeout(1200) @pytest.mark.parametrize("shard_count", [1, 8, 32]) @pytest.mark.parametrize( "wal_receiver_protocol",