From 9a6ace9bde2b46dd1b46ea2fe28f01c0b62a6745 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 15 Apr 2025 10:21:44 +0200 Subject: [PATCH] introduce new runners: unit-perf and use them for benchmark jobs (#11409) ## Problem Benchmark results are inconsistent on the existing small-metal runners. ## Summary of changes Introduce new `unit-perf` runners, and let's run benchmarks on them. The new hardware has a slower, but consistent, CPU frequency - if run with the default governor schedutil. Thus we needed to adjust some testcases' timeouts and add some retry steps where hard-coded timeouts couldn't be increased without changing the system under test. - [wait_for_last_record_lsn](https://github.com/neondatabase/neon/blob/6592d69a6700a2bd2e9f60c22af138ea0dafbdd0/test_runner/fixtures/pageserver/utils.py#L193) 1000s -> 2000s - [test_branch_creation_many](https://github.com/neondatabase/neon/pull/11409/files#diff-2ebfe76f89004d563c7e53e3ca82462e1d85e92e6d5588e8e8f598bbe119e927) 1000s - [test_ingest_insert_bulk](https://github.com/neondatabase/neon/pull/11409/files#diff-e90e685be4a87053bc264a68740969e6a8872c8897b8b748d0e8c5f683a68d9f) - with backpressure throttling disabled, the compute becomes unresponsive for more than 60 seconds (PG's hard-coded client authentication connection timeout) - [test_sharded_ingest](https://github.com/neondatabase/neon/pull/11409/files#diff-e8d870165bd44acb9a6d8350f8640b301c1385a4108430b8d6d659b697e4a3f1) 600s -> 1200s Right now there are only 2 runners of that class, and if we decide to go with them, we have to check how many runners of that type we need, so that jobs don't get stuck waiting for a runner of that type to become available. However, we have now decided to run those runners with the performance governor instead of schedutil. 
This achieves almost the same performance as the previous runners while still producing consistent results for the same commit. Related issue to activate the performance governor on these runners: https://github.com/neondatabase/runner/pull/138 ## Verification that it helps ### Analyze runtimes on the new runner for the same commit Table of runtimes for the same commit on different runners in [run](https://github.com/neondatabase/neon/actions/runs/14417589789) | Run | Benchmarks (1) | Benchmarks (2) | Benchmarks (3) | Benchmarks (4) | Benchmarks (5) | |--------|--------|---------|---------|---------|---------| | 1 | 1950.37s | 6374.55s | 3646.15s | 4149.48s | 2330.22s | | 2 | - | 6369.27s | 3666.65s | 4162.42s | 2329.23s | | Delta % | - | 0,07 % | 0,5 % | 0,3 % | 0,04 % | | with governor performance | 1519.57s | 4131.62s | - | - | - | | second run gov. perf. | 1513.62s | 4134.67s | - | - | - | | Delta % | 0,3 % | 0,07 % | - | - | - | | speedup gov. performance | 22 % | 35 % | - | - | - | | current desktop class hetzner runners (main) | 1487.10s | 3699.67s | - | - | - | | slower than desktop class | 2 % | 12 % | - | - | - | In summary, the runtimes for the same commit on this hardware vary by less than 1 %. 
--------- Co-authored-by: BodoBolero --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 2 +- explained_queries.sql | 0 test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/performance/test_branch_creation.py | 3 ++- .../performance/test_ingest_insert_bulk.py | 15 ++++++++++++++- test_runner/performance/test_sharded_ingest.py | 2 +- 7 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 explained_queries.sql diff --git a/.github/actionlint.yml b/.github/actionlint.yml index edc456d611..1d1b50e458 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -6,6 +6,7 @@ self-hosted-runner: - small - small-metal - small-arm64 + - unit-perf - us-east-2 config-variables: - AWS_ECR_REGION diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 46c8cd6fc9..0e67a22bfc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -284,7 +284,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small-metal ] + runs-on: [ self-hosted, unit-perf ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: diff --git a/explained_queries.sql b/explained_queries.sql new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index bc5076758d..8f5234a2fa 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -199,7 +199,7 @@ def wait_for_last_record_lsn( """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" current_lsn = Lsn(0) - for i in range(1000): + for i in range(2000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index b2bd94fae7..a3ee30cda2 100644 
--- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -97,6 +97,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) _record_branch_creation_durations(neon_compare, branch_creation_durations) +@pytest.mark.timeout(1000) @pytest.mark.parametrize("n_branches", [500, 1024]) @pytest.mark.parametrize("shape", ["one_ancestor", "random"]) def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): @@ -205,7 +206,7 @@ def wait_and_record_startup_metrics( assert len(matching) == len(expected_labels) return matching - samples = wait_until(metrics_are_filled) + samples = wait_until(metrics_are_filled, timeout=60) for sample in samples: phase = sample.labels["phase"] diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py index 01836b82e9..ed0a6c70bd 100644 --- a/test_runner/performance/test_ingest_insert_bulk.py +++ b/test_runner/performance/test_ingest_insert_bulk.py @@ -52,6 +52,8 @@ def test_ingest_insert_bulk( # would compete with Pageserver for bandwidth. 
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + neon_env_builder.pageserver_config_override = "wait_lsn_timeout='600 s'" + neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers env = neon_env_builder.init_start() @@ -92,7 +94,18 @@ def test_ingest_insert_bulk( worker_rows = rows / CONCURRENCY pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value) - end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + for attempt in range(5): + try: + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + break + except Exception as e: + # if we disable backpressure, postgres can become unresponsive for longer than a minute + # and new connection attempts time out in postgres after 1 minute + # so if this happens we retry new connection + log.error(f"Attempt {attempt + 1}/5: Failed to select current wal lsn: {e}") + if attempt == 4: + log.error("Exceeded maximum retry attempts for selecting current wal lsn") + raise # Wait for pageserver to ingest the WAL. client = env.pageserver.http_client() diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 94fd54bade..293026d40a 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -13,7 +13,7 @@ from fixtures.neon_fixtures import ( ) -@pytest.mark.timeout(600) +@pytest.mark.timeout(1200) @pytest.mark.parametrize("shard_count", [1, 8, 32]) @pytest.mark.parametrize( "wal_receiver_protocol",