From ca9d8761ffb71e9eb12631c9da04ae58b468847b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 7 Jul 2025 08:44:41 +0200 Subject: [PATCH] Move some perf benchmarks from hetzner to aws arm github runners (#12393) ## Problem We want to move some benchmarks from hetzner runners to aws graviton runners ## Summary of changes Adjust the runner labels for some workflows. Adjust the pagebench number of clients to match the latency knee at 8 cores of the new instance type Add `--security-opt seccomp=unconfined` to docker run command to bypass IO_URING EPERM error. ## New runners https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Instances:instanceState=running;search=:github-unit-perf-runner-arm;v=3;$case=tags:true%5C,client:false;$regex=tags:false%5C,client:false;sort=tag:Name ## Important Notes I added the run-benchmarks label to get this tested **before we merge it**. [See](https://github.com/neondatabase/neon/actions/runs/15974141360) I also tested a run of pagebench with the new setup from this branch, see https://github.com/neondatabase/neon/actions/runs/15972523054 - Update: the benchmarking workflow had failures, [see](https://github.com/neondatabase/neon/actions/runs/15974141360/job/45055897591) - changed docker run command to avoid io_uring EPERM error, new run [see](https://github.com/neondatabase/neon/actions/runs/15997965633/job/45125689920?pr=12393) Update: the pagebench test run on the new runner [completed successfully](https://github.com/neondatabase/neon/actions/runs/15972523054/job/45046772556) Update 2025-07-07: the latest runs with instance store ext4 have been successful and resolved the direct I/O issues we have been seeing before in some runs. We only had one perf testcase failing (shard split) that had been flaky before. So I think we can merge this now. 
## Follow up if this is merged and works successfully we must create a separate issue to de-provision the hetzner unit-perf runners defined [here](https://github.com/neondatabase/runner/blob/91a41729af5b168d42e8acab0a7c38fde031113e/ansible/inventory/hosts_metal#L111) --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 4 ++-- .github/workflows/periodic_pagebench.yml | 4 ++-- .github/workflows/proxy-benchmark.yml | 4 ++-- ...test_pageserver_max_throughput_getpage_at_latest_lsn.py | 7 ++++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index b7e0be761a..3142a36fa0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -7,6 +7,7 @@ self-hosted-runner: - small-metal - small-arm64 - unit-perf + - unit-perf-aws-arm - us-east-2 config-variables: - AWS_ECR_REGION diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 456c7b8c92..0ceaa96fb0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -306,14 +306,14 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, unit-perf ] + runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined strategy: fail-fast: false matrix: diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 317db94052..728a6d4956 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -1,4 +1,4 @@ -name: Periodic pagebench performance test on 
unit-perf hetzner runner +name: Periodic pagebench performance test on unit-perf-aws-arm runners on: schedule: @@ -40,7 +40,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, unit-perf ] + runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml index 3a98ad4e8e..0ae93ce295 100644 --- a/.github/workflows/proxy-benchmark.yml +++ b/.github/workflows/proxy-benchmark.yml @@ -1,4 +1,4 @@ -name: Periodic proxy performance test on unit-perf hetzner runner +name: Periodic proxy performance test on unit-perf-aws-arm runners on: push: # TODO: remove after testing @@ -32,7 +32,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [self-hosted, unit-perf] + runs-on: [self-hosted, unit-perf-aws-arm] timeout-minutes: 60 # 1h timeout container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 41696bf887..68bfa81b25 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -55,9 +55,10 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("duration", [20 * 60]) @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) # we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability -# we use 64 clients because typically for a high number of connections we recommend the connection pooler -# which by default uses 64 connections -@pytest.mark.parametrize("n_clients", [1, 64]) +# we use 8 clients because 
we see a latency knee around 6-8 clients on im4gn.2xlarge instance type, +# which we use for this periodic test - at a cpu utilization of around 70 % - which is considered +# a good utilization for pageserver. +@pytest.mark.parametrize("n_clients", [1, 8]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(