From ca9d8761ffb71e9eb12631c9da04ae58b468847b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 7 Jul 2025 08:44:41 +0200 Subject: [PATCH] Move some perf benchmarks from hetzner to aws arm github runners (#12393) ## Problem We want to move some benchmarks from hetzner runners to aws graviton runners ## Summary of changes Adjust the runner labels for some workflows. Adjust the pagebench number of clients to match the latency knee at 8 cores of the new instance type Add `--security-opt seccomp=unconfined` to docker run command to bypass IO_URING EPERM error. ## New runners https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Instances:instanceState=running;search=:github-unit-perf-runner-arm;v=3;$case=tags:true%5C,client:false;$regex=tags:false%5C,client:false;sort=tag:Name ## Important Notes I added the run-benchmarks label to get this tested **before we merge it**. [See](https://github.com/neondatabase/neon/actions/runs/15974141360) I also tested a run of pagebench with the new setup from this branch, see https://github.com/neondatabase/neon/actions/runs/15972523054 - Update: the benchmarking workflow had failures, [see](https://github.com/neondatabase/neon/actions/runs/15974141360/job/45055897591) - changed docker run command to avoid io_uring EPERM error, new run [see](https://github.com/neondatabase/neon/actions/runs/15997965633/job/45125689920?pr=12393) Update: the pagebench test run on the new runner [completed successfully](https://github.com/neondatabase/neon/actions/runs/15972523054/job/45046772556) Update 2025-07-07: the latest runs with instance store ext4 have been successful and resolved the direct I/O issues we have been seeing before in some runs. We only had one perf testcase failing (shard split) that had been flaky before. So I think we can merge this now. 
## Follow up if this is merged and works successfully we must create a separate issue to de-provision the hetzner unit-perf runners defined [here](https://github.com/neondatabase/runner/blob/91a41729af5b168d42e8acab0a7c38fde031113e/ansible/inventory/hosts_metal#L111) --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 4 ++-- .github/workflows/periodic_pagebench.yml | 4 ++-- .github/workflows/proxy-benchmark.yml | 4 ++-- ...test_pageserver_max_throughput_getpage_at_latest_lsn.py | 7 ++++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index b7e0be761a..3142a36fa0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -7,6 +7,7 @@ self-hosted-runner: - small-metal - small-arm64 - unit-perf + - unit-perf-aws-arm - us-east-2 config-variables: - AWS_ECR_REGION diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 456c7b8c92..0ceaa96fb0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -306,14 +306,14 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, unit-perf ] + runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined strategy: fail-fast: false matrix: diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 317db94052..728a6d4956 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -1,4 +1,4 @@ -name: Periodic pagebench performance test on 
unit-perf hetzner runner +name: Periodic pagebench performance test on unit-perf-aws-arm runners on: schedule: @@ -40,7 +40,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, unit-perf ] + runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml index 3a98ad4e8e..0ae93ce295 100644 --- a/.github/workflows/proxy-benchmark.yml +++ b/.github/workflows/proxy-benchmark.yml @@ -1,4 +1,4 @@ -name: Periodic proxy performance test on unit-perf hetzner runner +name: Periodic proxy performance test on unit-perf-aws-arm runners on: push: # TODO: remove after testing @@ -32,7 +32,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [self-hosted, unit-perf] + runs-on: [self-hosted, unit-perf-aws-arm] timeout-minutes: 60 # 1h timeout container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 41696bf887..68bfa81b25 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -55,9 +55,10 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("duration", [20 * 60]) @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) # we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability -# we use 64 clients because typically for a high number of connections we recommend the connection pooler -# which by default uses 64 connections -@pytest.mark.parametrize("n_clients", [1, 64]) +# we use 8 clients because 
we see a latency knee around 6-8 clients on im4gn.2xlarge instance type, +# which we use for this periodic test - at a cpu utilization of around 70 % - which is considered +# a good utilization for pageserver. +@pytest.mark.parametrize("n_clients", [1, 8]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(