First version of a new benchmark to test larger OLTP workloads (#11053)

## Problem

We want to support larger tenants (in terms of logical database size,
number of transactions per second, etc.) and should therefore increase our
test coverage of OLTP transactions at larger scale.

## Summary of changes

Start a new benchmark that will, over time, accumulate more OLTP tests at
larger scale.
This PR covers the first version and will be extended in follow-up PRs.

Also fix some infrastructure:
- The default for new connections to large tenants is to go through the
PgBouncer connection pooler; however, our fixture always added
`statement_timeout=120`, which the pooler does not accept as a startup parameter
([see connection errors](https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter)).
The fixture now skips this option when the hostname indicates a pooled endpoint.
- The branch-creation action gave up after 10 retries with a fixed 1-second
sleep (roughly 10 seconds total), but for large tenants branch creation can
take longer, so the retries now use an increasing back-off (sketched below).
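
For illustration, a minimal Python sketch of the retry pattern the action now uses (the real implementation is the bash change further down that replaces `sleep 1` with `sleep $i`; `check_ready` here is a hypothetical stand-in for the API probe):

```python
import time


def wait_until_ready(check_ready, max_retries: int = 10) -> bool:
    """Poll with an increasing back-off instead of a fixed 1-second sleep.

    A fixed sleep gives up after ~10 seconds; an increasing back-off
    (1 + 2 + ... + 10 = 55 seconds) leaves more time for branch creation
    on large tenants.
    """
    for attempt in range(1, max_retries + 1):
        if check_ready():       # hypothetical readiness probe, e.g. an API request
            return True
        time.sleep(attempt)     # back-off grows with the attempt number, like `sleep $i`
    return False
```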

## Test run

https://github.com/neondatabase/neon/actions/runs/13593446706
Author: Peter Bendel
Date: 2025-03-03 16:25:48 +01:00
Committed by: GitHub
Parent: 38277497fd
Commit: a07599949f
8 changed files with 324 additions and 4 deletions

View File

@@ -32,3 +32,4 @@ config-variables:
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID

View File

@@ -84,7 +84,13 @@ runs:
--header "Authorization: Bearer ${API_KEY}"
)
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
role_name=$(echo "$roles" | jq --raw-output '
(.roles | map(select(.protected == false))) as $roles |
if any($roles[]; .name == "neondb_owner")
then "neondb_owner"
else $roles[0].name
end
')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
@@ -107,13 +113,13 @@ runs:
)
if [ -z "${reset_password}" ]; then
sleep 1
sleep $i
continue
fi
password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep 1
sleep $i # increasing backoff
continue
fi
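
The new jq filter above prefers the `neondb_owner` role and only falls back to the first unprotected role when it is absent. A rough Python equivalent of the same selection logic, assuming the role payload shape returned by the API call above:

```python
from __future__ import annotations


def pick_role_name(roles_response: dict) -> str | None:
    """Prefer 'neondb_owner'; otherwise take the first unprotected role."""
    unprotected = [r for r in roles_response.get("roles", []) if not r.get("protected")]
    if any(r.get("name") == "neondb_owner" for r in unprotected):
        return "neondb_owner"
    return unprotected[0]["name"] if unprotected else None


# pick_role_name({"roles": [{"name": "readonly", "protected": False},
#                           {"name": "neondb_owner", "protected": False}]})
# -> "neondb_owner"
```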

View File

@@ -0,0 +1,147 @@
name: large oltp benchmark
on:
# uncomment to run on push for debugging your PR
push:
branches: [ bodobolero/synthetic_oltp_workload ]
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: true
jobs:
oltp:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # run the matrix targets sequentially so their results can be compared
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
PG_VERSION: 16 # fixed by the pre-created benchmark project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Branch for large tenant
if: ${{ matrix.target == 'new_branch' }}
id: create-neon-branch-oltp-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Periodic large oltp perf testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
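
A note on the `custom_scripts` matrix values: the `@N` suffix is pgbench's `-f filename@weight` syntax, so `insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4` makes roughly 20% of transactions inserts and 80% selects. A sketch of how the value ends up on the pgbench command line (mirroring the test below; the 3600-second duration assumes the `1h` matrix entry):

```python
custom_scripts = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4"

args = ["pgbench", "-n", "-M", "prepared", "--client=500", "--jobs=100", "-T3600", "-P60"]
for script in custom_scripts.split():
    # pgbench treats the part after '@' as the relative weight of the script
    args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"])
# args now ends with:
#   -f test_runner/performance/large_synthetic_oltp/insert_webhooks.sql@2
#   -f test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql@4
#   -f test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql@4
```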

View File

@@ -253,10 +253,15 @@ class PgProtocol:
# enough for our tests, but if you need a longer, you can
# change it by calling "SET statement_timeout" after
# connecting.
# pooler does not support statement_timeout
# Check if the hostname contains the string 'pooler'
hostname = result.get("host", "")
log.info(f"Hostname: {hostname}")
options = result.get("options", "")
if "statement_timeout" not in options:
if "statement_timeout" not in options and "pooler" not in hostname:
options = f"-cstatement_timeout=120s {options}"
result["options"] = options
return result
# autocommit=True here by default because that's what we need most of the time
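
The hostname check works because Neon's pooled endpoints carry a `-pooler` suffix in the hostname, while pgbouncer rejects unknown startup parameters such as `options` (see the connection-errors doc linked above). A hedged illustration with placeholder hostnames and credentials:

```python
# Direct endpoint: injecting the startup option is fine.
direct = "postgresql://user:secret@ep-example-123456.us-east-2.aws.neon.tech/neondb?options=-cstatement_timeout%3D120s"

# Pooled endpoint: the hostname contains 'pooler', so the fixture no longer
# injects '-cstatement_timeout=...'; pgbouncer would reject it at startup.
pooled = "postgresql://user:secret@ep-example-123456-pooler.us-east-2.aws.neon.tech/neondb"
```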

View File

@@ -0,0 +1,47 @@
\set event_type random(1,10)
\set service_key random(1, 3)
INSERT INTO webhook.incoming_webhooks (
created_at,
delivery_id,
upstream_emitted_at,
service_key,
event_id,
source,
body,
json,
additional_data,
is_body_encrypted,
event_type
) VALUES (
now(),
gen_random_uuid(),
now() - interval '10 minutes',
CASE :service_key::int
WHEN 1 THEN 'shopify'
WHEN 2 THEN 'stripe'
WHEN 3 THEN 'github'
END,
'evt_' || gen_random_uuid(), -- Ensures uniqueness
CASE :service_key::int
WHEN 1 THEN 'Shopify'
WHEN 2 THEN 'Stripe'
WHEN 3 THEN 'GitHub'
END,
'{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}',
'{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb,
'{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb,
false,
CASE :event_type::int
WHEN 1 THEN 'ORDER_PLACED'
WHEN 2 THEN 'ORDER_CANCELLED'
WHEN 3 THEN 'PAYMENT_SUCCESSFUL'
WHEN 4 THEN 'PAYMENT_FAILED'
WHEN 5 THEN 'CUSTOMER_CREATED'
WHEN 6 THEN 'CUSTOMER_UPDATED'
WHEN 7 THEN 'PRODUCT_UPDATED'
WHEN 8 THEN 'INVENTORY_LOW'
WHEN 9 THEN 'SHIPPING_DISPATCHED'
WHEN 10 THEN 'REFUND_ISSUED'
END
);

View File

@@ -0,0 +1,15 @@
-- Zipfian distributions model real-world access patterns where:
-- A few values (popular IDs) are accessed frequently.
-- Many values are accessed rarely.
-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed.
\set alpha 1.2
\set min_id 1
\set max_id 135000000
\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha)
SELECT *
FROM webhook.incoming_webhooks
WHERE id = (:zipf_random_id)::bigint
LIMIT 1;
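
To make the skew concrete, here is a small Python sketch of a bounded zipfian sampler, similar in spirit to pgbench's `random_zipfian(min, max, alpha)` (a tiny id range is used so the weights stay printable; the benchmark itself spans ids up to 135,000,000):

```python
import random
from collections import Counter


def zipf_sample(min_id: int, max_id: int, alpha: float, n: int) -> Counter:
    """Draw n ids where P(id = k) is proportional to 1 / k**alpha."""
    ids = list(range(min_id, max_id + 1))
    weights = [1.0 / k**alpha for k in ids]
    return Counter(random.choices(ids, weights=weights, k=n))


print(zipf_sample(1, 10, 1.2, 10_000).most_common(3))
# id 1 receives roughly 40% of all samples at alpha=1.2 over 1..10,
# while the high ids are rarely hit: a few hot rows and a long tail.
```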

View File

@@ -0,0 +1,9 @@
-- select one of the most recent webhook records (created in the branch timeline during the bench run)
SELECT *
FROM webhook.incoming_webhooks
WHERE id = (
SELECT (floor(random() * (
(SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1
) + 1350000001))::bigint
)
LIMIT 1;

View File

@@ -0,0 +1,90 @@
from __future__ import annotations
import os
import timeit
from pathlib import Path
import pytest
from fixtures.benchmark_fixture import PgBenchRunResult
from fixtures.compare_fixtures import PgCompare
from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp


def get_custom_scripts(
default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4",
) -> list[str]:
# We parametrize each run with the custom scripts to run and their weights.
# The custom scripts and their weights are passed through TEST_PGBENCH_CUSTOM_SCRIPTS env variable.
# Delimit the custom scripts for one run by spaces and for different runs by commas, for example:
# "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2"
# Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable.
scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=str(default))
rv = []
for s in scripts.split(","):
rv.append(s)
return rv


def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int):
password = env.pg.default_options.get("password", None)
options = env.pg.default_options.get("options", "")
# drop password from the connection string by passing password=None and set password separately
connstr = env.pg.connstr(password=None, options=options)
# if connstr does not contain pooler we can set statement_timeout to 0
if "pooler" not in connstr:
options = "-cstatement_timeout=0 " + env.pg.default_options.get("options", "")
connstr = env.pg.connstr(password=None, options=options)
script_args = [
"pgbench",
"-n", # no explicit vacuum before the test - we want to rely on auto-vacuum
"-M",
"prepared",
"--client=500",
"--jobs=100",
f"-T{duration}",
"-P60", # progress every minute
"--progress-timestamp",
]
for script in custom_scripts.split():
script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"])
script_args.append(connstr)
run_pgbench(
env,
"custom-scripts",
script_args,
password=password,
)


def run_pgbench(env: PgCompare, prefix: str, cmdline, password: str | None):
environ: dict[str, str] = {}
if password is not None:
environ["PGPASSWORD"] = password
run_start_timestamp = utc_now_timestamp()
t0 = timeit.default_timer()
out = env.pg_bin.run_capture(cmdline, env=environ)
run_duration = timeit.default_timer() - t0
run_end_timestamp = utc_now_timestamp()
env.flush()
stdout = Path(f"{out}.stdout").read_text()
res = PgBenchRunResult.parse_from_stdout(
stdout=stdout,
run_duration=run_duration,
run_start_timestamp=run_start_timestamp,
run_end_timestamp=run_end_timestamp,
)
env.zenbenchmark.record_pg_bench_result(prefix, res)
@pytest.mark.parametrize("custom_scripts", get_custom_scripts())
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int):
run_test_pgbench(remote_compare, custom_scripts, duration)
# todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration
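
For reference, a hedged example of how the comma/space convention described in `get_custom_scripts` plays out: each comma-separated entry becomes one parametrized run of `test_perf_oltp_large_tenant`, and the space-separated scripts inside an entry all go into a single pgbench invocation:

```python
import os

# Two runs: a select-heavy mix and an insert-heavy mix.
os.environ["TEST_PGBENCH_CUSTOM_SCRIPTS"] = (
    "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,"
    "insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2"
)

print(get_custom_scripts())  # uses the function defined above
# ['insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4',
#  'insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2']
```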