diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1e6c2d0aa2..667ff7f92e 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -32,3 +32,4 @@ config-variables: - NEON_DEV_AWS_ACCOUNT_ID - NEON_PROD_AWS_ACCOUNT_ID - AWS_ECR_REGION + - BENCHMARK_LARGE_OLTP_PROJECTID diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 9f752d5a89..71dd6f3af2 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -84,7 +84,13 @@ runs: --header "Authorization: Bearer ${API_KEY}" ) - role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name') + role_name=$(echo "$roles" | jq --raw-output ' + (.roles | map(select(.protected == false))) as $roles | + if any($roles[]; .name == "neondb_owner") + then "neondb_owner" + else $roles[0].name + end + ') echo "role_name=${role_name}" >> $GITHUB_OUTPUT env: API_HOST: ${{ inputs.api_host }} @@ -107,13 +113,13 @@ runs: ) if [ -z "${reset_password}" ]; then - sleep 1 + sleep $i continue fi password=$(echo $reset_password | jq --raw-output '.role.password') if [ "${password}" == "null" ]; then - sleep 1 + sleep $i # increasing backoff continue fi diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml new file mode 100644 index 0000000000..f33e11cd08 --- /dev/null +++ b/.github/workflows/large_oltp_benchmark.yml @@ -0,0 +1,147 @@ +name: large oltp benchmark + +on: + # uncomment to run on push for debugging your PR + push: + branches: [ bodobolero/synthetic_oltp_workload ] + + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: large-oltp-bench-workflow + cancel-in-progress: true + +jobs: + oltp: + strategy: + fail-fast: false # allow other variants to continue even if one fails + matrix: + include: + - target: new_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + - target: reuse_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h + TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + PG_VERSION: 16 # pre-determined by pre-determined project + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }} + PLATFORM: ${{ matrix.target }} + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + # Increase timeout to 8h, default timeout is 6h + timeout-minutes: 480 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target == 'new_branch' }} + id: create-neon-branch-oltp-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${{ matrix.target }}" in + new_branch) + CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} + ;; + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Benchmark pgbench with custom-scripts + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target == 'new_branch' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Periodic large oltp perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 53df10be49..3aa018e99e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -253,10 +253,15 @@ class PgProtocol: # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. + # pooler does not support statement_timeout + # Check if the hostname contains the string 'pooler' + hostname = result.get("host", "") + log.info(f"Hostname: {hostname}") options = result.get("options", "") - if "statement_timeout" not in options: + if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" result["options"] = options + return result # autocommit=True here by default because that's what we need most of the time diff --git a/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql new file mode 100644 index 0000000000..69e6366a53 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql @@ -0,0 +1,47 @@ +\set event_type random(1,10) +\set service_key random(1, 3) + +INSERT INTO webhook.incoming_webhooks ( + created_at, + delivery_id, + upstream_emitted_at, + service_key, + event_id, + source, + body, + json, + additional_data, + is_body_encrypted, + event_type +) VALUES ( + now(), + gen_random_uuid(), + now() - interval '10 minutes', + CASE :service_key::int + WHEN 1 THEN 'shopify' + WHEN 2 THEN 'stripe' + WHEN 3 THEN 'github' + END, + 'evt_' || gen_random_uuid(), -- Ensures uniqueness + CASE :service_key::int + WHEN 1 THEN 'Shopify' + WHEN 2 THEN 'Stripe' + WHEN 3 THEN 'GitHub' + END, + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}', + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb, + '{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb, + false, + CASE :event_type::int + WHEN 1 THEN 'ORDER_PLACED' + WHEN 2 THEN 'ORDER_CANCELLED' + WHEN 3 THEN 'PAYMENT_SUCCESSFUL' + WHEN 4 THEN 'PAYMENT_FAILED' + WHEN 5 THEN 'CUSTOMER_CREATED' + WHEN 6 THEN 'CUSTOMER_UPDATED' + WHEN 7 THEN 'PRODUCT_UPDATED' + WHEN 8 THEN 'INVENTORY_LOW' + WHEN 9 THEN 'SHIPPING_DISPATCHED' + WHEN 10 THEN 'REFUND_ISSUED' + END +); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql new file mode 100644 index 0000000000..b2f173f011 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql @@ -0,0 +1,15 @@ +-- Zipfian distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. +-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed. + +\set alpha 1.2 +\set min_id 1 +\set max_id 135000000 + +\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha) + +SELECT * +FROM webhook.incoming_webhooks +WHERE id = (:zipf_random_id)::bigint +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql new file mode 100644 index 0000000000..78a843bf0f --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql @@ -0,0 +1,9 @@ +-- select one of the most recent webhook records (created in the branch timeline during the bench run) +SELECT * +FROM webhook.incoming_webhooks +WHERE id = ( + SELECT (floor(random() * ( + (SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1 + ) + 1350000001))::bigint +) +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py new file mode 100644 index 0000000000..ae00dbb3b5 --- /dev/null +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import os +import timeit +from pathlib import Path + +import pytest +from fixtures.benchmark_fixture import PgBenchRunResult +from fixtures.compare_fixtures import PgCompare + +from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp + + +def get_custom_scripts( + default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4", +) -> list[str]: + # We parametrize each run with the custom scripts to run and their weights. + # The custom scripts and their weights are passed through TEST_PGBENCH_CUSTOM_SCRIPTS env variable. + # Delimit the custom scripts for one run by spaces and for different runs by commas, for example: + # "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2" + # Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable. + scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=str(default)) + rv = [] + for s in scripts.split(","): + rv.append(s) + return rv + + +def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int): + password = env.pg.default_options.get("password", None) + options = env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + # if connstr does not contain pooler we can set statement_timeout to 0 + if "pooler" not in connstr: + options = "-cstatement_timeout=0 " + env.pg.default_options.get("options", "") + connstr = env.pg.connstr(password=None, options=options) + + script_args = [ + "pgbench", + "-n", # no explicit vacuum before the test - we want to rely on auto-vacuum + "-M", + "prepared", + "--client=500", + "--jobs=100", + f"-T{duration}", + "-P60", # progress every minute + "--progress-timestamp", + ] + for script in custom_scripts.split(): + script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"]) + script_args.append(connstr) + + run_pgbench( + env, + "custom-scripts", + script_args, + password=password, + ) + + +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = env.pg_bin.run_capture(cmdline, env=environ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + env.flush() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + env.zenbenchmark.record_pg_bench_result(prefix, res) + + +@pytest.mark.parametrize("custom_scripts", get_custom_scripts()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int): + run_test_pgbench(remote_compare, custom_scripts, duration) + # todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration