mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
First version of a new benchmark to test larger OLTP workload (#11053)
## Problem We want to support larger tenants (regarding logical database size, number of transactions per second etc.) and should increase our test coverage of OLTP transactions at larger scale. ## Summary of changes Start a new benchmark that over time will add more OLTP tests at larger scale. This PR covers the first version and will be extended in further PRs. Also fix some infrastructure: - default for new connections and large tenants is to use connection pooler pgbouncer, however our fixture always added `statement_timeout=120` which is not compatible with pooler [see](https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter) - action to create branch timed out after 10 seconds and 10 retries but for large tenants it can take longer so use increasing back-off for retries ## Test run https://github.com/neondatabase/neon/actions/runs/13593446706
This commit is contained in:
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -32,3 +32,4 @@ config-variables:
|
|||||||
- NEON_DEV_AWS_ACCOUNT_ID
|
- NEON_DEV_AWS_ACCOUNT_ID
|
||||||
- NEON_PROD_AWS_ACCOUNT_ID
|
- NEON_PROD_AWS_ACCOUNT_ID
|
||||||
- AWS_ECR_REGION
|
- AWS_ECR_REGION
|
||||||
|
- BENCHMARK_LARGE_OLTP_PROJECTID
|
||||||
|
|||||||
12
.github/actions/neon-branch-create/action.yml
vendored
12
.github/actions/neon-branch-create/action.yml
vendored
@@ -84,7 +84,13 @@ runs:
|
|||||||
--header "Authorization: Bearer ${API_KEY}"
|
--header "Authorization: Bearer ${API_KEY}"
|
||||||
)
|
)
|
||||||
|
|
||||||
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
|
role_name=$(echo "$roles" | jq --raw-output '
|
||||||
|
(.roles | map(select(.protected == false))) as $roles |
|
||||||
|
if any($roles[]; .name == "neondb_owner")
|
||||||
|
then "neondb_owner"
|
||||||
|
else $roles[0].name
|
||||||
|
end
|
||||||
|
')
|
||||||
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
|
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
|
||||||
env:
|
env:
|
||||||
API_HOST: ${{ inputs.api_host }}
|
API_HOST: ${{ inputs.api_host }}
|
||||||
@@ -107,13 +113,13 @@ runs:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if [ -z "${reset_password}" ]; then
|
if [ -z "${reset_password}" ]; then
|
||||||
sleep 1
|
sleep $i
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
password=$(echo $reset_password | jq --raw-output '.role.password')
|
password=$(echo $reset_password | jq --raw-output '.role.password')
|
||||||
if [ "${password}" == "null" ]; then
|
if [ "${password}" == "null" ]; then
|
||||||
sleep 1
|
sleep $i # increasing backoff
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
147
.github/workflows/large_oltp_benchmark.yml
vendored
Normal file
147
.github/workflows/large_oltp_benchmark.yml
vendored
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
name: large oltp benchmark
|
||||||
|
|
||||||
|
on:
|
||||||
|
# uncomment to run on push for debugging your PR
|
||||||
|
push:
|
||||||
|
branches: [ bodobolero/synthetic_oltp_workload ]
|
||||||
|
|
||||||
|
schedule:
|
||||||
|
# * is a special character in YAML so you have to quote this string
|
||||||
|
# ┌───────────── minute (0 - 59)
|
||||||
|
# │ ┌───────────── hour (0 - 23)
|
||||||
|
# │ │ ┌───────────── day of the month (1 - 31)
|
||||||
|
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||||
|
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||||
|
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
|
||||||
|
workflow_dispatch: # adds ability to run this manually
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash -euxo pipefail {0}
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
# Allow only one workflow globally because we need dedicated resources which only exist once
|
||||||
|
group: large-oltp-bench-workflow
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
oltp:
|
||||||
|
strategy:
|
||||||
|
fail-fast: false # allow other variants to continue even if one fails
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- target: new_branch
|
||||||
|
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
|
||||||
|
- target: reuse_branch
|
||||||
|
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
|
||||||
|
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
statuses: write
|
||||||
|
id-token: write # aws-actions/configure-aws-credentials
|
||||||
|
env:
|
||||||
|
TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
|
||||||
|
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
|
||||||
|
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||||
|
PG_VERSION: 16 # pre-determined by pre-determined project
|
||||||
|
TEST_OUTPUT: /tmp/test_output
|
||||||
|
BUILD_TYPE: remote
|
||||||
|
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
|
||||||
|
PLATFORM: ${{ matrix.target }}
|
||||||
|
|
||||||
|
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||||
|
container:
|
||||||
|
image: neondatabase/build-tools:pinned-bookworm
|
||||||
|
credentials:
|
||||||
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
options: --init
|
||||||
|
|
||||||
|
# Increase timeout to 8h, default timeout is 6h
|
||||||
|
timeout-minutes: 480
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Configure AWS credentials # necessary to download artefacts
|
||||||
|
uses: aws-actions/configure-aws-credentials@v4
|
||||||
|
with:
|
||||||
|
aws-region: eu-central-1
|
||||||
|
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||||
|
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
|
||||||
|
|
||||||
|
- name: Download Neon artifact
|
||||||
|
uses: ./.github/actions/download
|
||||||
|
with:
|
||||||
|
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||||
|
path: /tmp/neon/
|
||||||
|
prefix: latest
|
||||||
|
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||||
|
|
||||||
|
- name: Create Neon Branch for large tenant
|
||||||
|
if: ${{ matrix.target == 'new_branch' }}
|
||||||
|
id: create-neon-branch-oltp-target
|
||||||
|
uses: ./.github/actions/neon-branch-create
|
||||||
|
with:
|
||||||
|
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
|
||||||
|
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||||
|
|
||||||
|
- name: Set up Connection String
|
||||||
|
id: set-up-connstr
|
||||||
|
run: |
|
||||||
|
case "${{ matrix.target }}" in
|
||||||
|
new_branch)
|
||||||
|
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
|
||||||
|
;;
|
||||||
|
reuse_branch)
|
||||||
|
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo >&2 "Unknown target=${{ matrix.target }}"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Benchmark pgbench with custom-scripts
|
||||||
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
with:
|
||||||
|
build_type: ${{ env.BUILD_TYPE }}
|
||||||
|
test_selection: performance
|
||||||
|
run_in_parallel: false
|
||||||
|
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||||
|
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
|
||||||
|
pg_version: ${{ env.PG_VERSION }}
|
||||||
|
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||||
|
env:
|
||||||
|
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||||
|
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||||
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
|
|
||||||
|
- name: Delete Neon Branch for large tenant
|
||||||
|
if: ${{ always() && matrix.target == 'new_branch' }}
|
||||||
|
uses: ./.github/actions/neon-branch-delete
|
||||||
|
with:
|
||||||
|
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
|
||||||
|
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
|
||||||
|
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||||
|
|
||||||
|
- name: Create Allure report
|
||||||
|
id: create-allure-report
|
||||||
|
if: ${{ !cancelled() }}
|
||||||
|
uses: ./.github/actions/allure-report-generate
|
||||||
|
with:
|
||||||
|
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||||
|
|
||||||
|
- name: Post to a Slack channel
|
||||||
|
if: ${{ github.event.schedule && failure() }}
|
||||||
|
uses: slackapi/slack-github-action@v1
|
||||||
|
with:
|
||||||
|
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
|
||||||
|
slack-message: |
|
||||||
|
Periodic large oltp perf testing: ${{ job.status }}
|
||||||
|
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||||
|
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||||
|
env:
|
||||||
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
@@ -253,10 +253,15 @@ class PgProtocol:
|
|||||||
# enough for our tests, but if you need a longer, you can
|
# enough for our tests, but if you need a longer, you can
|
||||||
# change it by calling "SET statement_timeout" after
|
# change it by calling "SET statement_timeout" after
|
||||||
# connecting.
|
# connecting.
|
||||||
|
# pooler does not support statement_timeout
|
||||||
|
# Check if the hostname contains the string 'pooler'
|
||||||
|
hostname = result.get("host", "")
|
||||||
|
log.info(f"Hostname: {hostname}")
|
||||||
options = result.get("options", "")
|
options = result.get("options", "")
|
||||||
if "statement_timeout" not in options:
|
if "statement_timeout" not in options and "pooler" not in hostname:
|
||||||
options = f"-cstatement_timeout=120s {options}"
|
options = f"-cstatement_timeout=120s {options}"
|
||||||
result["options"] = options
|
result["options"] = options
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# autocommit=True here by default because that's what we need most of the time
|
# autocommit=True here by default because that's what we need most of the time
|
||||||
|
|||||||
@@ -0,0 +1,47 @@
|
|||||||
|
\set event_type random(1,10)
|
||||||
|
\set service_key random(1, 3)
|
||||||
|
|
||||||
|
INSERT INTO webhook.incoming_webhooks (
|
||||||
|
created_at,
|
||||||
|
delivery_id,
|
||||||
|
upstream_emitted_at,
|
||||||
|
service_key,
|
||||||
|
event_id,
|
||||||
|
source,
|
||||||
|
body,
|
||||||
|
json,
|
||||||
|
additional_data,
|
||||||
|
is_body_encrypted,
|
||||||
|
event_type
|
||||||
|
) VALUES (
|
||||||
|
now(),
|
||||||
|
gen_random_uuid(),
|
||||||
|
now() - interval '10 minutes',
|
||||||
|
CASE :service_key::int
|
||||||
|
WHEN 1 THEN 'shopify'
|
||||||
|
WHEN 2 THEN 'stripe'
|
||||||
|
WHEN 3 THEN 'github'
|
||||||
|
END,
|
||||||
|
'evt_' || gen_random_uuid(), -- Ensures uniqueness
|
||||||
|
CASE :service_key::int
|
||||||
|
WHEN 1 THEN 'Shopify'
|
||||||
|
WHEN 2 THEN 'Stripe'
|
||||||
|
WHEN 3 THEN 'GitHub'
|
||||||
|
END,
|
||||||
|
'{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}',
|
||||||
|
'{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb,
|
||||||
|
'{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb,
|
||||||
|
false,
|
||||||
|
CASE :event_type::int
|
||||||
|
WHEN 1 THEN 'ORDER_PLACED'
|
||||||
|
WHEN 2 THEN 'ORDER_CANCELLED'
|
||||||
|
WHEN 3 THEN 'PAYMENT_SUCCESSFUL'
|
||||||
|
WHEN 4 THEN 'PAYMENT_FAILED'
|
||||||
|
WHEN 5 THEN 'CUSTOMER_CREATED'
|
||||||
|
WHEN 6 THEN 'CUSTOMER_UPDATED'
|
||||||
|
WHEN 7 THEN 'PRODUCT_UPDATED'
|
||||||
|
WHEN 8 THEN 'INVENTORY_LOW'
|
||||||
|
WHEN 9 THEN 'SHIPPING_DISPATCHED'
|
||||||
|
WHEN 10 THEN 'REFUND_ISSUED'
|
||||||
|
END
|
||||||
|
);
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
-- Zipfian distributions model real-world access patterns where:
|
||||||
|
-- A few values (popular IDs) are accessed frequently.
|
||||||
|
-- Many values are accessed rarely.
|
||||||
|
-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed.
|
||||||
|
|
||||||
|
\set alpha 1.2
|
||||||
|
\set min_id 1
|
||||||
|
\set max_id 135000000
|
||||||
|
|
||||||
|
\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha)
|
||||||
|
|
||||||
|
SELECT *
|
||||||
|
FROM webhook.incoming_webhooks
|
||||||
|
WHERE id = (:zipf_random_id)::bigint
|
||||||
|
LIMIT 1;
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
-- select one of the most recent webhook records (created in the branch timeline during the bench run)
|
||||||
|
SELECT *
|
||||||
|
FROM webhook.incoming_webhooks
|
||||||
|
WHERE id = (
|
||||||
|
SELECT (floor(random() * (
|
||||||
|
(SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1
|
||||||
|
) + 1350000001))::bigint
|
||||||
|
)
|
||||||
|
LIMIT 1;
|
||||||
90
test_runner/performance/test_perf_oltp_large_tenant.py
Normal file
90
test_runner/performance/test_perf_oltp_large_tenant.py
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import timeit
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fixtures.benchmark_fixture import PgBenchRunResult
|
||||||
|
from fixtures.compare_fixtures import PgCompare
|
||||||
|
|
||||||
|
from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp
|
||||||
|
|
||||||
|
|
||||||
|
def get_custom_scripts(
|
||||||
|
default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4",
|
||||||
|
) -> list[str]:
|
||||||
|
# We parametrize each run with the custom scripts to run and their weights.
|
||||||
|
# The custom scripts and their weights are passed through TEST_PGBENCH_CUSTOM_SCRIPTS env variable.
|
||||||
|
# Delimit the custom scripts for one run by spaces and for different runs by commas, for example:
|
||||||
|
# "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2"
|
||||||
|
# Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable.
|
||||||
|
scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=str(default))
|
||||||
|
rv = []
|
||||||
|
for s in scripts.split(","):
|
||||||
|
rv.append(s)
|
||||||
|
return rv
|
||||||
|
|
||||||
|
|
||||||
|
def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int):
|
||||||
|
password = env.pg.default_options.get("password", None)
|
||||||
|
options = env.pg.default_options.get("options", "")
|
||||||
|
# drop password from the connection string by passing password=None and set password separately
|
||||||
|
connstr = env.pg.connstr(password=None, options=options)
|
||||||
|
# if connstr does not contain pooler we can set statement_timeout to 0
|
||||||
|
if "pooler" not in connstr:
|
||||||
|
options = "-cstatement_timeout=0 " + env.pg.default_options.get("options", "")
|
||||||
|
connstr = env.pg.connstr(password=None, options=options)
|
||||||
|
|
||||||
|
script_args = [
|
||||||
|
"pgbench",
|
||||||
|
"-n", # no explicit vacuum before the test - we want to rely on auto-vacuum
|
||||||
|
"-M",
|
||||||
|
"prepared",
|
||||||
|
"--client=500",
|
||||||
|
"--jobs=100",
|
||||||
|
f"-T{duration}",
|
||||||
|
"-P60", # progress every minute
|
||||||
|
"--progress-timestamp",
|
||||||
|
]
|
||||||
|
for script in custom_scripts.split():
|
||||||
|
script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"])
|
||||||
|
script_args.append(connstr)
|
||||||
|
|
||||||
|
run_pgbench(
|
||||||
|
env,
|
||||||
|
"custom-scripts",
|
||||||
|
script_args,
|
||||||
|
password=password,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None):
|
||||||
|
environ: dict[str, str] = {}
|
||||||
|
if password is not None:
|
||||||
|
environ["PGPASSWORD"] = password
|
||||||
|
|
||||||
|
run_start_timestamp = utc_now_timestamp()
|
||||||
|
t0 = timeit.default_timer()
|
||||||
|
out = env.pg_bin.run_capture(cmdline, env=environ)
|
||||||
|
run_duration = timeit.default_timer() - t0
|
||||||
|
run_end_timestamp = utc_now_timestamp()
|
||||||
|
env.flush()
|
||||||
|
|
||||||
|
stdout = Path(f"{out}.stdout").read_text()
|
||||||
|
|
||||||
|
res = PgBenchRunResult.parse_from_stdout(
|
||||||
|
stdout=stdout,
|
||||||
|
run_duration=run_duration,
|
||||||
|
run_start_timestamp=run_start_timestamp,
|
||||||
|
run_end_timestamp=run_end_timestamp,
|
||||||
|
)
|
||||||
|
env.zenbenchmark.record_pg_bench_result(prefix, res)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("custom_scripts", get_custom_scripts())
|
||||||
|
@pytest.mark.parametrize("duration", get_durations_matrix())
|
||||||
|
@pytest.mark.remote_cluster
|
||||||
|
def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int):
|
||||||
|
run_test_pgbench(remote_compare, custom_scripts, duration)
|
||||||
|
# todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration
|
||||||
Reference in New Issue
Block a user