Extend large tenant OLTP workload ... (#11166)

... to better match the workload characteristics of real Neon customers

## Problem

We analyzed the workloads of large Neon users and want to extend the OLTP workload to include characteristics seen in those workloads.

## Summary of changes

- for the re-use branch, delete the rows inserted by the last run
- adjust the expected run times (timeouts) in the GitHub workflow
- add queries that exercise the prefetch getpages path
- add I/U/D (insert/update/delete) transactions for another table (so far the workload was insert/append-only)
- add an explicit vacuum analyze step and measure its duration
- add a reindex-concurrently step and measure its duration (and take care that this step succeeds even if prior reindex runs have failed or were canceled)
- derive a second connection string from the pooled one by removing the `-pooler` suffix from the hostname, because we want to run long-running statements (database maintenance) and bypass the pooler, which doesn't support an unlimited statement timeout

## Test run

https://github.com/neondatabase/neon/actions/runs/13851772887/job/38760172415
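
Background for the reindex-recovery bullet: when a REINDEX CONCURRENTLY fails or is canceled, PostgreSQL leaves behind invalid transient indexes with a `_ccnew` suffix, which must be dropped before reindexing again. A minimal sketch of the detection query this cleanup is built around (standard PostgreSQL catalogs; the commit's actual cleanup iterates over this result in a DO block, see the diff below):

-- Sketch: list invalid transient indexes left over from a failed or
-- canceled REINDEX CONCURRENTLY (they carry a _ccnew suffix).
SELECT n.nspname AS schema_name, c.relname AS index_name
FROM pg_class c
JOIN pg_index i ON i.indexrelid = c.oid
JOIN pg_namespace n ON n.oid = c.relnamespace
WHERE NOT i.indisvalid
  AND c.relname LIKE '%_ccnew%';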

.github/workflows/large_oltp_benchmark.yml (89 lines changed):
@@ -2,8 +2,8 @@ name: large oltp benchmark
 on:
   # uncomment to run on push for debugging your PR
-  push:
-    branches: [ bodobolero/synthetic_oltp_workload ]
+  #push:
+  #  branches: [ bodobolero/synthetic_oltp_workload ]

   schedule:
     # * is a special character in YAML so you have to quote this string
@@ -12,7 +12,7 @@ on:
     # │ │ ┌───────────── day of the month (1 - 31)
     # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
     # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
+    - cron: '0 15 * * 0,2,4' # run on Sunday, Tuesday, Thursday at 3 PM UTC
   workflow_dispatch: # adds ability to run this manually

 defaults:
@@ -22,7 +22,7 @@ defaults:
 concurrency:
   # Allow only one workflow globally because we need dedicated resources which only exist once
   group: large-oltp-bench-workflow
-  cancel-in-progress: true
+  cancel-in-progress: false

 jobs:
   oltp:
@@ -31,9 +31,9 @@ jobs:
       matrix:
         include:
           - target: new_branch
-            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
+            custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
           - target: reuse_branch
-            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
+            custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
       max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
     permissions:
       contents: write
@@ -46,7 +46,6 @@ jobs:
       PG_VERSION: 16 # pre-determined by pre-determined project
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
       PLATFORM: ${{ matrix.target }}

     runs-on: [ self-hosted, us-east-2, x64 ]
@@ -57,8 +56,10 @@ jobs:
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init

-    # Increase timeout to 8h, default timeout is 6h
-    timeout-minutes: 480
+    # Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time
+    # (normally 1h pgbench, 3h vacuum analyze, 3.5h re-index) x 2 = 15h, leave some buffer for regressions
+    # in one run vacuum didn't finish within 12 hours
+    timeout-minutes: 2880

     steps:
       - uses: actions/checkout@v4
@@ -89,29 +90,45 @@ jobs:
       - name: Set up Connection String
         id: set-up-connstr
         run: |
           case "${{ matrix.target }}" in
             new_branch)
               CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
               ;;
             reuse_branch)
               CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
               ;;
             *)
               echo >&2 "Unknown target=${{ matrix.target }}"
               exit 1
               ;;
           esac

+          CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}"
+
           echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+          echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT
+
+      - name: Delete rows from prior runs in reuse branch
+        if: ${{ matrix.target == 'reuse_branch' }}
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }}
+          PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
+          PSQL: /tmp/neon/pg_install/v16/bin/psql
+          PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
+        run: |
+          echo "$(date '+%Y-%m-%d %H:%M:%S') - Deleting rows in table webhook.incoming_webhooks from prior runs"
+          export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
+          ${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';"
+          echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs"

       - name: Benchmark pgbench with custom-scripts
         uses: ./.github/actions/run-python-test-set
         with:
           build_type: ${{ env.BUILD_TYPE }}
           test_selection: performance
           run_in_parallel: false
-          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-          extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
+          save_perf_report: true
+          extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench
           pg_version: ${{ env.PG_VERSION }}
           aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
         env:
@@ -119,6 +136,21 @@ jobs:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

+      - name: Benchmark database maintenance
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance
+          run_in_parallel: false
+          save_perf_report: true
+          extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance
+          pg_version: ${{ env.PG_VERSION }}
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }}
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
       - name: Delete Neon Branch for large tenant
         if: ${{ always() && matrix.target == 'new_branch' }}
         uses: ./.github/actions/neon-branch-delete
@@ -127,6 +159,13 @@ jobs:
           branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
           api_key: ${{ secrets.NEON_STAGING_API_KEY }}

+      - name: Configure AWS credentials # again because prior steps could have exceeded 5 hours
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
+
       - name: Create Allure report
         id: create-allure-report
         if: ${{ !cancelled() }}
New file — presumably the pgbench script IUD_one_transaction.sql referenced by custom_scripts above:

@@ -0,0 +1,162 @@
\set min_id 1
\set max_id 1500000000
\set range_size 100

-- Use uniform random instead of random_zipfian
\set random_id random(:min_id, :max_id)
\set random_mar_id random(1, 65536)
\set random_delete_id random(:min_id, :max_id)

-- Update exactly one row (if it exists) using the uniformly chosen random_id
UPDATE transaction.transaction
SET state = 'COMPLETED',
    settlement_date = CURRENT_DATE,
    mar_identifier = (:random_mar_id)::int
WHERE id = (:random_id)::bigint;

-- Insert exactly one row
INSERT INTO transaction.transaction (
    user_id,
    card_id,
    business_id,
    preceding_transaction_id,
    is_last,
    is_mocked,
    type,
    state,
    network,
    subnetwork,
    user_transaction_time,
    settlement_date,
    request_amount,
    amount,
    currency_code,
    approval_code,
    response,
    gpa,
    gpa_order_unload,
    gpa_order,
    program_transfer,
    fee_transfer,
    peer_transfer,
    msa_orders,
    risk_assessment,
    auto_reload,
    direct_deposit,
    polarity,
    real_time_fee_group,
    fee,
    chargeback,
    standin_approved_by,
    acquirer_fee_amount,
    funded_account_holder,
    digital_wallet_token,
    network_fees,
    card_security_code_verification,
    fraud,
    cardholder_authentication_data,
    currency_conversion,
    merchant,
    store,
    card_acceptor,
    acquirer,
    pos,
    avs,
    mar_token,
    mar_preceding_related_transaction_token,
    mar_business_token,
    mar_acting_user_token,
    mar_card_token,
    mar_duration,
    mar_created_time,
    issuer_interchange_amount,
    offer_orders,
    transaction_canonical_id,
    mar_identifier,
    created_at,
    card_acceptor_mid,
    card_acceptor_name,
    address_verification,
    issuing_product,
    mar_enhanced_data_token,
    standin_reason
)
SELECT
    (:random_id % 100000) + 1 AS user_id,
    (:random_id % 500000) + 1 AS card_id,
    (:random_id % 20000) + 1 AS business_id,
    NULL AS preceding_transaction_id,
    (:random_id % 2) = 0 AS is_last,
    (:random_id % 5) = 0 AS is_mocked,
    'authorization' AS type,
    'PENDING' AS state,
    'VISA' AS network,
    'VISANET' AS subnetwork,
    now() - ((:random_id % 100) || ' days')::interval AS user_transaction_time,
    now() - ((:random_id % 100) || ' days')::interval AS settlement_date,
    random() * 1000 AS request_amount,
    random() * 1000 AS amount,
    'USD' AS currency_code,
    md5((:random_id)::text) AS approval_code,
    '{}'::jsonb AS response,
    '{}'::jsonb AS gpa,
    '{}'::jsonb AS gpa_order_unload,
    '{}'::jsonb AS gpa_order,
    '{}'::jsonb AS program_transfer,
    '{}'::jsonb AS fee_transfer,
    '{}'::jsonb AS peer_transfer,
    '{}'::jsonb AS msa_orders,
    '{}'::jsonb AS risk_assessment,
    '{}'::jsonb AS auto_reload,
    '{}'::jsonb AS direct_deposit,
    '{}'::jsonb AS polarity,
    '{}'::jsonb AS real_time_fee_group,
    '{}'::jsonb AS fee,
    '{}'::jsonb AS chargeback,
    NULL AS standin_approved_by,
    random() * 100 AS acquirer_fee_amount,
    '{}'::jsonb AS funded_account_holder,
    '{}'::jsonb AS digital_wallet_token,
    '{}'::jsonb AS network_fees,
    '{}'::jsonb AS card_security_code_verification,
    '{}'::jsonb AS fraud,
    '{}'::jsonb AS cardholder_authentication_data,
    '{}'::jsonb AS currency_conversion,
    '{}'::jsonb AS merchant,
    '{}'::jsonb AS store,
    '{}'::jsonb AS card_acceptor,
    '{}'::jsonb AS acquirer,
    '{}'::jsonb AS pos,
    '{}'::jsonb AS avs,
    md5((:random_id)::text || 'token') AS mar_token,
    NULL AS mar_preceding_related_transaction_token,
    NULL AS mar_business_token,
    NULL AS mar_acting_user_token,
    NULL AS mar_card_token,
    random() * 1000 AS mar_duration,
    now() AS mar_created_time,
    random() * 100 AS issuer_interchange_amount,
    '{}'::jsonb AS offer_orders,
    (:random_id % 500) + 1 AS transaction_canonical_id,
    :random_id::integer AS mar_identifier,
    now() AS created_at,
    NULL AS card_acceptor_mid,
    NULL AS card_acceptor_name,
    '{}'::jsonb AS address_verification,
    'DEFAULT_PRODUCT' AS issuing_product,
    NULL AS mar_enhanced_data_token,
    NULL AS standin_reason
FROM (SELECT 1) AS dummy;

-- Delete exactly one row using the uniformly chosen random_delete_id
WITH to_delete AS (
    SELECT id
    FROM transaction.transaction
    WHERE id >= (:random_delete_id)::bigint
      AND id < ((:random_delete_id)::bigint + :range_size)
    ORDER BY id
    LIMIT 1
)
DELETE FROM transaction.transaction
USING to_delete
WHERE transaction.transaction.id = to_delete.id;
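
One subtlety in the delete above: after many runs a given id may no longer exist, so the script deletes the first surviving row in a 100-id window rather than a fixed id. A hypothetical one-off probe with a hard-coded id (42000000 is an arbitrary example value) shows what the CTE resolves to:

-- Sketch with a fixed id instead of :random_delete_id: returns the one row
-- the script would delete, or no row if the whole window was already deleted.
SELECT id
FROM transaction.transaction
WHERE id >= 42000000
  AND id < 42000000 + 100
ORDER BY id
LIMIT 1;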

New file — presumably the pgbench script select_prefetch_webhook.sql referenced by custom_scripts above:

@@ -0,0 +1,25 @@
-- enforce a controlled number of getpages prefetch requests from a range of
-- the first 40 million pages (320 GB) of a 500 GiB table
-- (the table has 55 million pages)


-- Zipfian distributions model real-world access patterns where:
--   a few values (popular IDs) are accessed frequently,
--   many values are accessed rarely.
-- This is useful for simulating realistic workloads.

\set alpha 1.2
\set min_page 1
\set max_page 40000000

\set zipf_random_page random_zipfian(:min_page, :max_page, :alpha)

-- Read 500 consecutive pages from a Zipfian-distributed random start page
-- This enforces PostgreSQL prefetching
WITH random_page AS (
    SELECT :zipf_random_page::int AS start_page
)
SELECT MAX(created_at)
FROM webhook.incoming_webhooks
WHERE ctid >= (SELECT format('(%s,1)', start_page)::tid FROM random_page)
  AND ctid < (SELECT format('(%s,1)', start_page + 500)::tid FROM random_page);
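
To see what one iteration of this script costs, a hypothetical one-off check with a fixed start page (1000 here; the script draws it from the Zipfian distribution instead) can be run under EXPLAIN. On PostgreSQL 14+ the ctid predicate should use a TID Range Scan, and BUFFERS should report roughly 500 heap pages read per iteration:

-- Sketch: measure pages touched by a single 500-page ctid window.
EXPLAIN (ANALYZE, BUFFERS)
SELECT MAX(created_at)
FROM webhook.incoming_webhooks
WHERE ctid >= '(1000,1)'::tid
  AND ctid < '(1500,1)'::tid;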

In the performance test (presumably test_perf_oltp_large_tenant.py):

@@ -2,11 +2,13 @@ from __future__ import annotations

 import os
 import timeit
+from contextlib import closing
 from pathlib import Path

 import pytest
 from fixtures.benchmark_fixture import PgBenchRunResult
 from fixtures.compare_fixtures import PgCompare
+from fixtures.log_helper import log

 from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp
@@ -82,9 +84,81 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None):
     env.zenbenchmark.record_pg_bench_result(prefix, res)


+def run_database_maintenance(env: PgCompare):
+    with closing(env.pg.connect()) as conn:
+        with conn.cursor() as cur:
+            log.info("start vacuum analyze transaction.transaction")
+            with env.zenbenchmark.record_duration("vacuum_analyze"):
+                cur.execute("SET statement_timeout = 0;")
+                cur.execute("SET max_parallel_maintenance_workers = 7;")
+                cur.execute("SET maintenance_work_mem = '10GB';")
+                cur.execute("vacuum analyze transaction.transaction;")
+            log.info("finished vacuum analyze transaction.transaction")
+
+            # recover previously failed or canceled re-indexing
+            cur.execute(
+                """
+                DO $$
+                DECLARE
+                    invalid_index TEXT;
+                BEGIN
+                    FOR invalid_index IN
+                        SELECT c.relname
+                        FROM pg_class c
+                        JOIN pg_index i ON i.indexrelid = c.oid
+                        JOIN pg_namespace n ON n.oid = c.relnamespace
+                        WHERE n.nspname = 'transaction'
+                          AND i.indisvalid = FALSE
+                          AND c.relname LIKE '%_ccnew%'
+                    LOOP
+                        EXECUTE 'DROP INDEX IF EXISTS transaction.' || invalid_index;
+                    END LOOP;
+                END $$;
+                """
+            )
+            # also recover failed or canceled re-indexing on toast part of table
+            cur.execute(
+                """
+                DO $$
+                DECLARE
+                    invalid_index TEXT;
+                BEGIN
+                    FOR invalid_index IN
+                        SELECT c.relname
+                        FROM pg_class c
+                        JOIN pg_index i ON i.indexrelid = c.oid
+                        JOIN pg_namespace n ON n.oid = c.relnamespace
+                        WHERE n.nspname = 'pg_toast'
+                          AND i.indisvalid = FALSE
+                          AND c.relname LIKE '%_ccnew%'
+                          AND i.indrelid = (
+                              SELECT reltoastrelid FROM pg_class
+                              WHERE relname = 'transaction'
+                                AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'transaction')
+                          )
+                    LOOP
+                        EXECUTE 'DROP INDEX IF EXISTS pg_toast.' || invalid_index;
+                    END LOOP;
+                END $$;
+                """
+            )
+
+            log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction")
+            with env.zenbenchmark.record_duration("reindex concurrently"):
+                cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;")
+            log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction")
+
+
 @pytest.mark.parametrize("custom_scripts", get_custom_scripts())
 @pytest.mark.parametrize("duration", get_durations_matrix())
 @pytest.mark.remote_cluster
-def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int):
+def test_perf_oltp_large_tenant_pgbench(
+    remote_compare: PgCompare, custom_scripts: str, duration: int
+):
     run_test_pgbench(remote_compare, custom_scripts, duration)
-    # todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration
+
+
+@pytest.mark.remote_cluster
+def test_perf_oltp_large_tenant_maintenance(remote_compare: PgCompare):
+    # run analyze, vacuum, re-index after the test and measure and report its duration
+    run_database_maintenance(remote_compare)
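
For reference, the maintenance that test_perf_oltp_large_tenant_maintenance performs boils down to the following SQL session (a sketch distilled from run_database_maintenance above, minus the invalid-index cleanup; note that REINDEX ... CONCURRENTLY cannot run inside a transaction block, and the unlimited statement_timeout is why the workflow needs the non-pooled connection string):

-- Sketch: the maintenance session, as it would look in psql.
SET statement_timeout = 0;
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '10GB';
VACUUM ANALYZE transaction.transaction;
REINDEX TABLE CONCURRENTLY transaction.transaction;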