Bodobolero/test cum stats persistence (#10995)

## Problem

Until now, cumulative statistics have not been persisted when Neon scales to
zero (suspends the endpoint).
With PR https://github.com/neondatabase/neon/pull/6560 the cumulative
statistics should now survive endpoint restarts and correctly trigger
auto-vacuum and auto-analyze maintenance.

We did not yet have a test case that validates this improvement in our
dev cloud environment with a real project.
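
For context, a quick way to observe whether the counters survive a suspend/resume cycle is to compare `pg_stat_user_tables` before and after the restart. The sketch below is illustrative only (it is not part of the committed test) and assumes a psycopg2 connection string for the endpoint and a table named `pgbench_accounts`:

```python
import psycopg2

STATS_SQL = """
    SELECT n_tup_ins, n_mod_since_analyze, last_autovacuum, last_autoanalyze
    FROM pg_stat_user_tables
    WHERE relname = 'pgbench_accounts'
"""

def snapshot_stats(connstr: str):
    # Read the cumulative counters for one table; with persistence in place the
    # values observed before a suspend should still be present after a resume.
    conn = psycopg2.connect(connstr)
    try:
        with conn.cursor() as cur:
            cur.execute(STATS_SQL)
            return cur.fetchone()
    finally:
        conn.close()

# Usage (hypothetical): snapshot once, suspend/resume the endpoint, snapshot again
# and compare; before PR #6560 these counters were lost on restart.
```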

## Summary of changes

Introduce the test case `test_cumulative_statistics_persistence` in the
benchmarking workflow running daily. It verifies that the cumulative
statistics are correctly persisted across restarts. Persisting cumulative
statistics across restarts matters because they are used to decide when
auto-vacuum and auto-analyze trigger conditions are met.

The test performs the following steps (see the worked threshold example
after this list):

- Seed a new project using pgbench
- insert tuples that by themselves are not enough to trigger auto-vacuum
- suspend the endpoint
- resume the endpoint
- insert additional tuples that by themselves are not enough to trigger
  auto-vacuum but in combination with the previous tuples are
- verify that autovacuum is triggered by the combination of tuples
  inserted before and after endpoint suspension
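
For intuition, here is the arithmetic behind "not enough on its own, enough in combination". The numbers below are illustrative and assume the stock PostgreSQL defaults `autovacuum_vacuum_insert_threshold = 1000` and `autovacuum_vacuum_insert_scale_factor = 0.2`; the test itself reads the actual values from `pg_settings` at runtime.

```python
# Stock PostgreSQL defaults, assumed here for illustration only:
threshold = 1000 + 0.2 * 6_800_000        # autovacuum insert trigger ~= 1_361_000 tuples
chunk = 1000 / 2 + 6_800_000 * 0.2 * 0.6  # rows inserted per chunk   ~=   816_500 rows

assert chunk < threshold       # a single chunk does not trigger autovacuum
assert 2 * chunk > threshold   # both chunks together do, but only if the first chunk's
                               # statistics survived the endpoint suspension
```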

## Test run


https://github.com/neondatabase/neon/actions/runs/13546879714/job/37860609089#step:6:282
Commit 3a3d62dc4f (parent a22be5af72), authored by Peter Bendel and committed via GitHub on 2025-02-27 11:45:13 +01:00. 2 changed files with 277 additions and 0 deletions.

In the benchmarking workflow, the new test is excluded from the general performance test step because it runs in a dedicated job (added below):

```yaml
@@ -140,6 +140,7 @@ jobs:
            --ignore test_runner/performance/test_logical_replication.py
            --ignore test_runner/performance/test_physical_replication.py
            --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
            --ignore test_runner/performance/test_cumulative_statistics_persistence.py
        env:
          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
```

The dedicated `cumstats-test` job:

```yaml
@@ -171,6 +172,61 @@ jobs:
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  cumstats-test:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    permissions:
      contents: write
      statuses: write
      id-token: write # aws-actions/configure-aws-credentials
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 17
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: "neon-staging"
    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
      image: neondatabase/build-tools:pinned-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    steps:
      - uses: actions/checkout@v4
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: eu-central-1
          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          role-duration-seconds: 18000 # 5 hours
      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      - name: Verify that cumulative statistics are preserved
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: performance/test_cumulative_statistics_persistence.py
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 3600
          pg_version: ${{ env.DEFAULT_PG_VERSION }}
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

  replication-tests:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    permissions:
```

The new test, `test_runner/performance/test_cumulative_statistics_persistence.py`:

```python
@@ -0,0 +1,221 @@
import math
import time
import traceback
from pathlib import Path

import psycopg2
import psycopg2.extras
import pytest

from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.log_helper import log
from fixtures.neon_api import NeonAPI, connection_parameters_to_env
from fixtures.neon_fixtures import PgBin
from fixtures.pg_version import PgVersion

vacuum_times_sql = """
SELECT
    relname AS table_name,
    last_autovacuum,
    last_autoanalyze
FROM
    pg_stat_user_tables where relname = 'pgbench_accounts'
ORDER BY
    last_autovacuum DESC, last_autoanalyze DESC
"""

def insert_first_chunk_and_verify_autovacuum_is_not_running(
    cur, rows_to_insert, autovacuum_naptime
):
    cur.execute(f"""
        INSERT INTO pgbench_accounts (aid, bid, abalance, filler)
        SELECT
            aid,
            (random() * 10)::int + 1 AS bid,
            (random() * 10000)::int AS abalance,
            'filler text' AS filler
        FROM generate_series(6800001, {6800001 + rows_to_insert - 1}) AS aid;
    """)
    assert cur.rowcount == rows_to_insert
    for _ in range(5):
        time.sleep(0.5 * autovacuum_naptime)
        cur.execute(vacuum_times_sql)
        row = cur.fetchall()[0]
        log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}")
        assert row[1] is None


def insert_second_chunk_and_verify_autovacuum_is_now_running(
    cur, rows_to_insert, autovacuum_naptime
):
    cur.execute(f"""
        INSERT INTO pgbench_accounts (aid, bid, abalance, filler)
        SELECT
            aid,
            (random() * 10)::int + 1 AS bid,
            (random() * 10000)::int AS abalance,
            'filler text' AS filler
        FROM generate_series({6800001 + rows_to_insert}, {6800001 + rows_to_insert * 2 - 1}) AS aid;
    """)
    assert cur.rowcount == rows_to_insert
    for _ in range(5):
        time.sleep(0.5 * autovacuum_naptime)
        cur.execute(vacuum_times_sql)
        row = cur.fetchall()[0]
        log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}")
    assert row[1] is not None

@pytest.mark.remote_cluster
@pytest.mark.timeout(60 * 60)
def test_cumulative_statistics_persistence(
    pg_bin: PgBin,
    test_output_dir: Path,
    neon_api: NeonAPI,
    pg_version: PgVersion,
    zenbenchmark: NeonBenchmarker,
):
    """
    Verifies that the cumulative statistics are correctly persisted across restarts.
    Cumulative statistics are important to persist across restarts because they are used
    to decide when auto-vacuum and auto-analyze trigger conditions are met.
    The test performs the following steps:
    - Seed a new project using pgbench
    - insert tuples that by themselves are not enough to trigger auto-vacuum
    - suspend the endpoint
    - resume the endpoint
    - insert additional tuples that by themselves are not enough to trigger auto-vacuum but in combination with the previous tuples are
    - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension
    """
    project = neon_api.create_project(pg_version)
    project_id = project["project"]["id"]
    neon_api.wait_for_operation_to_finish(project_id)
    endpoint_id = project["endpoints"][0]["id"]
    region_id = project["project"]["region_id"]
    log.info(f"Created project {project_id} with endpoint {endpoint_id} in region {region_id}")
    error_occurred = False
    try:
        connstr = project["connection_uris"][0]["connection_uri"]
        env = connection_parameters_to_env(project["connection_uris"][0]["connection_parameters"])
        # seed about 1 GiB of data into pgbench_accounts
        pg_bin.run_capture(["pgbench", "-i", "-s68"], env=env)
        conn = psycopg2.connect(connstr)
        conn.autocommit = True
        with conn.cursor() as cur:
            # assert rows in pgbench_accounts is 6800000 rows
            cur.execute("select count(*) from pgbench_accounts")
            row_count = cur.fetchall()[0][0]
            assert row_count == 6800000
            # verify n_tup_ins, vacuum_count, analyze_count (manual vacuum and analyze)
            cur.execute(
                "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'"
            )
            row = cur.fetchall()[0]
            assert row[0] == 6800000  # n_tup_ins
            assert row[1] == 1  # vacuum_count
            assert row[2] == 1  # analyze_count
            # retrieve some GUCs (postgres settings) relevant to autovacuum
            cur.execute(
                "SELECT setting::int AS autovacuum_naptime FROM pg_settings WHERE name = 'autovacuum_naptime'"
            )
            autovacuum_naptime = cur.fetchall()[0][0]
            assert autovacuum_naptime < 300 and autovacuum_naptime > 0
            cur.execute(
                "SELECT setting::float AS autovacuum_vacuum_insert_scale_factor FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_scale_factor'"
            )
            autovacuum_vacuum_insert_scale_factor = cur.fetchall()[0][0]
            assert (
                autovacuum_vacuum_insert_scale_factor > 0.05
                and autovacuum_vacuum_insert_scale_factor < 1.0
            )
            cur.execute(
                "SELECT setting::int AS autovacuum_vacuum_insert_threshold FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_threshold'"
            )
            autovacuum_vacuum_insert_threshold = cur.fetchall()[0][0]
            cur.execute(
                "SELECT setting::int AS pgstat_file_size_limit FROM pg_settings WHERE name = 'neon.pgstat_file_size_limit'"
            )
            pgstat_file_size_limit = cur.fetchall()[0][0]
            assert pgstat_file_size_limit > 10 * 1024  # at least 10 MB
            # insert rows that by themselves are not enough to trigger auto-vacuum
            # vacuum insert threshold = vacuum base insert threshold + vacuum insert scale factor * number of tuples
            # https://www.postgresql.org/docs/17/routine-vacuuming.html
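            # Chunk sizing note: threshold / 2 + 0.6 * scale_factor * row_count is always below
            # the trigger threshold (threshold + scale_factor * row_count), while two such chunks
            # always exceed it, regardless of the concrete GUC values read above. The second
            # chunk can therefore only trigger autovacuum if the first chunk's insert counters
            # survived the endpoint suspension.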
            rows_to_insert = int(
                math.ceil(
                    autovacuum_vacuum_insert_threshold / 2
                    + row_count * autovacuum_vacuum_insert_scale_factor * 0.6
                )
            )
            log.info(
                f"autovacuum_vacuum_insert_scale_factor: {autovacuum_vacuum_insert_scale_factor}, autovacuum_vacuum_insert_threshold: {autovacuum_vacuum_insert_threshold}, row_count: {row_count}"
            )
            log.info(
                f"Inserting {rows_to_insert} rows, which is below the 'vacuum insert threshold'"
            )
            insert_first_chunk_and_verify_autovacuum_is_not_running(
                cur, rows_to_insert, autovacuum_naptime
            )
        conn.close()
        # suspend the endpoint
        log.info(f"Suspending endpoint {endpoint_id}")
        neon_api.suspend_endpoint(project_id, endpoint_id)
        neon_api.wait_for_operation_to_finish(project_id)
        time.sleep(60)  # give some time in between suspend and resume
        # resume the endpoint
        log.info(f"Starting endpoint {endpoint_id}")
        neon_api.start_endpoint(project_id, endpoint_id)
        neon_api.wait_for_operation_to_finish(project_id)
        conn = psycopg2.connect(connstr)
        conn.autocommit = True
        with conn.cursor() as cur:
            # insert additional rows that by themselves are not enough to trigger auto-vacuum, but in combination
            # with the previous rows inserted before the suspension are
            log.info(
                f"Inserting another {rows_to_insert} rows, which is below the 'vacuum insert threshold'"
            )
            insert_second_chunk_and_verify_autovacuum_is_now_running(
                cur, rows_to_insert, autovacuum_naptime
            )
            # verify the estimated number of tuples in pgbench_accounts is within 6800000 + inserted rows +- 2 %
            cur.execute(
                "select reltuples::bigint from pg_class where relkind = 'r' and relname = 'pgbench_accounts'"
            )
            reltuples = cur.fetchall()[0][0]
            assert reltuples > 6800000 + rows_to_insert * 2 * 0.98
            assert reltuples < 6800000 + rows_to_insert * 2 * 1.02
            # verify exact number of pgbench_accounts rows (computed row_count)
            cur.execute("select count(*) from pgbench_accounts")
            row_count = cur.fetchall()[0][0]
            assert row_count == 6800000 + rows_to_insert * 2
            # verify n_tup_ins, vacuum_count, analyze_count (manual vacuum and analyze)
            cur.execute(
                "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'"
            )
            row = cur.fetchall()[0]
            assert row[0] == 6800000 + rows_to_insert * 2
            assert row[1] == 1
            assert row[2] == 1
        conn.close()
    except Exception as e:
        error_occurred = True
        log.error(f"Caught exception: {e}")
        log.error(traceback.format_exc())
    finally:
        assert not error_occurred  # Fail the test if an error occurred
        neon_api.delete_project(project_id)
```