Mirror of https://github.com/neondatabase/neon.git, synced 2025-12-23 06:09:59 +00:00

Add long-running replication tests

These tests will help verify that replication, both physical and logical, works as expected in Neon.

Co-authored-by: Sasha Krassovsky <sasha@neon.tech>
Committed by: Tristan Partin
Parent: b54dd9af15
Commit: 1c57f6bac3
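
For orientation, the core pattern these tests exercise is: create a publication on one endpoint, subscribe to it from another, and poll until the subscriber catches up with the publisher. A minimal sketch of that pattern, assuming PUB_CONNSTR and SUB_CONNSTR point at two reachable Postgres instances (both names are placeholders, not part of this commit):

    import time

    import psycopg2

    PUB_CONNSTR = "postgres://user:pass@pub-host/neondb"  # placeholder
    SUB_CONNSTR = "postgres://user:pass@sub-host/neondb"  # placeholder

    pub = psycopg2.connect(PUB_CONNSTR)
    sub = psycopg2.connect(SUB_CONNSTR)
    pub.autocommit = True  # CREATE SUBSCRIPTION cannot run inside a transaction block
    sub.autocommit = True

    with pub.cursor() as pub_cur, sub.cursor() as sub_cur:
        pub_cur.execute("create table t (x int)")
        sub_cur.execute("create table t (x int)")  # logical replication needs the schema on both sides
        pub_cur.execute("create publication pub1 for table t")
        sub_cur.execute(f"create subscription sub1 connection '{PUB_CONNSTR}' publication pub1")
        pub_cur.execute("insert into t values (1)")
        while True:  # poll until the change is applied on the subscriber
            sub_cur.execute("select count(*) from t")
            if sub_cur.fetchall()[0][0] == 1:
                break
            time.sleep(0.5)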

.github/actions/run-python-test-set/action.yml

@@ -115,6 +115,7 @@ runs:
       export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
       export DEFAULT_PG_VERSION=${PG_VERSION#v}
       export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
+      export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

       if [ "${BUILD_TYPE}" = "remote" ]; then
         export REMOTE_ENV=1
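Downstream, the Python test fixtures read these variables from the process environment; a minimal sketch of that consumption (the helper name is hypothetical, not part of this commit):

    import os

    def benchmark_connstr() -> str:
        # The action above exports BENCHMARK_CONNSTR (possibly empty);
        # remote tests should fail fast with a clear message when it is unset.
        connstr = os.environ.get("BENCHMARK_CONNSTR", "")
        if not connstr:
            raise RuntimeError("BENCHMARK_CONNSTR is not set")
        return connstr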

.github/workflows/benchmarking.yml (vendored): 72 changed lines
@@ -99,7 +99,14 @@ jobs:
           # Set --sparse-ordering option of pytest-order plugin
           # to ensure tests run in the order they appear in the file.
           # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-          extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+          extra_params:
+            -m remote_cluster
+            --sparse-ordering
+            --timeout 5400
+            --ignore test_runner/performance/test_perf_olap.py
+            --ignore test_runner/performance/test_perf_pgvector_queries.py
+            --ignore test_runner/performance/test_logical_replication.py
+            --ignore test_runner/performance/test_physical_replication.py
         env:
           BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
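The two new test files are ignored here because the replication-tests job added below runs them on their own schedule. For context, -m remote_cluster selects only tests carrying that pytest marker; a sketch of how a test opts in (the timeout decorator comes from pytest-timeout, mirroring --timeout 5400):

    import pytest

    @pytest.mark.remote_cluster        # selected by: pytest -m remote_cluster
    @pytest.mark.timeout(5400)         # hard per-test limit, like --timeout 5400
    def test_against_remote_cluster():
        ...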
@@ -125,6 +132,69 @@ jobs:
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  replication-tests:
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: "neon-staging"

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
      - uses: actions/checkout@v4

      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest

      - name: Run benchmark
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: performance/test_logical_replication.py
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 5400
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

      - name: Run benchmark
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: performance/test_physical_replication.py
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 5400
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

      - name: Create Allure report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
          slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  generate-matrices:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)

test_runner/performance/test_logical_replication.py

@@ -1,13 +1,24 @@
from __future__ import annotations

import time
import traceback
from typing import TYPE_CHECKING

import psycopg2
import psycopg2.extras
import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_api import connection_parameters_to_env
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
from fixtures.pg_version import PgVersion

if TYPE_CHECKING:
    from fixtures.benchmark_fixture import NeonBenchmarker
    from fixtures.neon_api import NeonAPI
    from fixtures.neon_fixtures import NeonEnv, PgBin
    from fixtures.pg_version import PgVersion


@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@@ -31,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
    vanilla_pg.safe_psql("truncate table pgbench_history")

    connstr = endpoint.connstr().replace("'", "''")
    print(f"connstr='{connstr}'")
    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")

    # Wait for the logical replication channel to be established
@@ -47,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
    sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
    sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
    assert sum_master == sum_replica


def check_pgbench_still_running(pgbench, label=""):
    rc = pgbench.poll()
    if rc is not None:
        raise RuntimeError(f"{label} pgbench terminated early with return code {rc}")


def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600):
    start = time.time()
    pub_cur.execute("SELECT pg_current_wal_flush_lsn()")
    pub_lsn = Lsn(pub_cur.fetchall()[0][0])
    while (time.time() - start) < timeout_sec:
        sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription")
        res = sub_cur.fetchall()[0][0]
        if res:
            log.info(f"subscriber_lsn={res}")
            sub_lsn = Lsn(res)
            log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}")
            if sub_lsn >= pub_lsn:
                return time.time() - start
        time.sleep(0.5)
    raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec")

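measure_logical_replication_lag captures the publisher's flush LSN once and then polls pg_stat_subscription.latest_end_lsn until it passes that fixed point, so a busy publisher cannot keep moving the goalposts. The comparison relies on Lsn ordering; for illustration (values invented):

    from fixtures.common_types import Lsn

    # An LSN is a 64-bit WAL position rendered as two hex halves; Lsn compares them numerically.
    assert Lsn("0/16B3748") < Lsn("0/16C0000") < Lsn("1/0")
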
@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_subscriber_lag(
    pg_bin: PgBin,
    neon_api: NeonAPI,
    pg_version: PgVersion,
    zenbenchmark: NeonBenchmarker,
):
    """
    Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
    on subscriber. Periodically restarts subscriber while still running the inserts, and
    measures how long sync takes after restart.
    """
    test_duration_min = 60
    sync_interval_min = 5
    pgbench_duration = f"-T{test_duration_min * 60 * 2}"

    pub_project = neon_api.create_project(pg_version)
    pub_project_id = pub_project["project"]["id"]
    neon_api.wait_for_operation_to_finish(pub_project_id)
    error_occurred = False
    try:
        sub_project = neon_api.create_project(pg_version)
        sub_project_id = sub_project["project"]["id"]
        sub_endpoint_id = sub_project["endpoints"][0]["id"]
        neon_api.wait_for_operation_to_finish(sub_project_id)
        try:
            pub_env = connection_parameters_to_env(
                pub_project["connection_uris"][0]["connection_parameters"]
            )
            sub_env = connection_parameters_to_env(
                sub_project["connection_uris"][0]["connection_parameters"]
            )
            pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
            sub_connstr = sub_project["connection_uris"][0]["connection_uri"]

            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

            pub_conn = psycopg2.connect(pub_connstr)
            sub_conn = psycopg2.connect(sub_connstr)
            pub_conn.autocommit = True
            sub_conn.autocommit = True
            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
                sub_cur.execute("truncate table pgbench_accounts")
                sub_cur.execute("truncate table pgbench_history")

                pub_cur.execute(
                    "create publication pub1 for table pgbench_accounts, pgbench_history"
                )
                sub_cur.execute(
                    f"create subscription sub1 connection '{pub_connstr}' publication pub1"
                )

                initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
            pub_conn.close()
            sub_conn.close()

            zenbenchmark.record(
                "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
            )

            pub_workload = pg_bin.run_nonblocking(
                ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
            )
            try:
                sub_workload = pg_bin.run_nonblocking(
                    ["pgbench", "-c10", pgbench_duration, "-S"],
                    env=sub_env,
                )
                try:
                    start = time.time()
                    while time.time() - start < test_duration_min * 60:
                        time.sleep(sync_interval_min * 60)
                        check_pgbench_still_running(pub_workload, "pub")
                        check_pgbench_still_running(sub_workload, "sub")

                        with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
                            sub_connstr
                        ) as sub_conn:
                            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
                                lag = measure_logical_replication_lag(sub_cur, pub_cur)

                        log.info(f"Replica lagged behind master by {lag} seconds")
                        zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
                        sub_workload.terminate()
                        neon_api.restart_endpoint(
                            sub_project_id,
                            sub_endpoint_id,
                        )
                        neon_api.wait_for_operation_to_finish(sub_project_id)
                        sub_workload = pg_bin.run_nonblocking(
                            ["pgbench", "-c10", pgbench_duration, "-S"],
                            env=sub_env,
                        )

                        # Measure storage to make sure replication information isn't bloating storage
                        sub_storage = neon_api.get_project_details(sub_project_id)["project"][
                            "synthetic_storage_size"
                        ]
                        pub_storage = neon_api.get_project_details(pub_project_id)["project"][
                            "synthetic_storage_size"
                        ]
                        zenbenchmark.record(
                            "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
                        )
                        zenbenchmark.record(
                            "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
                        )
                finally:
                    sub_workload.terminate()
            finally:
                pub_workload.terminate()
        except Exception as e:
            error_occurred = True
            log.error(f"Caught exception {e}")
            log.error(traceback.format_exc())
        finally:
            if not error_occurred:
                neon_api.delete_project(sub_project_id)
    except Exception as e:
        error_occurred = True
        log.error(f"Caught exception {e}")
        log.error(traceback.format_exc())
    finally:
        assert not error_occurred
        neon_api.delete_project(pub_project_id)

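Both tests in this file drive real Neon projects through the public API, so they can also be run outside CI. A hypothetical local invocation, assuming NEON_API_KEY is set in the environment and pytest-timeout is installed (as the workflow's --timeout flag implies):

    import pytest

    pytest.main(
        [
            "-m", "remote_cluster",
            "--timeout", "5400",
            "test_runner/performance/test_logical_replication.py",
        ]
    )
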
@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_publisher_restart(
    pg_bin: PgBin,
    neon_api: NeonAPI,
    pg_version: PgVersion,
    zenbenchmark: NeonBenchmarker,
):
    """
    Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
    on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and
    measures how long sync takes after restart.
    """
    test_duration_min = 60
    sync_interval_min = 5
    pgbench_duration = f"-T{test_duration_min * 60 * 2}"

    pub_project = neon_api.create_project(pg_version)
    pub_project_id = pub_project["project"]["id"]
    pub_endpoint_id = pub_project["endpoints"][0]["id"]
    neon_api.wait_for_operation_to_finish(pub_project_id)
    error_occurred = False
    try:
        sub_project = neon_api.create_project(pg_version)
        sub_project_id = sub_project["project"]["id"]
        neon_api.wait_for_operation_to_finish(sub_project_id)
        try:
            pub_env = connection_parameters_to_env(
                pub_project["connection_uris"][0]["connection_parameters"]
            )
            sub_env = connection_parameters_to_env(
                sub_project["connection_uris"][0]["connection_parameters"]
            )
            pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
            sub_connstr = sub_project["connection_uris"][0]["connection_uri"]

            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

            pub_conn = psycopg2.connect(pub_connstr)
            sub_conn = psycopg2.connect(sub_connstr)
            pub_conn.autocommit = True
            sub_conn.autocommit = True
            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
                sub_cur.execute("truncate table pgbench_accounts")
                sub_cur.execute("truncate table pgbench_history")

                pub_cur.execute(
                    "create publication pub1 for table pgbench_accounts, pgbench_history"
                )
                sub_cur.execute(
                    f"create subscription sub1 connection '{pub_connstr}' publication pub1"
                )

                initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
            pub_conn.close()
            sub_conn.close()

            zenbenchmark.record(
                "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
            )

            pub_workload = pg_bin.run_nonblocking(
                ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
            )
            try:
                sub_workload = pg_bin.run_nonblocking(
                    ["pgbench", "-c10", pgbench_duration, "-S"],
                    env=sub_env,
                )
                try:
                    start = time.time()
                    while time.time() - start < test_duration_min * 60:
                        time.sleep(sync_interval_min * 60)
                        check_pgbench_still_running(pub_workload, "pub")
                        check_pgbench_still_running(sub_workload, "sub")

                        pub_workload.terminate()
                        neon_api.restart_endpoint(
                            pub_project_id,
                            pub_endpoint_id,
                        )
                        neon_api.wait_for_operation_to_finish(pub_project_id)
                        pub_workload = pg_bin.run_nonblocking(
                            ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
                            env=pub_env,
                        )
                        with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
                            sub_connstr
                        ) as sub_conn:
                            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
                                lag = measure_logical_replication_lag(sub_cur, pub_cur)

                        log.info(f"Replica lagged behind master by {lag} seconds")
                        zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)

                        # Measure storage to make sure replication information isn't bloating storage
                        sub_storage = neon_api.get_project_details(sub_project_id)["project"][
                            "synthetic_storage_size"
                        ]
                        pub_storage = neon_api.get_project_details(pub_project_id)["project"][
                            "synthetic_storage_size"
                        ]
                        zenbenchmark.record(
                            "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
                        )
                        zenbenchmark.record(
                            "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
                        )
                finally:
                    sub_workload.terminate()
            finally:
                pub_workload.terminate()
        except Exception as e:
            error_occurred = True
            log.error(f"Caught exception {e}")
            log.error(traceback.format_exc())
        finally:
            if not error_occurred:
                neon_api.delete_project(sub_project_id)
    except Exception as e:
        error_occurred = True
        log.error(f"Caught exception {e}")
        log.error(traceback.format_exc())
    finally:
        assert not error_occurred
        neon_api.delete_project(pub_project_id)

test_runner/performance/test_physical_replication.py (new file, 296 lines)

@@ -0,0 +1,296 @@
from __future__ import annotations

import csv
import os
import subprocess
import time
import traceback
from pathlib import Path
from typing import TYPE_CHECKING

import psycopg2
import psycopg2.extras
import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_api import connection_parameters_to_env
from fixtures.pg_version import PgVersion

if TYPE_CHECKING:
    from typing import Any, List, Optional

    from fixtures.benchmark_fixture import NeonBenchmarker
    from fixtures.neon_api import NeonAPI
    from fixtures.neon_fixtures import PgBin


# Granularity of ~0.5 sec
def measure_replication_lag(master, replica, timeout_sec=600):
    start = time.time()
    master.execute("SELECT pg_current_wal_flush_lsn()")
    master_lsn = Lsn(master.fetchall()[0][0])
    while (time.time() - start) < timeout_sec:
        replica.execute("select pg_last_wal_replay_lsn()")
        replica_lsn = replica.fetchall()[0][0]
        if replica_lsn:
            if Lsn(replica_lsn) >= master_lsn:
                return time.time() - start
        time.sleep(0.5)
    raise TimeoutError(f"Replication sync took more than {timeout_sec} sec")

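Physical lag here means "time until the standby has replayed everything the primary had flushed when we looked". The same check can be written directly against the two SQL functions involved; a minimal sketch with psycopg2 (DSNs are placeholders):

    import psycopg2

    with psycopg2.connect("postgres://primary-host/neondb") as primary:  # placeholder DSN
        with primary.cursor() as cur:
            cur.execute("SELECT pg_current_wal_flush_lsn()")
            target = cur.fetchone()[0]

    with psycopg2.connect("postgres://standby-host/neondb") as standby:  # placeholder DSN
        with standby.cursor() as cur:
            # pg_last_wal_replay_lsn() returns NULL until recovery has replayed any WAL
            cur.execute("SELECT pg_last_wal_replay_lsn() >= %s::pg_lsn", (target,))
            caught_up = cur.fetchone()[0]
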
def check_pgbench_still_running(pgbench):
    rc = pgbench.poll()
    if rc is not None:
        raise RuntimeError(f"Pgbench terminated early with return code {rc}")

@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_ro_replica_lag(
    pg_bin: PgBin,
    neon_api: NeonAPI,
    pg_version: PgVersion,
    zenbenchmark: NeonBenchmarker,
):
    test_duration_min = 60
    sync_interval_min = 10

    pgbench_duration = f"-T{test_duration_min * 60 * 2}"

    project = neon_api.create_project(pg_version)
    project_id = project["project"]["id"]
    neon_api.wait_for_operation_to_finish(project_id)
    error_occurred = False
    try:
        branch_id = project["branch"]["id"]
        master_connstr = project["connection_uris"][0]["connection_uri"]
        master_env = connection_parameters_to_env(
            project["connection_uris"][0]["connection_parameters"]
        )

        replica = neon_api.create_endpoint(
            project_id,
            branch_id,
            endpoint_type="read_only",
            settings={"pg_settings": {"hot_standby_feedback": "on"}},
        )
        replica_env = master_env.copy()
        replica_env["PGHOST"] = replica["endpoint"]["host"]
        neon_api.wait_for_operation_to_finish(project_id)

        replica_connstr = neon_api.get_connection_uri(
            project_id,
            endpoint_id=replica["endpoint"]["id"],
        )["uri"]

        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env)

        master_workload = pg_bin.run_nonblocking(
            ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
            env=master_env,
        )
        try:
            replica_workload = pg_bin.run_nonblocking(
                ["pgbench", "-c10", pgbench_duration, "-S"],
                env=replica_env,
            )
            try:
                start = time.time()
                while time.time() - start < test_duration_min * 60:
                    check_pgbench_still_running(master_workload)
                    check_pgbench_still_running(replica_workload)
                    time.sleep(sync_interval_min * 60)
                    with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect(
                        replica_connstr
                    ) as conn_replica:
                        with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica:
                            lag = measure_replication_lag(cur_master, cur_replica)
                    log.info(f"Replica lagged behind master by {lag} seconds")
                    zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
            finally:
                replica_workload.terminate()
        finally:
            master_workload.terminate()
    except Exception as e:
        error_occurred = True
        log.error(f"Caught exception: {e}")
        log.error(traceback.format_exc())
    finally:
        assert not error_occurred  # Fail the test if an error occurred
        neon_api.delete_project(project_id)

def report_pgbench_aggregate_intervals(
    output_dir: Path,
    prefix: str,
    zenbenchmark: NeonBenchmarker,
):
    for filename in os.listdir(output_dir):
        if filename.startswith(prefix):
            # The file will be in the form <prefix>_<node>.<pid>
            # So we first lop off the .<pid>, and then lop off the prefix and the _
            node = filename.split(".")[0][len(prefix) + 1 :]
            with open(output_dir / filename) as f:
                reader = csv.reader(f, delimiter=" ")
                for line in reader:
                    num_transactions = int(line[1])
                    if num_transactions == 0:
                        continue
                    sum_latency = int(line[2])
                    sum_lag = int(line[3])
                    zenbenchmark.record(
                        f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER
                    )
                    zenbenchmark.record(
                        f"{node}_avg_latency",
                        sum_latency / num_transactions,
                        "s",
                        MetricReport.LOWER_IS_BETTER,
                    )
                    zenbenchmark.record(
                        f"{node}_avg_lag",
                        sum_lag / num_transactions,
                        "s",
                        MetricReport.LOWER_IS_BETTER,
                    )

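For reference, each line of a pgbench --aggregate-interval log has the form "interval_start num_transactions sum_latency sum_latency_2 min_latency max_latency"; lag columns are appended only when pgbench runs with a rate limit (-R), which these runs do not use, and pgbench reports latencies in microseconds. A parsing sketch (sample values invented):

    # interval_start  num_txns  sum_latency  sum_latency_2  min_latency  max_latency
    sample = "1693411980 7231 8412930 12979712198 1012 2795"
    fields = sample.split(" ")
    num_txns = int(fields[1])
    avg_latency_us = int(fields[2]) / num_txns  # mean latency in microseconds
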
@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_replication_start_stop(
    pg_bin: PgBin,
    test_output_dir: Path,
    neon_api: NeonAPI,
    pg_version: PgVersion,
    zenbenchmark: NeonBenchmarker,
):
    """
    Cycles through different configurations of read replicas being enabled and disabled. The
    whole time, there's a pgbench read/write workload going on the master. For each replica, we
    either turn it on or off, and see how long it takes to catch up after some set amount of
    time of replicating the pgbench.
    """

    prefix = "pgbench_agg"
    num_replicas = 2
    configuration_test_time_sec = 10 * 60
    pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}"
    error_occurred = False

    project = neon_api.create_project(pg_version)
    project_id = project["project"]["id"]
    neon_api.wait_for_operation_to_finish(project_id)
    try:
        branch_id = project["branch"]["id"]
        master_connstr = project["connection_uris"][0]["connection_uri"]
        master_env = connection_parameters_to_env(
            project["connection_uris"][0]["connection_parameters"]
        )

        replicas = []
        for _ in range(num_replicas):
            replicas.append(
                neon_api.create_endpoint(
                    project_id,
                    branch_id,
                    endpoint_type="read_only",
                    settings={"pg_settings": {"hot_standby_feedback": "on"}},
                )
            )
            neon_api.wait_for_operation_to_finish(project_id)

        replica_connstr = [
            neon_api.get_connection_uri(
                project_id,
                endpoint_id=replicas[i]["endpoint"]["id"],
            )["uri"]
            for i in range(num_replicas)
        ]
        replica_env = [master_env.copy() for _ in range(num_replicas)]
        for i in range(num_replicas):
            replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"]

        pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env)

        # Sync replicas
        with psycopg2.connect(master_connstr) as conn_master:
            with conn_master.cursor() as cur_master:
                for i in range(num_replicas):
                    conn_replica = psycopg2.connect(replica_connstr[i])
                    measure_replication_lag(cur_master, conn_replica.cursor())

        master_pgbench = pg_bin.run_nonblocking(
            [
                "pgbench",
                "-c10",
                pgbench_duration,
                "-Mprepared",
                "--log",
                f"--log-prefix={test_output_dir}/{prefix}_master",
                f"--aggregate-interval={configuration_test_time_sec}",
            ],
            env=master_env,
        )
        replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)]

        # Use the bits of iconfig to tell us which configuration we are on. For example,
        # an iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is
        # alive.
        for iconfig in range((1 << num_replicas) - 1, -1, -1):

            def replica_enabled(ireplica: int, iconfig: int = iconfig):
                # Bit i of iconfig says whether replica i should be running in this configuration.
                return bool((iconfig >> ireplica) & 1)

            # Change configuration
            for ireplica in range(num_replicas):
                if replica_enabled(ireplica) and replica_pgbench[ireplica] is None:
                    replica_pgbench[ireplica] = pg_bin.run_nonblocking(
                        [
                            "pgbench",
                            "-c10",
                            "-S",
                            pgbench_duration,
                            "--log",
                            f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}",
                            f"--aggregate-interval={configuration_test_time_sec}",
                        ],
                        env=replica_env[ireplica],
                    )
                elif not replica_enabled(ireplica) and replica_pgbench[ireplica] is not None:
                    pgb = replica_pgbench[ireplica]
                    assert pgb is not None
                    pgb.terminate()
                    pgb.wait()
                    replica_pgbench[ireplica] = None

                    neon_api.suspend_endpoint(
                        project_id,
                        replicas[ireplica]["endpoint"]["id"],
                    )
                    neon_api.wait_for_operation_to_finish(project_id)

            time.sleep(configuration_test_time_sec)

            with psycopg2.connect(master_connstr) as conn_master:
                with conn_master.cursor() as cur_master:
                    for ireplica in range(num_replicas):
                        replica_conn = psycopg2.connect(replica_connstr[ireplica])
                        lag = measure_replication_lag(cur_master, replica_conn.cursor())
                        zenbenchmark.record(
                            f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
                        )
                        log.info(
                            f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
                        )
        master_pgbench.terminate()
    except Exception as e:
        error_occurred = True
        log.error(f"Caught exception {e}")
        log.error(traceback.format_exc())
    finally:
        assert not error_occurred
        neon_api.delete_project(project_id)
        # Only report results if we didn't error out
        report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark)
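To make the configuration sweep concrete: with num_replicas = 2, iconfig walks 3, 2, 1, 0, and bit i of iconfig decides whether replica i runs a pgbench workload during that interval. A standalone sketch of the enumeration, mirroring the logic above:

    num_replicas = 2
    for iconfig in range((1 << num_replicas) - 1, -1, -1):
        enabled = [bool((iconfig >> i) & 1) for i in range(num_replicas)]
        print(f"{iconfig:02b}: " + ", ".join(
            f"replica {i} {'on' if on else 'off'}" for i, on in enumerate(enabled)
        ))
    # 11: both on; 10: replica 0 off, replica 1 on; 01: replica 0 on, replica 1 off; 00: both off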