From 1c57f6bac34c2e97a1929cd5e96af1156bdc240d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 15:22:42 -0500 Subject: [PATCH] Add long running replication tests These tests will help verify that replication, both physical and logical, works as expected in Neon. Co-authored-by: Sasha Krassovsky --- .../actions/run-python-test-set/action.yml | 1 + .github/workflows/benchmarking.yml | 72 ++++- .../performance/test_logical_replication.py | 295 ++++++++++++++++- .../performance/test_physical_replication.py | 296 ++++++++++++++++++ 4 files changed, 662 insertions(+), 2 deletions(-) create mode 100644 test_runner/performance/test_physical_replication.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 7f843de1a5..daaedf6d11 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -115,6 +115,7 @@ runs: export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib + export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index db04b5de7d..899cae2b86 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -99,7 +99,14 @@ jobs: # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py + extra_params: + -m remote_cluster + --sparse-ordering + --timeout 5400 + --ignore test_runner/performance/test_perf_olap.py + --ignore test_runner/performance/test_perf_pgvector_queries.py + --ignore test_runner/performance/test_logical_replication.py + --ignore test_runner/performance/test_physical_replication.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -125,6 +132,69 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + replication-tests: + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_logical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Run benchmark + uses: 
./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_physical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 570bd11b6f..5ab83dd31d 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,13 +1,24 @@ from __future__ import annotations import time +import traceback +from typing import TYPE_CHECKING +import psycopg2 +import psycopg2.extras import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.pg_version import PgVersion if TYPE_CHECKING: + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI from fixtures.neon_fixtures import NeonEnv, PgBin + from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -31,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql("truncate table pgbench_history") connstr = endpoint.connstr().replace("'", "''") - print(f"connstr='{connstr}'") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established @@ -47,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] assert sum_master == sum_replica + + +def check_pgbench_still_running(pgbench, label=""): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") + + +def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): + start = time.time() + pub_cur.execute("SELECT pg_current_wal_flush_lsn()") + pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") + res = sub_cur.fetchall()[0][0] + if res: + log.info(f"subscriber_lsn={res}") + sub_lsn = Lsn(res) + log.info(f"Subscriber LSN={sub_lsn}, 
publisher LSN={pub_lsn}") + if sub_lsn >= pub_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_subscriber_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts subscriber while still running the inserts, and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + sub_endpoint_id = sub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + neon_api.restart_endpoint( + sub_project_id, + sub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = 
neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_publisher_restart( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + pub_endpoint_id = pub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + pub_workload.terminate() + neon_api.restart_endpoint( + pub_project_id, + pub_endpoint_id, + ) 
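+                        # Wait for the restart operation to finish before relaunching pgbench;
+                        # per the docstring above, the restarted publisher has to fetch the WAL
+                        # it needs on demand, which is the behavior this test exercises.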
+ neon_api.wait_for_operation_to_finish(pub_project_id) + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py new file mode 100644 index 0000000000..7e16197211 --- /dev/null +++ b/test_runner/performance/test_physical_replication.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import csv +import os +import subprocess +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from typing import Any, List, Optional + + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + + +# Granularity of ~0.5 sec +def measure_replication_lag(master, replica, timeout_sec=600): + start = time.time() + master.execute("SELECT pg_current_wal_flush_lsn()") + master_lsn = Lsn(master.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + replica.execute("select pg_last_wal_replay_lsn()") + replica_lsn = replica.fetchall()[0][0] + if replica_lsn: + if Lsn(replica_lsn) >= master_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") + + +def check_pgbench_still_running(pgbench): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"Pgbench terminated early with return code {rc}") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_ro_replica_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + test_duration_min = 60 + sync_interval_min = 10 + + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + 
error_occurred = False + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + replica_env = master_env.copy() + replica_env["PGHOST"] = replica["endpoint"]["host"] + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = neon_api.get_connection_uri( + project_id, + endpoint_id=replica["endpoint"]["id"], + )["uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + + master_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=master_env, + ) + try: + replica_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=replica_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + check_pgbench_still_running(master_workload) + check_pgbench_still_running(replica_workload) + time.sleep(sync_interval_min * 60) + with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( + replica_connstr + ) as conn_replica: + with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + lag = measure_replication_lag(cur_master, cur_replica) + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + finally: + replica_workload.terminate() + finally: + master_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) + + +def report_pgbench_aggregate_intervals( + output_dir: Path, + prefix: str, + zenbenchmark: NeonBenchmarker, +): + for filename in os.listdir(output_dir): + if filename.startswith(prefix): + # The file will be in the form _. + # So we first lop off the ., and then lop off the prefix and the _ + node = filename.split(".")[0][len(prefix) + 1 :] + with open(output_dir / filename) as f: + reader = csv.reader(f, delimiter=" ") + for line in reader: + num_transactions = int(line[1]) + if num_transactions == 0: + continue + sum_latency = int(line[2]) + sum_lag = int(line[3]) + zenbenchmark.record( + f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER + ) + zenbenchmark.record( + f"{node}_avg_latency", + sum_latency / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{node}_avg_lag", + sum_lag / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_replication_start_stop( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Cycles through different configurations of read replicas being enabled disabled. The whole time, + there's a pgbench read/write workload going on the master. For each replica, we either turn it + on or off, and see how long it takes to catch up after some set amount of time of replicating + the pgbench. 
+ """ + + prefix = "pgbench_agg" + num_replicas = 2 + configuration_test_time_sec = 10 * 60 + pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + error_occurred = False + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replicas = [] + for _ in range(num_replicas): + replicas.append( + neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + ) + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = [ + neon_api.get_connection_uri( + project_id, + endpoint_id=replicas[i]["endpoint"]["id"], + )["uri"] + for i in range(num_replicas) + ] + replica_env = [master_env.copy() for _ in range(num_replicas)] + for i in range(num_replicas): + replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] + + pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + + # Sync replicas + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + master_pgbench = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + pgbench_duration, + "-Mprepared", + "--log", + f"--log-prefix={test_output_dir}/{prefix}_master", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=master_env, + ) + replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + + # Use the bits of iconfig to tell us which configuration we are on. For example + # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is + # alive. 
+        for iconfig in range((1 << num_replicas) - 1, -1, -1):
+
+            def replica_enabled(ireplica: int, iconfig: int = iconfig):
+                # Bit i of iconfig says whether replica i should be running in this
+                # configuration.
+                return bool((iconfig >> ireplica) & 1)
+
+            # Change configuration
+            for ireplica in range(num_replicas):
+                if replica_enabled(ireplica) and replica_pgbench[ireplica] is None:
+                    replica_pgbench[ireplica] = pg_bin.run_nonblocking(
+                        [
+                            "pgbench",
+                            "-c10",
+                            "-S",
+                            pgbench_duration,
+                            "--log",
+                            f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}",
+                            f"--aggregate-interval={configuration_test_time_sec}",
+                        ],
+                        env=replica_env[ireplica],
+                    )
+                elif not replica_enabled(ireplica) and replica_pgbench[ireplica] is not None:
+                    pgb = replica_pgbench[ireplica]
+                    assert pgb is not None
+                    pgb.terminate()
+                    pgb.wait()
+                    replica_pgbench[ireplica] = None
+
+                    neon_api.suspend_endpoint(
+                        project_id,
+                        replicas[ireplica]["endpoint"]["id"],
+                    )
+                    neon_api.wait_for_operation_to_finish(project_id)
+
+            time.sleep(configuration_test_time_sec)
+
+            with psycopg2.connect(master_connstr) as conn_master:
+                with conn_master.cursor() as cur_master:
+                    for ireplica in range(num_replicas):
+                        replica_conn = psycopg2.connect(replica_connstr[ireplica])
+                        lag = measure_replication_lag(cur_master, replica_conn.cursor())
+                        zenbenchmark.record(
+                            f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
+                        )
+                        log.info(
+                            f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:b}"
+                        )
+        master_pgbench.terminate()
+    except Exception as e:
+        error_occurred = True
+        log.error(f"Caught exception {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not error_occurred
+        neon_api.delete_project(project_id)
+        # Only report results if we didn't error out
+        report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark)
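
Note: outside the benchmarking workflow, the new suites can also be pointed at a Neon staging account directly. The following is only a sketch, not a verified command line: the API key value and paths are placeholders that mirror the env the replication-tests job sets up, and the pytest markers and --timeout flag are the ones the workflow already passes.

    export POSTGRES_DISTRIB_DIR=/tmp/neon/pg_install
    export DEFAULT_PG_VERSION=14
    export TEST_OUTPUT=/tmp/test_output
    export NEON_API_KEY=<staging API key>   # placeholder; same role as NEON_STAGING_API_KEY in CI
    pytest -m remote_cluster --timeout 5400 \
        test_runner/performance/test_logical_replication.py \
        test_runner/performance/test_physical_replication.py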