From 33cb1e9c0ce01d1fd1544a85bc6139b8d7cdfe81 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 8 Aug 2023 09:16:21 +0100 Subject: [PATCH] tests: enable higher concurrency and adjust tests with outlier runtime (#4904) ## Problem I spent a few minutes seeing how fast I could get our regression test suite to run on my workstation, for when I want to run a "did I break anything?" smoke test before pushing to CI. - Test runtime was dominated by a couple of tests that run for longer than all the others take together - Test concurrency was limited to <16 by the ports-per-worker setting There's no "right answer" for how long a test should be, but as a rule of thumb, no one test should run for much longer than the time it takes to run all the other tests together. ## Summary of changes - Make the ports per worker setting dynamic depending on worker count - Modify the longest running tests to run for a shorter time (`test_duplicate_layers` which uses a pgbench runtime) or fewer iterations (`test_restarts_frequent_checkpoints`). --- test_runner/fixtures/neon_fixtures.py | 28 +++++++------------ test_runner/regress/test_duplicate_layers.py | 2 +- .../regress/test_wal_acceptor_async.py | 2 +- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 79c1bb055b..cdda8c414e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -86,19 +86,6 @@ DEFAULT_OUTPUT_DIR: str = "test_output" DEFAULT_BRANCH_NAME: str = "main" BASE_PORT: int = 15000 -WORKER_PORT_NUM: int = 1000 - - -def pytest_configure(config: Config): - """ - Check that we do not overflow available ports range. - """ - - numprocesses = config.getoption("numprocesses") - if ( - numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768 - ): # do not use ephemeral ports - raise Exception("Too many workers configured. Cannot distribute ports for services.") @pytest.fixture(scope="session") @@ -200,6 +187,11 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu return scope +@pytest.fixture(scope="session") +def worker_port_num(): + return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1")) + + @pytest.fixture(scope="session") def worker_seq_no(worker_id: str) -> int: # worker_id is a pytest-xdist fixture @@ -212,10 +204,10 @@ def worker_seq_no(worker_id: str) -> int: @pytest.fixture(scope="session") -def worker_base_port(worker_seq_no: int) -> int: - # so we divide ports in ranges of 100 ports +def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int: + # so we divide ports in ranges of ports # so workers have disjoint set of ports for services - return BASE_PORT + worker_seq_no * WORKER_PORT_NUM + return BASE_PORT + worker_seq_no * worker_port_num def get_dir_size(path: str) -> int: @@ -229,8 +221,8 @@ def get_dir_size(path: str) -> int: @pytest.fixture(scope="session") -def port_distributor(worker_base_port: int) -> PortDistributor: - return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) +def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor: + return PortDistributor(base_port=worker_base_port, port_number=worker_port_num) @pytest.fixture(scope="session") diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index c1832a2063..7f76a8e042 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -33,4 +33,4 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): time.sleep(10) # let compaction to be performed assert env.pageserver.log_contains("compact-level0-phase1-return-same") - pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) + pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr]) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index bb8ee8f52c..cfc131a3aa 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -245,7 +245,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments # are not removed before broadcasted to all safekeepers, with the help of replication slot asyncio.run( - run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5) + run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=4) )