tests: enable higher concurrency and adjust tests with outlier runtime (#4904)

## Problem I spent a few minutes seeing how fast I could get our regression test suite to run on my workstation, for when I want to run a "did I break anything?" smoke test before pushing to CI. - Test runtime was dominated by a couple of tests that run for longer than all the others take together - Test concurrency was limited to <16 by the ports-per-worker setting There's no "right answer" for how long a test should be, but as a rule of thumb, no one test should run for much longer than the time it takes to run all the other tests together. ## Summary of changes - Make the ports per worker setting dynamic depending on worker count - Modify the longest running tests to run for a shorter time (`test_duplicate_layers` which uses a pgbench runtime) or fewer iterations (`test_restarts_frequent_checkpoints`).
2026-01-14 17:02:56 +00:00 · 2023-08-08 09:16:21 +01:00
parent 9559ef6f3b
commit 33cb1e9c0c
3 changed files with 12 additions and 20 deletions
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -86,19 +86,6 @@ DEFAULT_OUTPUT_DIR: str = "test_output"
 DEFAULT_BRANCH_NAME: str = "main"

 BASE_PORT: int = 15000
-WORKER_PORT_NUM: int = 1000
-
-
-def pytest_configure(config: Config):
-    """
-    Check that we do not overflow available ports range.
-    """
-
-    numprocesses = config.getoption("numprocesses")
-    if (
-        numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768
-    ):  # do not use ephemeral ports
-        raise Exception("Too many workers configured. Cannot distribute ports for services.")


@pytest.fixture(scope="session")
@@ -200,6 +187,11 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu
    return scope


+@pytest.fixture(scope="session")
+def worker_port_num():
+    return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1"))
+
+
@pytest.fixture(scope="session")
 def worker_seq_no(worker_id: str) -> int:
    # worker_id is a pytest-xdist fixture
@@ -212,10 +204,10 @@ def worker_seq_no(worker_id: str) -> int:


@pytest.fixture(scope="session")
-def worker_base_port(worker_seq_no: int) -> int:
-    # so we divide ports in ranges of 100 ports
+def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
+    # so we divide ports in ranges of ports
    # so workers have disjoint set of ports for services
-    return BASE_PORT + worker_seq_no * WORKER_PORT_NUM
+    return BASE_PORT + worker_seq_no * worker_port_num


 def get_dir_size(path: str) -> int:
@@ -229,8 +221,8 @@ def get_dir_size(path: str) -> int:


@pytest.fixture(scope="session")
-def port_distributor(worker_base_port: int) -> PortDistributor:
-    return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
+def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
+    return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)


@pytest.fixture(scope="session")
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -33,4 +33,4 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    time.sleep(10)  # let compaction to be performed
    assert env.pageserver.log_contains("compact-level0-phase1-return-same")

-    pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
+    pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr])
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -245,7 +245,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
    # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments
    # are not removed before broadcasted to all safekeepers, with the help of replication slot
    asyncio.run(
-        run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5)
+        run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=4)
    )