From 33cb1e9c0ce01d1fd1544a85bc6139b8d7cdfe81 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 8 Aug 2023 09:16:21 +0100
Subject: [PATCH] tests: enable higher concurrency and adjust tests with
 outlier runtime (#4904)

## Problem

I spent a few minutes seeing how fast I could get our regression test
suite to run on my workstation, for when I want to run a "did I break
anything?" smoke test before pushing to CI.

- Test runtime was dominated by a couple of tests that run for longer
than all the others take together
- Test concurrency was limited to <16 by the ports-per-worker setting

There's no "right answer" for how long a test should
be, but as a rule of thumb, no one test should run
for much longer than the time it takes to run all the
other tests together.

## Summary of changes

- Make the ports per worker setting dynamic depending on worker count
- Modify the longest running tests to run for a shorter time
(`test_duplicate_layers` which uses a pgbench runtime) or fewer
iterations (`test_restarts_frequent_checkpoints`).
---
 test_runner/fixtures/neon_fixtures.py         | 28 +++++++------------
 test_runner/regress/test_duplicate_layers.py  |  2 +-
 .../regress/test_wal_acceptor_async.py        |  2 +-
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 79c1bb055b..cdda8c414e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -86,19 +86,6 @@ DEFAULT_OUTPUT_DIR: str = "test_output"
 DEFAULT_BRANCH_NAME: str = "main"
 
 BASE_PORT: int = 15000
-WORKER_PORT_NUM: int = 1000
-
-
-def pytest_configure(config: Config):
-    """
-    Check that we do not overflow available ports range.
-    """
-
-    numprocesses = config.getoption("numprocesses")
-    if (
-        numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768
-    ):  # do not use ephemeral ports
-        raise Exception("Too many workers configured. Cannot distribute ports for services.")
 
 
 @pytest.fixture(scope="session")
@@ -200,6 +187,11 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu
     return scope
 
 
+@pytest.fixture(scope="session")
+def worker_port_num():
+    return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1"))
+
+
 @pytest.fixture(scope="session")
 def worker_seq_no(worker_id: str) -> int:
     # worker_id is a pytest-xdist fixture
@@ -212,10 +204,10 @@ def worker_seq_no(worker_id: str) -> int:
 
 
 @pytest.fixture(scope="session")
-def worker_base_port(worker_seq_no: int) -> int:
-    # so we divide ports in ranges of 100 ports
+def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
+    # so we divide ports in ranges of ports
     # so workers have disjoint set of ports for services
-    return BASE_PORT + worker_seq_no * WORKER_PORT_NUM
+    return BASE_PORT + worker_seq_no * worker_port_num
 
 
 def get_dir_size(path: str) -> int:
@@ -229,8 +221,8 @@ def get_dir_size(path: str) -> int:
 
 
 @pytest.fixture(scope="session")
-def port_distributor(worker_base_port: int) -> PortDistributor:
-    return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
+def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
+    return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
 
 
 @pytest.fixture(scope="session")
diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py
index c1832a2063..7f76a8e042 100644
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -33,4 +33,4 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     time.sleep(10)  # let compaction to be performed
     assert env.pageserver.log_contains("compact-level0-phase1-return-same")
 
-    pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
+    pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr])
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index bb8ee8f52c..cfc131a3aa 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -245,7 +245,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
     # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments
     # are not removed before broadcasted to all safekeepers, with the help of replication slot
     asyncio.run(
-        run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5)
+        run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=4)
     )