tests: enable higher concurrency and adjust tests with outlier runtime (#4904)

## Problem

I spent a few minutes seeing how fast I could get our regression test
suite to run on my workstation, for when I want to run a "did I break
anything?" smoke test before pushing to CI.

- Test runtime was dominated by a couple of tests that run for longer
than all the others take together
- Test concurrency was limited to <16 by the ports-per-worker setting

There's no "right answer" for how long a test should
be, but as a rule of thumb, no one test should run
for much longer than the time it takes to run all the
other tests together.

## Summary of changes

- Make the ports per worker setting dynamic depending on worker count
- Modify the longest running tests to run for a shorter time
(`test_duplicate_layers` which uses a pgbench runtime) or fewer
iterations (`test_restarts_frequent_checkpoints`).
This commit is contained in:
John Spray
2023-08-08 09:16:21 +01:00
committed by GitHub
parent 9559ef6f3b
commit 33cb1e9c0c
3 changed files with 12 additions and 20 deletions

View File

@@ -86,19 +86,6 @@ DEFAULT_OUTPUT_DIR: str = "test_output"
DEFAULT_BRANCH_NAME: str = "main"
BASE_PORT: int = 15000
WORKER_PORT_NUM: int = 1000
def pytest_configure(config: Config):
"""
Check that we do not overflow available ports range.
"""
numprocesses = config.getoption("numprocesses")
if (
numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768
): # do not use ephemeral ports
raise Exception("Too many workers configured. Cannot distribute ports for services.")
@pytest.fixture(scope="session")
@@ -200,6 +187,11 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu
return scope
@pytest.fixture(scope="session")
def worker_port_num():
return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1"))
@pytest.fixture(scope="session")
def worker_seq_no(worker_id: str) -> int:
# worker_id is a pytest-xdist fixture
@@ -212,10 +204,10 @@ def worker_seq_no(worker_id: str) -> int:
@pytest.fixture(scope="session")
def worker_base_port(worker_seq_no: int) -> int:
# so we divide ports in ranges of 100 ports
def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
# so we divide ports in ranges of ports
# so workers have disjoint set of ports for services
return BASE_PORT + worker_seq_no * WORKER_PORT_NUM
return BASE_PORT + worker_seq_no * worker_port_num
def get_dir_size(path: str) -> int:
@@ -229,8 +221,8 @@ def get_dir_size(path: str) -> int:
@pytest.fixture(scope="session")
def port_distributor(worker_base_port: int) -> PortDistributor:
return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
@pytest.fixture(scope="session")

View File

@@ -33,4 +33,4 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
time.sleep(10) # let compaction to be performed
assert env.pageserver.log_contains("compact-level0-phase1-return-same")
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr])

View File

@@ -245,7 +245,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
# we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments
# are not removed before broadcasted to all safekeepers, with the help of replication slot
asyncio.run(
run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5)
run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=4)
)