mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 18:40:38 +00:00
Allow a longer timeout for starting the pageserver, safekeeper and storage controller in test cases to make them less flaky (#8079)
## Problem

See https://github.com/neondatabase/neon/issues/8070

## Summary of changes

The neon_local subcommands to

- start neon
- start pageserver
- start safekeeper
- start storage controller

get a new option `-t=xx` or `--start-timeout=xx`, which allows specifying a longer timeout, in seconds, to wait for the process to start. This is useful in test cases where the pageserver has to read a lot of layer data, such as the pagebench test cases.

In addition, we use the new timeout option in the Python test infrastructure (Python fixtures) and modify the flaky test case to increase the timeout from 10 seconds to 1 minute.

Example from the test execution:

```bash
RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
...
2024-06-19 09:29:34.590 INFO [neon_fixtures.py:1513] Running command "/instance_store/neon/target/release/neon_local storage_controller start --start-timeout=60s"
2024-06-19 09:29:36.365 INFO [broker.py:34] starting storage_broker to listen incoming connections at "127.0.0.1:15001"
2024-06-19 09:29:36.365 INFO [neon_fixtures.py:1513] Running command "/instance_store/neon/target/release/neon_local pageserver start --id=1 --start-timeout=60s"
2024-06-19 09:29:36.366 INFO [neon_fixtures.py:1513] Running command "/instance_store/neon/target/release/neon_local safekeeper start 1 --start-timeout=60s"
```
This commit is contained in:
@@ -1177,10 +1177,10 @@ class NeonEnv:
|
||||
force=config.config_init_force,
|
||||
)
|
||||
|
||||
def start(self):
|
||||
def start(self, timeout_in_seconds: Optional[int] = None):
|
||||
# Storage controller starts first, so that pageserver /re-attach calls don't
|
||||
# bounce through retries on startup
|
||||
self.storage_controller.start()
|
||||
self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
|
||||
|
||||
# Wait for storage controller readiness to prevent unnecessary post start-up
|
||||
# reconcile.
|
||||
@@ -1196,10 +1196,18 @@ class NeonEnv:
|
||||
) # The `or None` is for the linter
|
||||
|
||||
for pageserver in self.pageservers:
|
||||
futs.append(executor.submit(lambda ps=pageserver: ps.start()))
|
||||
futs.append(
|
||||
executor.submit(
|
||||
lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
|
||||
)
|
||||
)
|
||||
|
||||
for safekeeper in self.safekeepers:
|
||||
futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
|
||||
futs.append(
|
||||
executor.submit(
|
||||
lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
|
||||
)
|
||||
)
|
||||
|
||||
for f in futs:
|
||||
f.result()
|
||||
@@ -1783,8 +1791,13 @@ class NeonCli(AbstractNeonCli):
|
||||
res.check_returncode()
|
||||
return res
|
||||
|
||||
def storage_controller_start(self):
|
||||
def storage_controller_start(
|
||||
self,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
):
|
||||
cmd = ["storage_controller", "start"]
|
||||
if timeout_in_seconds is not None:
|
||||
cmd.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def storage_controller_stop(self, immediate: bool):
|
||||
@@ -1797,8 +1810,11 @@ class NeonCli(AbstractNeonCli):
|
||||
self,
|
||||
id: int,
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
start_args = ["pageserver", "start", f"--id={id}"]
|
||||
if timeout_in_seconds is not None:
|
||||
start_args.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
storage = self.env.pageserver_remote_storage
|
||||
|
||||
if isinstance(storage, S3Storage):
|
||||
@@ -1816,7 +1832,10 @@ class NeonCli(AbstractNeonCli):
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def safekeeper_start(
|
||||
self, id: int, extra_opts: Optional[List[str]] = None
|
||||
self,
|
||||
id: int,
|
||||
extra_opts: Optional[List[str]] = None,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
s3_env_vars = None
|
||||
if isinstance(self.env.safekeepers_remote_storage, S3Storage):
|
||||
@@ -1826,6 +1845,8 @@ class NeonCli(AbstractNeonCli):
|
||||
extra_opts = [f"-e={opt}" for opt in extra_opts]
|
||||
else:
|
||||
extra_opts = []
|
||||
if timeout_in_seconds is not None:
|
||||
extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
return self.raw_cli(
|
||||
["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
|
||||
)
|
||||
@@ -2077,9 +2098,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
|
||||
self.logfile = self.workdir / "storage_controller.log"
|
||||
|
||||
def start(self):
|
||||
def start(self, timeout_in_seconds: Optional[int] = None):
|
||||
assert not self.running
|
||||
self.env.neon_cli.storage_controller_start()
|
||||
self.env.neon_cli.storage_controller_start(timeout_in_seconds)
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
@@ -2531,6 +2552,7 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
def start(
|
||||
self,
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
) -> "NeonPageserver":
|
||||
"""
|
||||
Start the page server.
|
||||
@@ -2539,7 +2561,9 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
"""
|
||||
assert self.running is False
|
||||
|
||||
self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars)
|
||||
self.env.neon_cli.pageserver_start(
|
||||
self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
|
||||
)
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
@@ -2553,13 +2577,17 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
def restart(self, immediate: bool = False):
|
||||
def restart(
|
||||
self,
|
||||
immediate: bool = False,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
High level wrapper for restart: restarts the process, and waits for
|
||||
tenant state to stabilize.
|
||||
"""
|
||||
self.stop(immediate=immediate)
|
||||
self.start()
|
||||
self.start(timeout_in_seconds=timeout_in_seconds)
|
||||
self.quiesce_tenants()
|
||||
|
||||
def quiesce_tenants(self):
|
||||
@@ -3835,9 +3863,13 @@ class Safekeeper(LogUtils):
|
||||
self.running = running
|
||||
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
|
||||
|
||||
def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
|
||||
def start(
|
||||
self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
|
||||
) -> "Safekeeper":
|
||||
assert self.running is False
|
||||
self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
|
||||
self.env.neon_cli.safekeeper_start(
|
||||
self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
|
||||
)
|
||||
self.running = True
|
||||
# wait for wal acceptor start by checking its status
|
||||
started_at = time.time()
|
||||
|
||||
@@ -85,6 +85,8 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
|
||||
f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
|
||||
n_tenants,
|
||||
setup_wrapper,
|
||||
# https://github.com/neondatabase/neon/issues/8070
|
||||
timeout_in_seconds=60,
|
||||
)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
Utilities used by all code in this sub-directory
|
||||
"""
|
||||
|
||||
from typing import Any, Callable, Dict, Tuple
|
||||
from typing import Any, Callable, Dict, Optional, Tuple
|
||||
|
||||
import fixtures.pageserver.many_tenants as many_tenants
|
||||
from fixtures.common_types import TenantId, TimelineId
|
||||
@@ -41,6 +41,7 @@ def setup_pageserver_with_tenants(
|
||||
name: str,
|
||||
n_tenants: int,
|
||||
setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
) -> NeonEnv:
|
||||
"""
|
||||
Utility function to set up a pageserver with a given number of identical tenants.
|
||||
@@ -50,6 +51,6 @@ def setup_pageserver_with_tenants(
|
||||
return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)
|
||||
|
||||
env = neon_env_builder.build_and_use_snapshot(name, doit)
|
||||
env.start()
|
||||
env.start(timeout_in_seconds=timeout_in_seconds)
|
||||
ensure_pageserver_ready_for_benchmarking(env, n_tenants)
|
||||
return env
|
||||
|
||||
Reference in New Issue
Block a user