diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b3ee045421..7416eac76d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1746,10 +1746,9 @@ class NeonCli(AbstractNeonCli): def pageserver_start( self, id: int, - overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", f"--id={id}", *overrides] + start_args = ["pageserver", "start", f"--id={id}"] storage = self.env.pageserver_remote_storage append_pageserver_param_overrides( params_to_update=start_args, @@ -2412,9 +2411,47 @@ class NeonPageserver(PgProtocol, LogUtils): return self.workdir / "tenants" return self.workdir / "tenants" / str(tenant_shard_id) + @property + def config_toml_path(self) -> Path: + return self.workdir / "pageserver.toml" + + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], bool]): + """ + Edit the pageserver's config toml file in place. + + The `edit_fn` is to manipulate the dict, and if it returns True, the file will be written. + If it returns False, no changes are made to the file system. + """ + path = self.config_toml_path + with open(path, "r") as f: + config = toml.load(f) + save = edit_fn(config) + if save: + with open(path, "w") as f: + toml.dump(config, f) + + def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: + """ + Non-recursively merge the given `patch` dict into the existing config toml, using `dict.update()`. + Returns the replaced values. + If there was no previous value, the key is mapped to None. + This allows to restore the original value by calling this method with the returned dict. + """ + replacements = {} + + def doit(config: Dict[str, Any]) -> bool: + while len(patch) > 0: + key, new = patch.popitem() + old = config.get(key, None) + config[key] = new + replacements[key] = old + return True + + self.edit_config_toml(doit) + return replacements + def start( self, - overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "NeonPageserver": """ @@ -2424,9 +2461,7 @@ class NeonPageserver(PgProtocol, LogUtils): """ assert self.running is False - self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars - ) + self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars) self.running = True return self diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 54905759bd..8709310748 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -140,10 +140,12 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier + def patch_default_tenant_config(config): + config["compaction_period"] = "3s" + config["gc_period"] = "0s" + return True + env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", - ), # this does print more than we want, but the number should be comparable between runs extra_env_vars={ "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index b83545216d..21fde449f1 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -45,17 +45,15 @@ def test_min_resident_size_override_handling( ps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, default_tenant_conf_value) - env.pageserver.stop() if config_level_override is not None: - env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ min_resident_size_override = " - + str(config_level_override) - + " }", - ) - ) - else: - env.pageserver.start() + + def set_min_resident_size(config): + config["tenant_config"] = {"min_resident_size": config_level_override} + return True + + env.pageserver.edit_config_toml(set_min_resident_size) + env.pageserver.stop() + env.pageserver.start() tenant_id, _ = env.neon_cli.create_tenant() assert_overrides(tenant_id, config_level_override) @@ -164,34 +162,32 @@ class EvictionEnv: usage eviction task is unknown; it might need to run one more iteration before assertions can be made. """ - disk_usage_config = { - "period": period, - "max_usage_pct": max_usage_pct, - "min_avail_bytes": min_avail_bytes, - "mock_statvfs": mock_behavior, - "eviction_order": eviction_order.config(), - } - - enc = toml.TomlEncoder() # these can sometimes happen during startup before any tenants have been # loaded, so nothing can be evicted, we just wait for next iteration which # is able to evict. pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") - pageserver.start( - overrides=( - "--pageserver-config-override=disk_usage_based_eviction=" - + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + pageserver.patch_config_toml_nonrecursive( + { + "disk_usage_based_eviction": { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), + }, # Disk usage based eviction runs as a background task. # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup. # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages. # But, we only have a 10-second-timeout in this test. # So, disable the delay for this test. - "--pageserver-config-override=background_task_maximum_delay='0s'", - ), + "background_task_maximum_delay": "0s", + } ) + pageserver.start() + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction for tenant_id, timeline_id in self.timelines: tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 67f68a62af..eaa81e2ac8 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -220,7 +220,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + replaced_config = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": "", + } + ) + env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) env.neon_cli.create_tenant( @@ -251,8 +256,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): assert parse_generation_suffix(key) is None env.pageserver.stop() - # Starting without the override that disabled control_plane_api + env.pageserver.patch_config_toml_nonrecursive(replaced_config) env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) @@ -525,9 +530,10 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # incident, but it might be unavoidable: if so, we want to be able to start up # and serve clients. env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP - env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), - ) + replaced = env.pageserver.patch_config_toml_nonrecursive({ + "control_plane_emergency_mode": True, + }) + env.pageserver.start() # The pageserver should provide service to clients generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) @@ -549,6 +555,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # The pageserver should work fine when subsequently restarted in non-emergency mode env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP + env.pageserver.patch_config_toml_nonrecursive(replaced) env.pageserver.start() generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index fdcb4cf9a4..f9130c00cc 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -290,9 +290,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # This is the pageserver where we'll initially create the tenant. Run it in emergency # mode so that it doesn't talk to storage controller, and do not register it. env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") - env.pageservers[0].start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), - ) + env.pageservers[0].patch_config_toml_nonrecursive({ + "control_plane_emergency_mode": True, + }) + env.pageservers[0].start() origin_ps = env.pageservers[0] # These are the pageservers managed by the sharding service, where the tenant