refactor(pageserver): rely on serde derive for toml deserialization (#7656)

This PR simplifies the pageserver configuration parsing as follows:

* introduce the `pageserver_api::config::ConfigToml` type
* implement `Default` for `ConfigToml`
* use serde derive to do the brain-dead leg-work of processing the toml document
* use `serde(default)` to fill in default values
* in `pageserver` crate:
  * use `toml_edit` to deserialize the pageserver.toml string into a `ConfigToml`
  * `PageServerConfig::parse_and_validate` then
    * consumes the `ConfigToml`
    * destructures it exhaustively into its constituent fields
    * constructs the `PageServerConfig`

The rules are:

* in `ConfigToml`, use `deny_unknown_fields` everywhere
* static default values go in `pageserver_api`
* if there cannot be a static default value (e.g. which default IO engine to use, because it depends on the runtime), make the field in `ConfigToml` an `Option`
* if runtime-augmentation of a value is needed, do that in `parse_and_validate`
  * a good example is `virtual_file_io_engine` or `l0_flush`, both of which need to execute code to determine the effective value in `PageServerConf`

The benefits:

* massive amount of brain-dead repetitive code can be deleted
* "unused variable" compile-time errors when removing a config value, due to the exhaustive destructuring in `parse_and_validate`
* compile-time errors guide you when adding a new config field

Drawbacks:

* serde derive is sometimes a bit too magical
* `deny_unknown_fields` is easy to miss

Future Work / Benefits:

* make `neon_local` use `pageserver_api` to construct `ConfigToml` and write it to `pageserver.toml`
  * This provides more type safety / compile-time errors than the current approach.

### Refs

Fixes #3682

### Future Work

* `remote_storage` deserialization doesn't reject unknown fields: https://github.com/neondatabase/neon/issues/8915
* clean up `libs/pageserver_api/src/config.rs` further
  * break up into multiple files, at least for tenant config
  * move `models` as appropriate / refine distinction between config and API models / be explicit about when it's the same
  * use `pub(crate)` visibility on `mod defaults` to detect stale values
2026-01-08 05:52:55 +00:00 · 2024-09-05 14:59:49 +02:00
parent 6dfbf49128
commit 850421ec06
31 changed files with 1001 additions and 1656 deletions
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -24,7 +24,20 @@ from functools import cached_property, partial
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
 from urllib.parse import quote, urlparse

 import asyncpg
@@ -90,6 +103,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore  # reexport

 from .neon_api import NeonAPI, NeonApiEndpoint

+T = TypeVar("T")
+
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
 summoned by placing its name in the test's arguments.
@@ -2986,16 +3001,17 @@ class NeonPageserver(PgProtocol, LogUtils):
    def config_toml_path(self) -> Path:
        return self.workdir / "pageserver.toml"

-    def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]):
+    def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T:
        """
        Edit the pageserver's config toml file in place.
        """
        path = self.config_toml_path
        with open(path, "r") as f:
            config = toml.load(f)
-        edit_fn(config)
+        res = edit_fn(config)
        with open(path, "w") as f:
            toml.dump(config, f)
+        return res

    def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]:
        """
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
    # We will start a pageserver with no control_plane_api set, so it won't be able to self-register
    env.storage_controller.node_register(env.pageserver)

-    replaced_config = env.pageserver.patch_config_toml_nonrecursive(
-        {
-            "control_plane_api": "",
-        }
-    )
+    def remove_control_plane_api_field(config):
+        return config.pop("control_plane_api")
+
+    control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field)
    env.pageserver.start()
    env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})

@@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):

    env.pageserver.stop()
    # Starting without the override that disabled control_plane_api
-    env.pageserver.patch_config_toml_nonrecursive(replaced_config)
+    env.pageserver.patch_config_toml_nonrecursive(
+        {
+            "control_plane_api": control_plane_api,
+        }
+    )
    env.pageserver.start()

    generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):

    # We will run with the limit set to 1, so that once we have one tenant stuck
    # in a pausable failpoint, the rest are prevented from proceeding through warmup.
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"

    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
@@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):


 def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"

    env = neon_env_builder.init_start()

@@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"])
 def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str):
    # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"

    env = neon_env_builder.init_start()