From b1a1be6a4cb91d6a5af9f643a16f786eeb128c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 13 Mar 2025 20:50:52 +0100 Subject: [PATCH] switch pytests and neon_local to control_plane_hooks_api (#11195) We want to switch away from and deprecate the `--compute-hook-url` param for the storcon in favour of `--control-plane-url` because it allows us to construct urls with `notify-safekeepers`. This PR switches the pytests and neon_local from a `control_plane_compute_hook_api` to a new param named `control_plane_hooks_api` which is supposed to point to the parent of the `notify-attach` URL. We still support reading the old url from disk to not be too disruptive with existing deployments, but we just ignore it. Also add docs for the `notify-safekeepers` upcall API. Follow-up of #11173 Part of https://github.com/neondatabase/neon/issues/11163 --- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/local_env.rs | 19 ++++--- control_plane/src/storage_controller.rs | 6 +- docs/storage_controller.md | 57 ++++++++++++++++--- test_runner/fixtures/compute_reconfigure.py | 2 +- test_runner/fixtures/neon_fixtures.py | 8 +-- .../test_storage_controller_scale.py | 4 +- test_runner/regress/test_change_pageserver.py | 4 +- .../regress/test_pageserver_secondary.py | 4 +- test_runner/regress/test_sharding.py | 8 +-- .../regress/test_storage_controller.py | 20 +++---- 11 files changed, 85 insertions(+), 49 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 72ebbafd3b..747268f80b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -979,7 +979,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { neon_distrib_dir: None, default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), storage_controller: None, - control_plane_compute_hook_api: None, + control_plane_hooks_api: None, generate_local_ssl_certs: false, } }; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ec9eb74e6f..2e57236ddb 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -72,9 +72,9 @@ pub struct LocalEnv { // be propagated into each pageserver's configuration. pub control_plane_api: Url, - // Control plane upcall API for storage controller. If set, this will be propagated into the + // Control plane upcall APIs for storage controller. If set, this will be propagated into the // storage controller's configuration. - pub control_plane_compute_hook_api: Option, + pub control_plane_hooks_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. // A `HashMap>` would be more appropriate here, @@ -104,6 +104,7 @@ pub struct OnDiskConfig { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, + pub control_plane_hooks_api: Option, pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, // Note: skip serializing because in compat tests old storage controller fails @@ -136,7 +137,7 @@ pub struct NeonLocalInitConf { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, - pub control_plane_compute_hook_api: Option>, + pub control_plane_hooks_api: Option, pub generate_local_ssl_certs: bool, } @@ -573,7 +574,8 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, + control_plane_hooks_api, + control_plane_compute_hook_api: _, branch_name_mappings, generate_local_ssl_certs, } = on_disk_config; @@ -588,7 +590,7 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api, + control_plane_hooks_api, branch_name_mappings, generate_local_ssl_certs, } @@ -695,7 +697,8 @@ impl LocalEnv { pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), control_plane_api: Some(self.control_plane_api.clone()), - control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + control_plane_hooks_api: self.control_plane_hooks_api.clone(), + control_plane_compute_hook_api: None, branch_name_mappings: self.branch_name_mappings.clone(), generate_local_ssl_certs: self.generate_local_ssl_certs, }, @@ -779,8 +782,8 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, generate_local_ssl_certs, + control_plane_hooks_api, } = conf; // Find postgres binaries. @@ -827,7 +830,7 @@ impl LocalEnv { pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + control_plane_hooks_api, branch_name_mappings: Default::default(), generate_local_ssl_certs, }; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index bbd7f67720..e28fd70fdf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -558,10 +558,8 @@ impl StorageController { args.push(format!("--public-key=\"{public_key}\"")); } - if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { - args.push(format!( - "--compute-hook-url={control_plane_compute_hook_api}" - )); + if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api { + args.push(format!("--control-plane-url={control_plane_hooks_api}")); } if let Some(split_threshold) = self.config.split_threshold.as_ref() { diff --git a/docs/storage_controller.md b/docs/storage_controller.md index cf00cd8e33..ac4aca4219 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -101,15 +101,25 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver location changes. -The hook is configured using the storage controller's `--control-plane-url` CLI option. If the hook requires -JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. +The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed. -In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems +Currently, there is two hooks, each computed by appending the name to the provided control plane URL prefix: + +- `notify-attach`, called whenever attachment for pageservers changes +- `notify-safekeepers`, called whenever attachment for safekeepers changes + +If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`. +The hooks will be invoked with a `PUT` request. + +In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems, the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling the compute hook. -When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: -the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated. + +### `notify-attach` body + +The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience. ``` struct ComputeHookNotifyRequestShard { @@ -128,15 +138,15 @@ When a notification is received: 1. Modify postgres configuration for this tenant: - - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The + - set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The shards identified by `NodeId` must be converted to the address+port of the node. - - if stripe_size is not None, set `neon.stripe_size` to this value + - if stripe_size is not None, set `neon.shard_stripe_size` to this value 2. Send SIGHUP to postgres to reload configuration 3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller will retry the notification until it succeeds.. -### Example notification body +Example body: ``` { @@ -148,3 +158,34 @@ When a notification is received: ], } ``` + +### `notify-safekeepers` body + +The `notify-safekeepers` request body forllows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience. + +``` +pub struct SafekeeperInfo { + pub id: NodeId, + pub hostname: String, +} + +pub struct SafekeepersNotifyRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub generation: u32, + pub safekeepers: Vec, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The + safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper. + The hostname is provided for debugging purposes, so we reserve changes to how we pass it. + - set `neon.safekeepers_generation` to the provided `generation` value. + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds.. \ No newline at end of file diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 425abef935..205b9141e0 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -19,7 +19,7 @@ if TYPE_CHECKING: class ComputeReconfigure: def __init__(self, server: HTTPServer): self.server = server - self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" + self.control_plane_hooks_api = f"http://{server.host}:{server.port}/" self.workloads: dict[TenantId, Any] = {} self.on_notify: Callable[[Any], None] | None = None diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7bc746d668..11ca1d7913 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -460,7 +460,7 @@ class NeonEnvBuilder: self.overlay_mounts_created_by_us: list[tuple[str, Path]] = [] self.config_init_force: str | None = None self.top_output_dir = top_output_dir - self.control_plane_compute_hook_api: str | None = None + self.control_plane_hooks_api: str | None = None self.storage_controller_config: dict[Any, Any] | None = None # Flag to enable https listener in pageserver, generate local ssl certs, @@ -1116,7 +1116,7 @@ class NeonEnv: self.control_plane_api: str = self.storage_controller.upcall_api_endpoint() # For testing this with a fake HTTP server, enable passing through a URL from config - self.control_plane_compute_hook_api = config.control_plane_compute_hook_api + self.control_plane_hooks_api = config.control_plane_hooks_api self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode @@ -1137,8 +1137,8 @@ class NeonEnv: if self.control_plane_api is not None: cfg["control_plane_api"] = self.control_plane_api - if self.control_plane_compute_hook_api is not None: - cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.control_plane_hooks_api is not None: + cfg["control_plane_hooks_api"] = self.control_plane_hooks_api storage_controller_config = self.storage_controller_config diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 777b9e2870..e897d53cc8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -83,9 +83,7 @@ def test_storage_controller_many_tenants( "max_offline": "30s", "max_warming_up": "300s", } - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api AZS = ["alpha", "bravo", "charlie"] diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 41aa5b47ca..5526b783d5 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -23,8 +23,8 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): ) env = neon_env_builder.init_start() - neon_env_builder.control_plane_compute_hook_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 130db009c9..9f2aa5df8c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -87,8 +87,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) - neon_env_builder.control_plane_compute_hook_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index cb28f5b12d..b98ac8e50a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -794,7 +794,7 @@ def test_sharding_split_stripe_size( Check that modifying stripe size inline with a shard split works as expected """ (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" neon_env_builder.num_pageservers = 1 # Set up fake HTTP notify endpoint: we will use this to validate that we receive @@ -806,7 +806,7 @@ def test_sharding_split_stripe_size( notifications.append(request.json) return Response(status=200) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) env = neon_env_builder.init_start( initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size @@ -1312,9 +1312,7 @@ def test_sharding_split_failures( failure: Failure, ): neon_env_builder.num_pageservers = 4 - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api initial_shard_count = 2 split_shard_count = 4 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 5eaf69cfa1..05eb4301b0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -605,7 +605,7 @@ def test_storage_controller_compute_hook( # when migrating. neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -618,7 +618,7 @@ def test_storage_controller_compute_hook( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -724,7 +724,7 @@ def test_storage_controller_stuck_compute_hook( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -871,7 +871,7 @@ def test_storage_controller_compute_hook_retry( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -883,7 +883,7 @@ def test_storage_controller_compute_hook_retry( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_configs() @@ -993,7 +993,7 @@ def test_storage_controller_compute_hook_revert( # when migrating. neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -1006,7 +1006,7 @@ def test_storage_controller_compute_hook_revert( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -1395,9 +1395,7 @@ def test_storage_controller_tenant_deletion( """ neon_env_builder.num_pageservers = 4 neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() env.start()