switch pytests and neon_local to control_plane_hooks_api (#11195)

We want to switch away from and deprecate the `--compute-hook-url` param
for the storcon in favour of `--control-plane-url` because it allows us
to construct urls with `notify-safekeepers`.

This PR switches the pytests and neon_local from a
`control_plane_compute_hook_api` to a new param named
`control_plane_hooks_api` which is supposed to point to the parent of
the `notify-attach` URL.

We still support reading the old url from disk to not be too disruptive
with existing deployments, but we just ignore it.

Also add docs for the `notify-safekeepers` upcall API.

Follow-up of #11173
Part of https://github.com/neondatabase/neon/issues/11163
This commit is contained in:
Arpad Müller
2025-03-13 20:50:52 +01:00
committed by GitHub
parent 8afae9d03c
commit b1a1be6a4c
11 changed files with 85 additions and 49 deletions

View File

@@ -979,7 +979,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_compute_hook_api: None,
control_plane_hooks_api: None,
generate_local_ssl_certs: false,
}
};

View File

@@ -72,9 +72,9 @@ pub struct LocalEnv {
// be propagated into each pageserver's configuration.
pub control_plane_api: Url,
// Control plane upcall API for storage controller. If set, this will be propagated into the
// Control plane upcall APIs for storage controller. If set, this will be propagated into the
// storage controller's configuration.
pub control_plane_compute_hook_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
@@ -104,6 +104,7 @@ pub struct OnDiskConfig {
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
// Note: skip serializing because in compat tests old storage controller fails
@@ -136,7 +137,7 @@ pub struct NeonLocalInitConf {
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Option<Url>>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
}
@@ -573,7 +574,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
control_plane_hooks_api,
control_plane_compute_hook_api: _,
branch_name_mappings,
generate_local_ssl_certs,
} = on_disk_config;
@@ -588,7 +590,7 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_compute_hook_api,
control_plane_hooks_api,
branch_name_mappings,
generate_local_ssl_certs,
}
@@ -695,7 +697,8 @@ impl LocalEnv {
pageservers: vec![], // it's skip_serializing anyway
safekeepers: self.safekeepers.clone(),
control_plane_api: Some(self.control_plane_api.clone()),
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
control_plane_hooks_api: self.control_plane_hooks_api.clone(),
control_plane_compute_hook_api: None,
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
},
@@ -779,8 +782,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
generate_local_ssl_certs,
control_plane_hooks_api,
} = conf;
// Find postgres binaries.
@@ -827,7 +830,7 @@ impl LocalEnv {
pageservers: pageservers.iter().map(Into::into).collect(),
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
control_plane_hooks_api,
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
};

View File

@@ -558,10 +558,8 @@ impl StorageController {
args.push(format!("--public-key=\"{public_key}\""));
}
if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
args.push(format!(
"--compute-hook-url={control_plane_compute_hook_api}"
));
if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api {
args.push(format!("--control-plane-url={control_plane_hooks_api}"));
}
if let Some(split_threshold) = self.config.split_threshold.as_ref() {

View File

@@ -101,15 +101,25 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--control-plane-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed.
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
Currently, there is two hooks, each computed by appending the name to the provided control plane URL prefix:
- `notify-attach`, called whenever attachment for pageservers changes
- `notify-safekeepers`, called whenever attachment for safekeepers changes
If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`.
The hooks will be invoked with a `PUT` request.
In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems,
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated.
### `notify-attach` body
The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
@@ -128,15 +138,15 @@ When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
- set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.stripe_size` to this value
- if stripe_size is not None, set `neon.shard_stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..
### Example notification body
Example body:
```
{
@@ -148,3 +158,34 @@ When a notification is received:
],
}
```
### `notify-safekeepers` body
The `notify-safekeepers` request body forllows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience.
```
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
pub struct SafekeepersNotifyRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The
safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper.
The hostname is provided for debugging purposes, so we reserve changes to how we pass it.
- set `neon.safekeepers_generation` to the provided `generation` value.
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..

View File

@@ -19,7 +19,7 @@ if TYPE_CHECKING:
class ComputeReconfigure:
def __init__(self, server: HTTPServer):
self.server = server
self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
self.control_plane_hooks_api = f"http://{server.host}:{server.port}/"
self.workloads: dict[TenantId, Any] = {}
self.on_notify: Callable[[Any], None] | None = None

View File

@@ -460,7 +460,7 @@ class NeonEnvBuilder:
self.overlay_mounts_created_by_us: list[tuple[str, Path]] = []
self.config_init_force: str | None = None
self.top_output_dir = top_output_dir
self.control_plane_compute_hook_api: str | None = None
self.control_plane_hooks_api: str | None = None
self.storage_controller_config: dict[Any, Any] | None = None
# Flag to enable https listener in pageserver, generate local ssl certs,
@@ -1116,7 +1116,7 @@ class NeonEnv:
self.control_plane_api: str = self.storage_controller.upcall_api_endpoint()
# For testing this with a fake HTTP server, enable passing through a URL from config
self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
self.control_plane_hooks_api = config.control_plane_hooks_api
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
@@ -1137,8 +1137,8 @@ class NeonEnv:
if self.control_plane_api is not None:
cfg["control_plane_api"] = self.control_plane_api
if self.control_plane_compute_hook_api is not None:
cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api
if self.control_plane_hooks_api is not None:
cfg["control_plane_hooks_api"] = self.control_plane_hooks_api
storage_controller_config = self.storage_controller_config

View File

@@ -83,9 +83,7 @@ def test_storage_controller_many_tenants(
"max_offline": "30s",
"max_warming_up": "300s",
}
neon_env_builder.control_plane_compute_hook_api = (
compute_reconfigure_listener.control_plane_compute_hook_api
)
neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api
AZS = ["alpha", "bravo", "charlie"]

View File

@@ -23,8 +23,8 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
)
env = neon_env_builder.init_start()
neon_env_builder.control_plane_compute_hook_api = (
f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach"
neon_env_builder.control_plane_hooks_api = (
f"http://{make_httpserver.host}:{make_httpserver.port}/"
)
def ignore_notify(request: Request):

View File

@@ -87,8 +87,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=s3_storage(),
)
neon_env_builder.control_plane_compute_hook_api = (
f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach"
neon_env_builder.control_plane_hooks_api = (
f"http://{make_httpserver.host}:{make_httpserver.port}/"
)
def ignore_notify(request: Request):

View File

@@ -794,7 +794,7 @@ def test_sharding_split_stripe_size(
Check that modifying stripe size inline with a shard split works as expected
"""
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
neon_env_builder.num_pageservers = 1
# Set up fake HTTP notify endpoint: we will use this to validate that we receive
@@ -806,7 +806,7 @@ def test_sharding_split_stripe_size(
notifications.append(request.json)
return Response(status=200)
httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
env = neon_env_builder.init_start(
initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size
@@ -1312,9 +1312,7 @@ def test_sharding_split_failures(
failure: Failure,
):
neon_env_builder.num_pageservers = 4
neon_env_builder.control_plane_compute_hook_api = (
compute_reconfigure_listener.control_plane_compute_hook_api
)
neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api
initial_shard_count = 2
split_shard_count = 4

View File

@@ -605,7 +605,7 @@ def test_storage_controller_compute_hook(
# when migrating.
neon_env_builder.num_pageservers = 2
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
# Set up fake HTTP notify endpoint
notifications = []
@@ -618,7 +618,7 @@ def test_storage_controller_compute_hook(
notifications.append(request.json)
return Response(status=status)
httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
# Start running
env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
@@ -724,7 +724,7 @@ def test_storage_controller_stuck_compute_hook(
neon_env_builder.num_pageservers = 2
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
handle_params = {"status": 200}
@@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook(
notifications.append(request.json)
return Response(status=status)
httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
# Start running
env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
@@ -871,7 +871,7 @@ def test_storage_controller_compute_hook_retry(
neon_env_builder.num_pageservers = 2
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
handle_params = {"status": 200}
@@ -883,7 +883,7 @@ def test_storage_controller_compute_hook_retry(
notifications.append(request.json)
return Response(status=status)
httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
# Start running
env = neon_env_builder.init_configs()
@@ -993,7 +993,7 @@ def test_storage_controller_compute_hook_revert(
# when migrating.
neon_env_builder.num_pageservers = 2
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
# Set up fake HTTP notify endpoint
notifications = []
@@ -1006,7 +1006,7 @@ def test_storage_controller_compute_hook_revert(
notifications.append(request.json)
return Response(status=status)
httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
# Start running
env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
@@ -1395,9 +1395,7 @@ def test_storage_controller_tenant_deletion(
"""
neon_env_builder.num_pageservers = 4
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.control_plane_compute_hook_api = (
compute_reconfigure_listener.control_plane_compute_hook_api
)
neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api
env = neon_env_builder.init_configs()
env.start()