Allow changing the compute safekeeper list without a restart.

- Add --safekeepers option to neon_local reconfigure
- Add it to python Endpoint reconfigure
- Implement config reload in walproposer by restarting the whole bgw when
  the safekeeper list changes.

ref https://github.com/neondatabase/neon/issues/6341
This commit is contained in:
Arseny Sher
2024-06-17 16:23:07 +03:00
committed by Arseny Sher
parent d557002675
commit 6f20a18e8e
6 changed files with 139 additions and 55 deletions

View File

@@ -1933,6 +1933,7 @@ class NeonCli(AbstractNeonCli):
endpoint_id: str,
tenant_id: Optional[TenantId] = None,
pageserver_id: Optional[int] = None,
safekeepers: Optional[List[int]] = None,
check_return_code=True,
) -> "subprocess.CompletedProcess[str]":
args = ["endpoint", "reconfigure", endpoint_id]
@@ -1940,6 +1941,8 @@ class NeonCli(AbstractNeonCli):
args.extend(["--tenant-id", str(tenant_id)])
if pageserver_id is not None:
args.extend(["--pageserver-id", str(pageserver_id)])
if safekeepers is not None:
args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
return self.raw_cli(args, check_return_code=check_return_code)
def endpoint_stop(
@@ -3484,6 +3487,7 @@ class Endpoint(PgProtocol, LogUtils):
self.pg_port = pg_port
self.http_port = http_port
self.check_stop_result = check_stop_result
# passed to endpoint create and endpoint reconfigure
self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
# path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
@@ -3552,6 +3556,7 @@ class Endpoint(PgProtocol, LogUtils):
self,
remote_ext_config: Optional[str] = None,
pageserver_id: Optional[int] = None,
safekeepers: Optional[List[int]] = None,
allow_multiple: bool = False,
) -> "Endpoint":
"""
@@ -3561,6 +3566,11 @@ class Endpoint(PgProtocol, LogUtils):
assert self.endpoint_id is not None
# If `safekeepers` is not None, remember them as active and use them
# in the following commands.
if safekeepers is not None:
self.active_safekeepers = safekeepers
log.info(f"Starting postgres endpoint {self.endpoint_id}")
self.env.neon_cli.endpoint_start(
@@ -3624,9 +3634,17 @@ class Endpoint(PgProtocol, LogUtils):
def is_running(self):
return self._running._value > 0
def reconfigure(self, pageserver_id: Optional[int] = None):
def reconfigure(
self, pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None
):
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
# If `safekeepers` is not None, remember them as active and use them
# in the following commands.
if safekeepers is not None:
self.active_safekeepers = safekeepers
self.env.neon_cli.endpoint_reconfigure(
self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers
)
def respec(self, **kwargs):
"""Update the endpoint.json file used by control_plane."""

View File

@@ -1725,7 +1725,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
# Basic pull_timeline test.
def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
# When live_sk_change is False, compute is restarted to change the set of
# safekeepers; otherwise it is a live reload.
@pytest.mark.parametrize("live_sk_change", [False, True])
def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool):
neon_env_builder.auth_enabled = True
def execute_payload(endpoint: Endpoint):
@@ -1758,8 +1761,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
log.info("Use only first 3 safekeepers")
env.safekeepers[3].stop()
endpoint = env.endpoints.create("main")
endpoint.active_safekeepers = [1, 2, 3]
endpoint.start()
endpoint.start(safekeepers=[1, 2, 3])
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)
@@ -1771,29 +1773,22 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
log.info("Initialize new safekeeper 4, pull data from 1 & 3")
env.safekeepers[3].start()
res = (
env.safekeepers[3]
.http_client(auth_token=env.auth_keys.generate_safekeeper_token())
.pull_timeline(
{
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"http_hosts": [
f"http://localhost:{env.safekeepers[0].port.http}",
f"http://localhost:{env.safekeepers[2].port.http}",
],
}
)
res = env.safekeepers[3].pull_timeline(
[env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id
)
log.info("Finished pulling timeline")
log.info(res)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Restarting compute with new config to verify that it works")
endpoint.stop_and_destroy().create("main")
endpoint.active_safekeepers = [1, 3, 4]
endpoint.start()
action = "reconfiguing" if live_sk_change else "restarting"
log.info(f"{action} compute with new config to verify that it works")
new_sks = [1, 3, 4]
if not live_sk_change:
endpoint.stop_and_destroy().create("main")
endpoint.start(safekeepers=new_sks)
else:
endpoint.reconfigure(safekeepers=new_sks)
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)