mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 07:00:38 +00:00
control_plane: database persistence for attachment_service (#6468)
## Problem Spun off from https://github.com/neondatabase/neon/pull/6394 -- this PR is just the persistence parts and the changes that enable it to work nicely ## Summary of changes - Revert #6444 and #6450 - In neon_local, start a vanilla postgres instance for the attachment service to use. - Adopt `diesel` crate for database access in attachment service. This uses raw SQL migrations as the source of truth for the schema, so it's a soft dependency: we can switch libraries pretty easily. - Rewrite persistence.rs to use postgres (via diesel) instead of JSON. - Preserve JSON read+write at startup and shutdown: this enables using the JSON format in compatibility tests, so that we don't have to commit to our DB schema yet. - In neon_local, run database creation + migrations before starting attachment service - Run the initial reconciliation in Service::spawn in the background, so that the pageserver + attachment service don't get stuck waiting for each other to start, when restarting both together in a test.
This commit is contained in:
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import filecmp
|
||||
import json
|
||||
import os
|
||||
@@ -993,6 +994,11 @@ class NeonEnv:
|
||||
self.initial_timeline = config.initial_timeline
|
||||
|
||||
attachment_service_port = self.port_distributor.get_port()
|
||||
# Reserve the next port after attachment service for use by its postgres: this
|
||||
# will assert out if the next port wasn't free.
|
||||
attachment_service_pg_port = self.port_distributor.get_port()
|
||||
assert attachment_service_pg_port == attachment_service_port + 1
|
||||
|
||||
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
|
||||
self.attachment_service: NeonAttachmentService = NeonAttachmentService(
|
||||
self, config.auth_enabled
|
||||
@@ -1071,16 +1077,27 @@ class NeonEnv:
|
||||
self.neon_cli.init(cfg, force=config.config_init_force)
|
||||
|
||||
def start(self):
|
||||
# Start up broker, pageserver and all safekeepers
|
||||
self.broker.try_start()
|
||||
|
||||
# Attachment service starts first, so that pageserver /re-attach calls don't
|
||||
# bounce through retries on startup
|
||||
self.attachment_service.start()
|
||||
|
||||
for pageserver in self.pageservers:
|
||||
pageserver.start()
|
||||
# Start up broker, pageserver and all safekeepers
|
||||
futs = []
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=2 + len(self.pageservers) + len(self.safekeepers)
|
||||
) as executor:
|
||||
futs.append(
|
||||
executor.submit(lambda: self.broker.try_start() or None)
|
||||
) # The `or None` is for the linter
|
||||
|
||||
for safekeeper in self.safekeepers:
|
||||
safekeeper.start()
|
||||
for pageserver in self.pageservers:
|
||||
futs.append(executor.submit(lambda ps=pageserver: ps.start()))
|
||||
|
||||
for safekeeper in self.safekeepers:
|
||||
futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
|
||||
|
||||
for f in futs:
|
||||
f.result()
|
||||
|
||||
def stop(self, immediate=False, ps_assert_metric_no_errors=False):
|
||||
"""
|
||||
@@ -1652,8 +1669,10 @@ class NeonCli(AbstractNeonCli):
|
||||
id: int,
|
||||
overrides: Tuple[str, ...] = (),
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
register: bool = True,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
start_args = ["pageserver", "start", f"--id={id}", *overrides]
|
||||
register_str = "true" if register else "false"
|
||||
start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
|
||||
storage = self.env.pageserver_remote_storage
|
||||
append_pageserver_param_overrides(
|
||||
params_to_update=start_args,
|
||||
@@ -2080,6 +2099,7 @@ class NeonPageserver(PgProtocol):
|
||||
self,
|
||||
overrides: Tuple[str, ...] = (),
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
register: bool = True,
|
||||
) -> "NeonPageserver":
|
||||
"""
|
||||
Start the page server.
|
||||
@@ -2089,7 +2109,7 @@ class NeonPageserver(PgProtocol):
|
||||
assert self.running is False
|
||||
|
||||
self.env.neon_cli.pageserver_start(
|
||||
self.id, overrides=overrides, extra_env_vars=extra_env_vars
|
||||
self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
|
||||
)
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
@@ -138,6 +138,7 @@ def test_create_snapshot(
|
||||
for sk in env.safekeepers:
|
||||
sk.stop()
|
||||
env.pageserver.stop()
|
||||
env.attachment_service.stop()
|
||||
|
||||
# Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
|
||||
compatibility_snapshot_dir = (
|
||||
@@ -226,11 +227,17 @@ def test_forward_compatibility(
|
||||
|
||||
try:
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
neon_local_binpath = neon_env_builder.neon_binpath
|
||||
env = neon_env_builder.from_repo_dir(
|
||||
compatibility_snapshot_dir / "repo",
|
||||
neon_binpath=compatibility_neon_bin,
|
||||
pg_distrib_dir=compatibility_postgres_distrib_dir,
|
||||
)
|
||||
|
||||
# Use current neon_local even though we're using old binaries for
|
||||
# everything else: our test code is written for latest CLI args.
|
||||
env.neon_local_binpath = neon_local_binpath
|
||||
|
||||
neon_env_builder.start()
|
||||
|
||||
check_neon_works(
|
||||
|
||||
@@ -499,7 +499,8 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
# and serve clients.
|
||||
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
|
||||
env.pageserver.start(
|
||||
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
|
||||
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
|
||||
register=False,
|
||||
)
|
||||
|
||||
# The pageserver should provide service to clients
|
||||
|
||||
Reference in New Issue
Block a user