control_plane: database persistence for attachment_service (#6468)

## Problem

Spun off from https://github.com/neondatabase/neon/pull/6394 -- this PR
contains just the persistence parts, plus the changes that enable
persistence to work nicely.


## Summary of changes

- Revert #6444 and #6450
- In neon_local, start a vanilla postgres instance for the attachment
service to use.
- Adopt `diesel` crate for database access in attachment service. This
uses raw SQL migrations as the source of truth for the schema, so it's a
soft dependency: we can switch libraries pretty easily.
- Rewrite persistence.rs to use postgres (via diesel) instead of JSON.
- Preserve JSON read+write at startup and shutdown: this enables using
the JSON format in compatibility tests, so that we don't have to commit
to our DB schema yet.
- In neon_local, run database creation + migrations before starting
the attachment service.
- Run the initial reconciliation in Service::spawn in the background, so
that the pageserver + attachment service don't get stuck waiting for
each other to start, when restarting both together in a test.
This commit is contained in:
John Spray
2024-01-26 17:20:44 +00:00
committed by GitHub
parent dcc7610ad6
commit 58f6cb649e
28 changed files with 1168 additions and 471 deletions

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import abc
import asyncio
import concurrent.futures
import filecmp
import json
import os
@@ -993,6 +994,11 @@ class NeonEnv:
self.initial_timeline = config.initial_timeline
attachment_service_port = self.port_distributor.get_port()
# Reserve the next port after attachment service for use by its postgres: this
# will assert out if the next port wasn't free.
attachment_service_pg_port = self.port_distributor.get_port()
assert attachment_service_pg_port == attachment_service_port + 1
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
self.attachment_service: NeonAttachmentService = NeonAttachmentService(
self, config.auth_enabled
@@ -1071,16 +1077,27 @@ class NeonEnv:
self.neon_cli.init(cfg, force=config.config_init_force)
def start(self):
# Start up broker, pageserver and all safekeepers
self.broker.try_start()
# Attachment service starts first, so that pageserver /re-attach calls don't
# bounce through retries on startup
self.attachment_service.start()
for pageserver in self.pageservers:
pageserver.start()
# Start up broker, pageserver and all safekeepers
futs = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=2 + len(self.pageservers) + len(self.safekeepers)
) as executor:
futs.append(
executor.submit(lambda: self.broker.try_start() or None)
) # The `or None` is for the linter
for safekeeper in self.safekeepers:
safekeeper.start()
for pageserver in self.pageservers:
futs.append(executor.submit(lambda ps=pageserver: ps.start()))
for safekeeper in self.safekeepers:
futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
for f in futs:
f.result()
def stop(self, immediate=False, ps_assert_metric_no_errors=False):
"""
@@ -1652,8 +1669,10 @@ class NeonCli(AbstractNeonCli):
id: int,
overrides: Tuple[str, ...] = (),
extra_env_vars: Optional[Dict[str, str]] = None,
register: bool = True,
) -> "subprocess.CompletedProcess[str]":
start_args = ["pageserver", "start", f"--id={id}", *overrides]
register_str = "true" if register else "false"
start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
storage = self.env.pageserver_remote_storage
append_pageserver_param_overrides(
params_to_update=start_args,
@@ -2080,6 +2099,7 @@ class NeonPageserver(PgProtocol):
self,
overrides: Tuple[str, ...] = (),
extra_env_vars: Optional[Dict[str, str]] = None,
register: bool = True,
) -> "NeonPageserver":
"""
Start the page server.
@@ -2089,7 +2109,7 @@ class NeonPageserver(PgProtocol):
assert self.running is False
self.env.neon_cli.pageserver_start(
self.id, overrides=overrides, extra_env_vars=extra_env_vars
self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
)
self.running = True
return self

View File

@@ -138,6 +138,7 @@ def test_create_snapshot(
for sk in env.safekeepers:
sk.stop()
env.pageserver.stop()
env.attachment_service.stop()
# Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
compatibility_snapshot_dir = (
@@ -226,11 +227,17 @@ def test_forward_compatibility(
try:
neon_env_builder.num_safekeepers = 3
neon_local_binpath = neon_env_builder.neon_binpath
env = neon_env_builder.from_repo_dir(
compatibility_snapshot_dir / "repo",
neon_binpath=compatibility_neon_bin,
pg_distrib_dir=compatibility_postgres_distrib_dir,
)
# Use current neon_local even though we're using old binaries for
# everything else: our test code is written for latest CLI args.
env.neon_local_binpath = neon_local_binpath
neon_env_builder.start()
check_neon_works(

View File

@@ -499,7 +499,8 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
# and serve clients.
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
env.pageserver.start(
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
register=False,
)
# The pageserver should provide service to clients