mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
## Problem To test sharding, we need something to control it. We could write python code for doing this from the test runner, but this wouldn't be usable with neon_local run directly, and when we want to write tests with a large number of shards/tenants, Rust is a better fit for efficiently handling all the required state. This service enables automated tests to easily get a system with sharding/HA without the test itself having to set this all up by hand: existing tests can be run against sharded tenants just by setting a shard count when creating the tenant. ## Summary of changes Attachment service was previously a map of TenantId->TenantState, where the principal state stored for each tenant was the generation and the last attached pageserver. This enabled it to serve the re-attach and validate requests that the pageserver requires. In this PR, the scope of the service is extended substantially to do overall management of tenants in the pageserver, including tenant/timeline creation, live migration, evacuation of offline pageservers etc. This is done using synchronous code to make declarative changes to the tenant's intended state (`TenantState.policy` and `TenantState.intent`), which are then translated into calls into the pageserver by the `Reconciler`. Top level summary of modules within `control_plane/attachment_service/src`: - `tenant_state`: structure that represents one tenant shard. - `service`: implements the main high-level operations such as tenant/timeline creation, marking a node offline, etc. - `scheduler`: for operations that need to pick a pageserver for a tenant, construct a scheduler and call into it. - `compute_hook`: receive notifications when a tenant shard is attached somewhere new. Once we have locations for all the shards in a tenant, emit an update to postgres configuration via the neon_local `LocalEnv`. - `http`: HTTP stubs. 
These mostly map to methods on `Service`, but are separated for readability and so that it'll be easier to adapt if/when we switch to another RPC layer. - `node`: structure that describes a pageserver node. The most important attribute of a node is its availability: marking a node offline causes tenant shards to reschedule away from it. This PR is a precursor to implementing the full sharding service for prod (#6342). What's the difference between this and a production-ready controller for pageservers? - JSON file persistence to be replaced with a database - Limited observability. - No concurrency limits. Marking a pageserver offline will try and migrate every tenant to a new pageserver concurrently, even if there are thousands. - Very simple scheduler that only knows to pick the pageserver with fewest tenants, and place secondary locations on a different pageserver than attached locations: it does not try to place shards for the same tenant on different pageservers. This matters little in tests, because picking the least-used pageserver usually results in round-robin placement. - Scheduler state is rebuilt exhaustively for each operation that requires a scheduler. - Relies on neon_local mechanisms for updating postgres: in production this would be something that flows through the real control plane. --------- Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
233 lines
9.5 KiB
Python
233 lines
9.5 KiB
Python
import concurrent.futures
|
|
import os
|
|
from typing import List, Tuple
|
|
|
|
import pytest
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import (
|
|
Endpoint,
|
|
NeonEnv,
|
|
NeonEnvBuilder,
|
|
wait_for_last_flush_lsn,
|
|
)
|
|
from fixtures.pg_version import PgVersion
|
|
from fixtures.types import TenantId, TimelineId
|
|
|
|
|
|
def test_local_corruption(neon_env_builder: NeonEnvBuilder):
    """
    Damage local pageserver files while the pageserver is stopped, then check
    what happens to each tenant once it is started again.

    Three tenants are created and filled with 100 rows each, and their
    endpoints are stopped:

    - tenant 0 is left intact and must still return all rows;
    - tenant 1 has its local timeline ``metadata`` file overwritten with
      garbage and must still return all rows;
    - tenant 2 has its local layer files replaced with same-sized blank files,
      and starting its endpoint is expected to fail.
    """
    env = neon_env_builder.init_start()

    env.pageserver.allowed_errors.extend(
        [
            ".*get_value_reconstruct_data for layer .*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
            ".*load failed.*load local timeline.*",
            ".*layer loading failed permanently: load layer: .*",
        ]
    )

    tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []

    for _ in range(3):
        tenant_id, timeline_id = env.neon_cli.create_tenant()

        endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
        with endpoint.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
        endpoint.stop()
        tenant_timelines.append((tenant_id, timeline_id, endpoint))

    # Stop the pageserver -- this has to be not immediate or we need to wait for uploads
    env.pageserver.stop()

    # Leave the first timeline alone, but corrupt the others in different ways
    (tenant0, timeline0, pg0) = tenant_timelines[0]
    log.info(f"Timeline {tenant0}/{timeline0} is left intact")

    (tenant1, timeline1, pg1) = tenant_timelines[1]
    metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
    with open(metadata_path, "w") as f:
        f.write("overwritten with garbage!")
    log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")

    (tenant2, timeline2, pg2) = tenant_timelines[2]
    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
    for filename in os.listdir(timeline_path):
        if filename.startswith("00000"):
            # Looks like a layer file. Corrupt it
            layer_path = os.path.join(timeline_path, filename)
            size = os.path.getsize(layer_path)
            # Opening with "wb" already empties the file; truncate(size) then
            # grows it back to its original length, so only the contents are
            # lost (the new bytes are zero-filled on most platforms).
            with open(layer_path, "wb") as f:
                f.truncate(size)
    log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")

    env.pageserver.start()

    # Un-damaged tenant works
    pg0.start()
    assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100

    # Tenant with corrupt local metadata works: remote storage is authoritative for metadata
    pg1.start()
    assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100

    # The third tenant's compute will fail during basebackup, because the local layer file is corrupt.
    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
    # (We don't check layer file contents on startup, when loading the timeline)
    #
    # This will change when we implement checksums for layers
    with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err:
        pg2.start()
    # Log the exception itself (err.value), not the pytest ExceptionInfo wrapper.
    log.info(
        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} "
        f"with corrupt layers: {err.value}"
    )
|
|
|
|
|
|
def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
    """Create four timelines on one fresh tenant from four worker threads at once."""
    env = neon_simple_env

    tenant_id, _ = env.neon_cli.create_tenant()

    branch_names = [f"test-create-multiple-timelines-{i}" for i in range(4)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        # Submit every creation before waiting on any of them, so that the
        # requests can overlap.
        pending = [
            pool.submit(env.neon_cli.create_timeline, name, tenant_id) for name in branch_names
        ]
        # result() re-raises whatever the worker raised, which fails the test.
        for task in pending:
            task.result()
|
|
|
|
|
|
def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder):
    """
    Fail a timeline creation at the ``before-checkpoint-new-timeline``
    failpoint, restart the pageserver, and check that the tenant's timeline
    list and its on-disk timeline directories are the same as before.
    """
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
            ".*Timeline got dropped without initializing, cleaning its files.*",
        ]
    )

    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    # iterdir() yields entries in arbitrary order, so sort both snapshots to
    # keep the before/after comparison independent of listing order.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
        pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())

    # Restart the page server
    env.pageserver.restart(immediate=True)

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"
|
|
|
|
|
|
# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups
|
|
@pytest.mark.parametrize("exit_or_return", ["return", "exit"])
def test_timeline_init_break_before_checkpoint_recreate(
    neon_env_builder: NeonEnvBuilder, exit_or_return: str
):
    """
    Fail a timeline creation at the ``before-checkpoint-new-timeline``
    failpoint, restart the pageserver, check that nothing was left behind, and
    then create the timeline again with the same fixed timeline ID.

    ``exit_or_return`` is the action configured on the failpoint.
    """
    env = neon_env_builder.init_configs()
    env.start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
            ".*Timeline got dropped without initializing, cleaning its files.*",
            ".*Failed to load index_part from remote storage, failed creation?.*",
        ]
    )

    env.neon_cli.create_tenant(env.initial_tenant)
    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    # iterdir() yields entries in arbitrary order, so sort both snapshots to
    # keep the before/after comparison independent of listing order.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Some fixed timeline ID (like control plane does)
    timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")

    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    failpoint = "before-checkpoint-new-timeline"
    pattern = failpoint
    if exit_or_return == "exit":
        # in reality a read error happens, but there are automatic retries which now fail because pageserver is dead
        pattern = "Connection aborted."

    pageserver_http.configure_failpoints((failpoint, exit_or_return))
    with pytest.raises(Exception, match=pattern):
        pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)

    # Restart the page server (with the failpoint disabled)
    env.pageserver.restart(immediate=True)

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"

    # creating the branch should have worked now
    new_timeline_id = TimelineId(
        pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"]
    )

    assert timeline_id == new_timeline_id
|
|
|
|
|
|
def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder):
    """
    Fail a timeline creation at the ``after-timeline-uninit-mark-creation``
    failpoint and check, without restarting the pageserver, that the tenant's
    timeline list and its on-disk timeline directories are the same as before.
    """
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    # iterdir() yields entries in arbitrary order, so sort both snapshots to
    # keep the before/after comparison independent of listing order.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Introduce failpoint when creating a new timeline uninit mark, before any other files were created
    pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
    with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
        pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    # "New" timeline is not present in the list, allowing pageserver to retry the same request
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    # This test never restarts the pageserver, so the message does not mention a restart.
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver should not list non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"
|