Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-08 05:52:55 +00:00)
storage controller: debug observability endpoints and self-test (#6820)
This PR stacks on https://github.com/neondatabase/neon/pull/6814

Observability:
- Because we only persist a subset of our state, and our external API is pretty high level, it can be hard to get at the detail of what's going on internally (e.g. the IntentState of a shard).
- Add debug endpoints for getting a full dump of all TenantState and SchedulerNode objects.
- Enrich the /control/v1/node listing endpoint to include the full in-memory detail of `Node`, rather than just the `NodePersistence` subset.

Consistency checks:
- The storage controller maintains separate in-memory and on-disk states, by design. To catch subtle bugs, it is useful to occasionally cross-check these.
- The Scheduler maintains reference counts for shard->node relationships, which could drift if there was a bug in IntentState: exhaustively cross-check them in tests.
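For orientation, here is a minimal sketch of exercising the new debug endpoints by hand, outside the test fixtures changed below. The listen address is an assumption (substitute whatever URL your storage controller / attachment service actually serves on); the paths and response shapes mirror what the tests in this PR assert.

```python
# Hedged sketch only: assumes a storage controller (attachment service)
# reachable at STORAGE_CONTROLLER_API; substitute your real address.
import requests

STORAGE_CONTROLLER_API = "http://127.0.0.1:1234"  # hypothetical listen address

# Dump of all in-memory TenantState objects (one entry per tenant shard)
tenants = requests.get(f"{STORAGE_CONTROLLER_API}/debug/v1/tenant")
tenants.raise_for_status()
print(f"{len(tenants.json())} tenant shards tracked in memory")

# Scheduler dump: a dict of node_id -> node, each with shard_count and may_schedule
scheduler = requests.get(f"{STORAGE_CONTROLLER_API}/debug/v1/scheduler")
scheduler.raise_for_status()
for node_id, node in scheduler.json()["nodes"].items():
    print(f"node {node_id}: {node['shard_count']} shards, may_schedule={node['may_schedule']}")

# Cross-check in-memory state against the database: any inconsistency
# results in a non-2xx response, which raise_for_status() turns into an error.
requests.post(f"{STORAGE_CONTROLLER_API}/debug/v1/consistency_check").raise_for_status()
```

In the test suite the same consistency check is wrapped by the new `NeonAttachmentService.consistency_check()` helper, as the diff below shows.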
@@ -2100,6 +2100,17 @@ class NeonAttachmentService(MetricsGetter):
         log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
         assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
 
+    def consistency_check(self):
+        """
+        Throw an exception if the service finds any inconsistencies in its state
+        """
+        response = self.request(
+            "POST",
+            f"{self.env.attachment_service_api}/debug/v1/consistency_check",
+        )
+        response.raise_for_status()
+        log.info("Attachment service passed consistency check")
+
     def __enter__(self) -> "NeonAttachmentService":
         return self
@@ -83,6 +83,8 @@ def test_sharding_smoke(
     )
     assert timelines == {env.initial_timeline, timeline_b}
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_split_unsharded(
     neon_env_builder: NeonEnvBuilder,
@@ -113,6 +115,8 @@ def test_sharding_split_unsharded(
 
     workload.validate()
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_split_smoke(
     neon_env_builder: NeonEnvBuilder,
@@ -278,3 +282,5 @@ def test_sharding_split_smoke(
         )
         is None
     )
+
+    env.attachment_service.consistency_check()
@@ -51,13 +51,13 @@ def test_sharding_service_smoke(
     # The pageservers we started should have registered with the sharding service on startup
     nodes = env.attachment_service.node_list()
     assert len(nodes) == 2
-    assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
+    assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
 
     # Starting an additional pageserver should register successfully
     env.pageservers[2].start()
     nodes = env.attachment_service.node_list()
     assert len(nodes) == 3
-    assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers}
+    assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}
 
     # Use a multiple of pageservers to get nice even number of shards on each one
     tenant_shard_count = len(env.pageservers) * 4
@@ -127,6 +127,8 @@ def test_sharding_service_smoke(
     assert counts[env.pageservers[0].id] == tenant_shard_count // 2
     assert counts[env.pageservers[2].id] == tenant_shard_count // 2
 
+    env.attachment_service.consistency_check()
+
 
 def test_node_status_after_restart(
     neon_env_builder: NeonEnvBuilder,
@@ -159,6 +161,8 @@ def test_node_status_after_restart(
     # should have had its availabilty state set to Active.
     env.attachment_service.tenant_create(TenantId.generate())
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_passthrough(
     neon_env_builder: NeonEnvBuilder,
@@ -184,6 +188,8 @@ def test_sharding_service_passthrough(
     }
     assert status["state"]["slug"] == "Active"
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
@@ -216,6 +222,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     assert tenant_a not in observed
     assert tenant_b in observed
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_onboarding(
     neon_env_builder: NeonEnvBuilder,
@@ -318,6 +326,8 @@ def test_sharding_service_onboarding(
     dest_ps.stop()
     dest_ps.start()
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_compute_hook(
     httpserver: HTTPServer,
@@ -388,6 +398,8 @@ def test_sharding_service_compute_hook(
 
     wait_until(10, 1, received_restart_notification)
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
     """
@@ -401,13 +413,47 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)
 
+    # Check that the consistency check passes on a freshly setup system
+    env.attachment_service.consistency_check()
+
     # These APIs are intentionally not implemented as methods on NeonAttachmentService, as
     # they're just for use in unanticipated circumstances.
-    env.attachment_service.request(
+
+    # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
+    response = env.attachment_service.request(
+        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+    )
+    response.raise_for_status()
+    assert len(response.json()) == 3
+
+    # Scheduler should report the expected nodes and shard counts
+    response = env.attachment_service.request(
+        "GET", f"{env.attachment_service_api}/debug/v1/scheduler"
+    )
+    response.raise_for_status()
+    # Two nodes, in a dict of node_id->node
+    assert len(response.json()["nodes"]) == 2
+    assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
+    assert all(v["may_schedule"] for v in response.json()["nodes"].values())
+
+    response = env.attachment_service.request(
         "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop"
     )
+    response.raise_for_status()
     assert len(env.attachment_service.node_list()) == 1
 
-    env.attachment_service.request(
+    response = env.attachment_service.request(
        "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop"
     )
+    response.raise_for_status()
+
+    # Tenant drop should be reflected in dump output
+    response = env.attachment_service.request(
+        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+    )
+    response.raise_for_status()
+    assert len(response.json()) == 1
+
+    # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
+    # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
+    env.attachment_service.consistency_check()