storage controller: debug observability endpoints and self-test (#6820)

This PR stacks on https://github.com/neondatabase/neon/pull/6814

Observability:
- Because we only persist a subset of our state, and our external API is
pretty high level, it can be hard to get at the detail of what's going
on internally (e.g. the IntentState of a shard).
- Add debug endpoints for getting a full dump of all TenantState and
SchedulerNode objects
- Enrich the /control/v1/node listing endpoint to include full in-memory
detail of `Node` rather than just the `NodePersistence` subset

Consistency checks:
- The storage controller maintains separate in-memory and on-disk
states, by design. To catch subtle bugs, it is useful to occasionally
cross-check these.
- The Scheduler maintains reference counts for shard->node
relationships, which could drift if there was a bug in IntentState:
exhaustively cross-check them in tests.
This commit is contained in:
John Spray
2024-02-19 20:29:23 +00:00
committed by GitHub
parent 4f7704af24
commit 0c105ef352
11 changed files with 346 additions and 30 deletions

View File

@@ -2100,6 +2100,17 @@ class NeonAttachmentService(MetricsGetter):
log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
def consistency_check(self):
    """
    Ask the storage controller to cross-check its in-memory and persisted
    state, raising if any inconsistency is reported (non-2xx response).
    """
    endpoint = f"{self.env.attachment_service_api}/debug/v1/consistency_check"
    result = self.request("POST", endpoint)
    result.raise_for_status()
    log.info("Attachment service passed consistency check")
def __enter__(self) -> "NeonAttachmentService":
    """Context-manager entry: no setup performed, simply returns self."""
    return self

View File

@@ -83,6 +83,8 @@ def test_sharding_smoke(
)
assert timelines == {env.initial_timeline, timeline_b}
env.attachment_service.consistency_check()
def test_sharding_split_unsharded(
neon_env_builder: NeonEnvBuilder,
@@ -113,6 +115,8 @@ def test_sharding_split_unsharded(
workload.validate()
env.attachment_service.consistency_check()
def test_sharding_split_smoke(
neon_env_builder: NeonEnvBuilder,
@@ -278,3 +282,5 @@ def test_sharding_split_smoke(
)
is None
)
env.attachment_service.consistency_check()

View File

@@ -51,13 +51,13 @@ def test_sharding_service_smoke(
# The pageservers we started should have registered with the sharding service on startup
nodes = env.attachment_service.node_list()
assert len(nodes) == 2
assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
# Starting an additional pageserver should register successfully
env.pageservers[2].start()
nodes = env.attachment_service.node_list()
assert len(nodes) == 3
assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers}
assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}
# Use a multiple of pageservers to get nice even number of shards on each one
tenant_shard_count = len(env.pageservers) * 4
@@ -127,6 +127,8 @@ def test_sharding_service_smoke(
assert counts[env.pageservers[0].id] == tenant_shard_count // 2
assert counts[env.pageservers[2].id] == tenant_shard_count // 2
env.attachment_service.consistency_check()
def test_node_status_after_restart(
neon_env_builder: NeonEnvBuilder,
@@ -159,6 +161,8 @@ def test_node_status_after_restart(
# should have had its availability state set to Active.
env.attachment_service.tenant_create(TenantId.generate())
env.attachment_service.consistency_check()
def test_sharding_service_passthrough(
neon_env_builder: NeonEnvBuilder,
@@ -184,6 +188,8 @@ def test_sharding_service_passthrough(
}
assert status["state"]["slug"] == "Active"
env.attachment_service.consistency_check()
def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
@@ -216,6 +222,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
assert tenant_a not in observed
assert tenant_b in observed
env.attachment_service.consistency_check()
def test_sharding_service_onboarding(
neon_env_builder: NeonEnvBuilder,
@@ -318,6 +326,8 @@ def test_sharding_service_onboarding(
dest_ps.stop()
dest_ps.start()
env.attachment_service.consistency_check()
def test_sharding_service_compute_hook(
httpserver: HTTPServer,
@@ -388,6 +398,8 @@ def test_sharding_service_compute_hook(
wait_until(10, 1, received_restart_notification)
env.attachment_service.consistency_check()
def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
"""
@@ -401,13 +413,47 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
tenant_id = TenantId.generate()
env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)
# Check that the consistency check passes on a freshly setup system
env.attachment_service.consistency_check()
# These APIs are intentionally not implemented as methods on NeonAttachmentService, as
# they're just for use in unanticipated circumstances.
env.attachment_service.request(
# Initial tenant (1 shard) and the one we just created (2 shards) should be visible
response = env.attachment_service.request(
"GET", f"{env.attachment_service_api}/debug/v1/tenant"
)
response.raise_for_status()
assert len(response.json()) == 3
# Scheduler should report the expected nodes and shard counts
response = env.attachment_service.request(
"GET", f"{env.attachment_service_api}/debug/v1/scheduler"
)
response.raise_for_status()
# Two nodes, in a dict of node_id->node
assert len(response.json()["nodes"]) == 2
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
assert all(v["may_schedule"] for v in response.json()["nodes"].values())
response = env.attachment_service.request(
"POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop"
)
response.raise_for_status()
assert len(env.attachment_service.node_list()) == 1
env.attachment_service.request(
response = env.attachment_service.request(
"POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop"
)
response.raise_for_status()
# Tenant drop should be reflected in dump output
response = env.attachment_service.request(
"GET", f"{env.attachment_service_api}/debug/v1/tenant"
)
response.raise_for_status()
assert len(response.json()) == 1
# Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
# meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
env.attachment_service.consistency_check()