storage_controller: periodic pageserver heartbeats (#7092)

## Problem

If a pageserver was offline when the storage controller started, there was no mechanism to update the storage controller state when the pageserver became active again.

## Summary of changes

* Add a heartbeater module. The heartbeater must be driven by an external loop.
* Integrate the heartbeater into the service.
  - Extend the types used by the service and scheduler to keep track of a node's utilisation score.
  - Add a background loop to drive the heartbeater and update the state based on the deltas it generates.
  - Do an initial round of heartbeats at start-up.
2026-05-29 11:00:38 +00:00 · 2024-03-14 15:21:36 +00:00
parent 9fe0193e51
commit 38767ace68
17 changed files with 779 additions and 88 deletions
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2088,6 +2088,14 @@ class NeonStorageController(MetricsGetter):
        )
        return response.json()

+    def tenant_list(self):
+        """
+        Return the storage controller's debug listing of tenants.
+
+        Sends an admin-scoped GET to `/debug/v1/tenant` and returns the decoded JSON body.
+        """
+        listing_url = f"{self.env.storage_controller_api}/debug/v1/tenant"
+        admin_headers = self.headers(TokenScope.ADMIN)
+        return self.request("GET", listing_url, headers=admin_headers).json()
+
    def node_configure(self, node_id, body: dict[str, Any]):
        log.info(f"node_configure({node_id}, {body})")
        body["node_id"] = node_id
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -209,10 +209,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
    env.storage_controller.node_register(env.pageserver)

    env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
+    env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})

    env.neon_cli.create_tenant(
        tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
    )
+
    generate_uploads_and_deletions(env, pageserver=env.pageserver)

    def parse_generation_suffix(key):
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -769,3 +769,172 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
    assert "pitr_interval" not in readback_ps.tenant_specific_overrides

    env.storage_controller.consistency_check()
+
+
+class Failure:
+    """
+    Interface for a failure that a test can inject into one pageserver and later undo.
+
+    Subclasses set `pageserver_id` and override both `apply` and `clear`.
+    """
+
+    # Id of the pageserver this failure targets
+    pageserver_id: int
+
+    def apply(self, env: NeonEnv):
+        """Inject the failure into `env`. Must be overridden by subclasses."""
+        raise NotImplementedError()
+
+    def clear(self, env: NeonEnv):
+        """Undo the failure previously injected into `env`. Must be overridden by subclasses."""
+        raise NotImplementedError()
+
+
+class NodeStop(Failure):
+    """Failure that takes a pageserver down by stopping it."""
+
+    def __init__(self, pageserver_id, immediate):
+        # `immediate` is passed through unchanged to `pageserver.stop()`
+        self.immediate = immediate
+        self.pageserver_id = pageserver_id
+
+    def apply(self, env: NeonEnv):
+        """Stop the targeted pageserver."""
+        env.get_pageserver(self.pageserver_id).stop(immediate=self.immediate)
+
+    def clear(self, env: NeonEnv):
+        """Start the targeted pageserver again."""
+        env.get_pageserver(self.pageserver_id).start()
+
+
+class PageserverFailpoint(Failure):
+    """Failure that configures a named failpoint on a pageserver through its HTTP client."""
+
+    def __init__(self, failpoint, pageserver_id):
+        self.pageserver_id = pageserver_id
+        self.failpoint = failpoint
+
+    def _configure(self, env: NeonEnv, action: str):
+        # Shared by apply/clear: set this failpoint to the given action string
+        client = env.get_pageserver(self.pageserver_id).http_client()
+        client.configure_failpoints((self.failpoint, action))
+
+    def apply(self, env: NeonEnv):
+        """Set the failpoint's action to `return(1)`."""
+        self._configure(env, "return(1)")
+
+    def clear(self, env: NeonEnv):
+        """Set the failpoint's action back to `off`."""
+        self._configure(env, "off")
+
+
+def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]:
+    """
+    Group tenants by the node on which the storage controller observes them attached.
+
+    Only observed locations whose config mode is "AttachedSingle" are counted;
+    locations with no state or no config are skipped.
+    """
+    shards = env.storage_controller.tenant_list()
+    attached_by_node: dict[int, list[TenantId]] = {}
+
+    for shard in shards:
+        for raw_node_id, observed in shard["observed"]["locations"].items():
+            if observed is None or "conf" not in observed:
+                continue
+
+            conf = observed["conf"]
+            if conf is None or conf["mode"] != "AttachedSingle":
+                continue
+
+            # Node ids arrive as JSON object keys (strings), hence the int() conversion
+            attached_by_node.setdefault(int(raw_node_id), []).append(
+                TenantId(shard["tenant_shard_id"])
+            )
+
+    return attached_by_node
+
+
+@pytest.mark.parametrize(
+    "failure",
+    [
+        NodeStop(pageserver_id=1, immediate=False),
+        NodeStop(pageserver_id=1, immediate=True),
+        PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"),
+    ],
+)
+def test_sharding_service_heartbeats(
+    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure
+):
+    """
+    Check that heartbeats drive node availability and tenant placement across a pageserver failure.
+
+    With two pageservers and one tenant attached to each, inject `failure` into one pageserver,
+    then expect that node to be marked Offline and its tenant to move to the other node. After
+    clearing the failure, expect the node to return to Active and a newly created tenant to be
+    placed on it.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    # Initially we have two online pageservers
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 2
+    assert all([n["availability"] == "Active" for n in nodes])
+
+    # ... then we create two tenants and write some data into them
+    def create_tenant(tid: TenantId):
+        env.storage_controller.tenant_create(tid)
+
+        branch_name = "main"
+        env.neon_cli.create_timeline(
+            branch_name,
+            tenant_id=tid,
+        )
+
+        with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
+
+    tenant_ids = [TenantId.generate(), TenantId.generate()]
+    for tid in tenant_ids:
+        create_tenant(tid)
+
+    # ... expecting that each tenant will be placed on a different node
+    def tenants_placed():
+        node_to_tenants = build_node_to_tenants_map(env)
+        log.info(f"{node_to_tenants=}")
+
+        # Check that all the tenants have been attached
+        assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids)
+        # Check that each node got one tenant
+        assert all((len(ts) == 1 for ts in node_to_tenants.values()))
+
+    wait_until(10, 1, tenants_placed)
+
+    # ... then we apply the failure
+    offline_node_id = failure.pageserver_id
+    # The surviving node is picked from the id range 1..len(env.pageservers);
+    # this assumes pageserver ids are numbered consecutively from 1.
+    online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop()
+    env.get_pageserver(offline_node_id).allowed_errors.append(
+        # In the case of the failpoint failure, the impacted pageserver
+        # still believes it has the tenant attached since location
+        # config calls into it will fail due to being marked offline.
+        ".*Dropped remote consistent LSN updates.*",
+    )
+
+    failure.apply(env)
+
+    # ... expecting the heartbeats to mark it offline
+    def node_offline():
+        nodes = env.storage_controller.node_list()
+        log.info(f"{nodes=}")
+        target = next(n for n in nodes if n["id"] == offline_node_id)
+        assert target["availability"] == "Offline"
+
+    # A node is considered offline if the last successful heartbeat
+    # was more than 10 seconds ago (hardcoded in the storage controller).
+    wait_until(20, 1, node_offline)
+
+    # .. expecting the tenant on the offline node to be migrated
+    def tenant_migrated():
+        node_to_tenants = build_node_to_tenants_map(env)
+        log.info(f"{node_to_tenants=}")
+        assert set(node_to_tenants[online_node_id]) == set(tenant_ids)
+
+    wait_until(10, 1, tenant_migrated)
+
+    # ... then we clear the failure
+    failure.clear(env)
+
+    # ... expecting the offline node to become active again
+    def node_online():
+        nodes = env.storage_controller.node_list()
+        target = next(n for n in nodes if n["id"] == offline_node_id)
+        assert target["availability"] == "Active"
+
+    wait_until(10, 1, node_online)
+
+    # NOTE(review): fixed 5s pause with no stated reason; presumably it lets the
+    # storage controller settle after the node comes back - confirm.
+    time.sleep(5)
+
+    # ... then we create a new tenant
+    tid = TenantId.generate()
+    env.storage_controller.tenant_create(tid)
+
+    # ... expecting it to be placed on the node that just came back online
+    tenants = env.storage_controller.tenant_list()
+    newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid))
+    locations = list(newest_tenant["observed"]["locations"].keys())
+    locations = [int(node_id) for node_id in locations]
+    assert locations == [offline_node_id]
+
+    # ... expecting the storage controller to reach a consistent state
+    def storage_controller_consistent():
+        env.storage_controller.consistency_check()
+
+    wait_until(10, 1, storage_controller_consistent)