storage controller: add node deletion API (#8226)

## Problem In anticipation of later adding a really nice drain+delete API, I initially only added an intentionally basic `/drop` API that is just about usable for deleting nodes in a pinch, but requires some ugly storage controller restarts to persuade it to restart secondaries. ## Summary of changes I started making a few tiny fixes, and ended up writing the delete API... - Quality of life nit: ordering of node + tenant listings in storcon_cli - Papercut: Fix the attach_hook using the wrong operation type for reporting slow locks - Make Service::spawn tolerate `generation_pageserver` columns that point to nonexistent node IDs. I started out thinking of this as a general resilience thing, but when implementing the delete API I realized it was actually a legitimate end state after the delete API is called (as that API doesn't wait for all reconciles to succeed). - Add a `DELETE` API for nodes, which does not gracefully drain, but does reschedule everything. This becomes safe to use when the system is in any state, but will incur availability gaps for any tenants that weren't already live-migrated away. If tenants have already been drained, this becomes a totally clean + safe way to decom a node. - Add a test and a storcon_cli wrapper for it This is meant to be a robust initial API that lets us remove nodes without doing ugly things like restarting the storage controller -- it's not quite a totally graceful node-draining routine yet. There's more work in https://github.com/neondatabase/neon/issues/8333 to get to our end-end state.
2026-05-23 08:00:37 +00:00 · 2024-07-11 17:05:47 +01:00
parent 0159ae9536
commit 814c8e8f68
7 changed files with 277 additions and 14 deletions
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2287,6 +2287,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

+    def node_delete(self, node_id):
+        log.info(f"node_delete({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
    def node_drain(self, node_id):
        log.info(f"node_drain({node_id})")
        self.request(
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -93,6 +93,29 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
 )


+def fixup_storage_controller(env: NeonEnv):
+    """
+    After importing a repo_dir, we need to massage the storage controller's state a bit: it will have
+    initially started up with no nodes, but some tenants, and thereby those tenants won't be scheduled
+    anywhere.
+
+    After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get
+    the storage controller into a good state.
+
+    This function should go away once compat tests carry the controller database in their snapshots, so
+    that the controller properly remembers nodes between creating + restoring the snapshot.
+    """
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Tenant shard .+ references non-existent node.*",
+            ".*Failed to schedule tenant .+ at startup.*",
+        ]
+    )
+    env.storage_controller.stop()
+    env.storage_controller.start()
+    env.storage_controller.reconcile_until_idle()
+
+
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
 def test_create_snapshot(
@@ -175,6 +198,7 @@ def test_backward_compatibility(
        neon_env_builder.num_safekeepers = 3
        env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
        neon_env_builder.start()
+        fixup_storage_controller(env)

        check_neon_works(
            env,
@@ -263,6 +287,7 @@ def test_forward_compatibility(
        assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)

        neon_env_builder.start()
+        fixup_storage_controller(env)

        # ensure the specified pageserver is running
        assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1611,3 +1611,91 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
    env.storage_controller.cancel_node_drain(ps_id_to_drain)

    env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2)
+
+
+@pytest.mark.parametrize("while_offline", [True, False])
+def test_storage_controller_node_deletion(
+    neon_env_builder: NeonEnvBuilder,
+    compute_reconfigure_listener: ComputeReconfigure,
+    while_offline: bool,
+):
+    """
+    Test that deleting a node works & properly reschedules everything that was on the node.
+    """
+    neon_env_builder.num_pageservers = 3
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_count = 10
+    shard_count_per_tenant = 8
+    tenant_ids = []
+    for _ in range(0, tenant_count):
+        tid = TenantId.generate()
+        tenant_ids.append(tid)
+        env.neon_cli.create_tenant(
+            tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
+        )
+
+    victim = env.pageservers[-1]
+
+    # The procedure a human would follow is:
+    # 1. Mark pageserver scheduling=pause
+    # 2. Mark pageserver availability=offline to trigger migrations away from it
+    # 3. Wait for attachments to all move elsewhere
+    # 4. Call deletion API
+    # 5. Stop the node.
+
+    env.storage_controller.node_configure(victim.id, {"scheduling": "Pause"})
+
+    if while_offline:
+        victim.stop(immediate=True)
+        env.storage_controller.node_configure(victim.id, {"availability": "Offline"})
+
+        def assert_shards_migrated():
+            counts = get_node_shard_counts(env, tenant_ids)
+            elsewhere = sum(v for (k, v) in counts.items() if k != victim.id)
+            log.info(f"Shards on nodes other than on victim: {elsewhere}")
+            assert elsewhere == tenant_count * shard_count_per_tenant
+
+        wait_until(30, 1, assert_shards_migrated)
+
+    log.info(f"Deleting pageserver {victim.id}")
+    env.storage_controller.node_delete(victim.id)
+
+    if not while_offline:
+
+        def assert_victim_evacuated():
+            counts = get_node_shard_counts(env, tenant_ids)
+            count = counts[victim.id]
+            log.info(f"Shards on node {victim.id}: {count}")
+            assert count == 0
+
+        wait_until(30, 1, assert_victim_evacuated)
+
+    # The node should be gone from the list API
+    assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
+
+    # No tenants should refer to the node in their intent
+    for tenant_id in tenant_ids:
+        describe = env.storage_controller.tenant_describe(tenant_id)
+        for shard in describe["shards"]:
+            assert shard["node_attached"] != victim.id
+            assert victim.id not in shard["node_secondary"]
+
+    # Reconciles running during deletion should all complete
+    # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting
+    # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3
+    # test that hasn't uploaded any heatmaps for secondaries.
+    # In the interim, just do a reconcile_all to enable the consistency check.
+    # env.storage_controller.reconcile_until_idle()
+    env.storage_controller.reconcile_all()
+
+    # Controller should pass its own consistency checks
+    env.storage_controller.consistency_check()
+
+    # The node should stay gone across a restart
+    env.storage_controller.stop()
+    env.storage_controller.start()
+    assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
+    env.storage_controller.reconcile_all()  # FIXME: workaround for optimizations happening on startup, see FIXME above.
+    env.storage_controller.consistency_check()