storage controller: add node deletion API (#8226)

## Problem

In anticipation of later adding a really nice drain+delete API, I
initially only added an intentionally basic `/drop` API that is just
about usable for deleting nodes in a pinch, but requires some ugly
storage controller restarts to persuade it to restart secondaries.

## Summary of changes

I started making a few tiny fixes, and ended up writing the delete
API...

- Quality of life nit: ordering of node + tenant listings in storcon_cli
- Papercut: Fix the attach_hook using the wrong operation type for
reporting slow locks
- Make Service::spawn tolerate `generation_pageserver` columns that
point to nonexistent node IDs. I started out thinking of this as a
general resilience thing, but when implementing the delete API I
realized it was actually a legitimate end state after the delete API is
called (as that API doesn't wait for all reconciles to succeed).
- Add a `DELETE` API for nodes, which does not gracefully drain, but
does reschedule everything. This becomes safe to use when the system is
in any state, but will incur availability gaps for any tenants that
weren't already live-migrated away. If tenants have already been
drained, this becomes a totally clean + safe way to decom a node.
- Add a test and a storcon_cli wrapper for it

This is meant to be a robust initial API that lets us remove nodes
without doing ugly things like restarting the storage controller -- it's
not quite a totally graceful node-draining routine yet. There's more
work in https://github.com/neondatabase/neon/issues/8333 to get to our
end-end state.
This commit is contained in:
John Spray
2024-07-11 17:05:47 +01:00
committed by GitHub
parent 0159ae9536
commit 814c8e8f68
7 changed files with 277 additions and 14 deletions

View File

@@ -2287,6 +2287,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
self.request(
"DELETE",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
def node_drain(self, node_id):
log.info(f"node_drain({node_id})")
self.request(

View File

@@ -93,6 +93,29 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
)
def fixup_storage_controller(env: NeonEnv):
"""
After importing a repo_dir, we need to massage the storage controller's state a bit: it will have
initially started up with no nodes, but some tenants, and thereby those tenants won't be scheduled
anywhere.
After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get
the storage controller into a good state.
This function should go away once compat tests carry the controller database in their snapshots, so
that the controller properly remembers nodes between creating + restoring the snapshot.
"""
env.storage_controller.allowed_errors.extend(
[
".*Tenant shard .+ references non-existent node.*",
".*Failed to schedule tenant .+ at startup.*",
]
)
env.storage_controller.stop()
env.storage_controller.start()
env.storage_controller.reconcile_until_idle()
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
def test_create_snapshot(
@@ -175,6 +198,7 @@ def test_backward_compatibility(
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
neon_env_builder.start()
fixup_storage_controller(env)
check_neon_works(
env,
@@ -263,6 +287,7 @@ def test_forward_compatibility(
assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
neon_env_builder.start()
fixup_storage_controller(env)
# ensure the specified pageserver is running
assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)

View File

@@ -1611,3 +1611,91 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
env.storage_controller.cancel_node_drain(ps_id_to_drain)
env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2)
@pytest.mark.parametrize("while_offline", [True, False])
def test_storage_controller_node_deletion(
neon_env_builder: NeonEnvBuilder,
compute_reconfigure_listener: ComputeReconfigure,
while_offline: bool,
):
"""
Test that deleting a node works & properly reschedules everything that was on the node.
"""
neon_env_builder.num_pageservers = 3
env = neon_env_builder.init_configs()
env.start()
tenant_count = 10
shard_count_per_tenant = 8
tenant_ids = []
for _ in range(0, tenant_count):
tid = TenantId.generate()
tenant_ids.append(tid)
env.neon_cli.create_tenant(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)
victim = env.pageservers[-1]
# The procedure a human would follow is:
# 1. Mark pageserver scheduling=pause
# 2. Mark pageserver availability=offline to trigger migrations away from it
# 3. Wait for attachments to all move elsewhere
# 4. Call deletion API
# 5. Stop the node.
env.storage_controller.node_configure(victim.id, {"scheduling": "Pause"})
if while_offline:
victim.stop(immediate=True)
env.storage_controller.node_configure(victim.id, {"availability": "Offline"})
def assert_shards_migrated():
counts = get_node_shard_counts(env, tenant_ids)
elsewhere = sum(v for (k, v) in counts.items() if k != victim.id)
log.info(f"Shards on nodes other than on victim: {elsewhere}")
assert elsewhere == tenant_count * shard_count_per_tenant
wait_until(30, 1, assert_shards_migrated)
log.info(f"Deleting pageserver {victim.id}")
env.storage_controller.node_delete(victim.id)
if not while_offline:
def assert_victim_evacuated():
counts = get_node_shard_counts(env, tenant_ids)
count = counts[victim.id]
log.info(f"Shards on node {victim.id}: {count}")
assert count == 0
wait_until(30, 1, assert_victim_evacuated)
# The node should be gone from the list API
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
# No tenants should refer to the node in their intent
for tenant_id in tenant_ids:
describe = env.storage_controller.tenant_describe(tenant_id)
for shard in describe["shards"]:
assert shard["node_attached"] != victim.id
assert victim.id not in shard["node_secondary"]
# Reconciles running during deletion should all complete
# FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting
# in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3
# test that hasn't uploaded any heatmaps for secondaries.
# In the interim, just do a reconcile_all to enable the consistency check.
# env.storage_controller.reconcile_until_idle()
env.storage_controller.reconcile_all()
# Controller should pass its own consistency checks
env.storage_controller.consistency_check()
# The node should stay gone across a restart
env.storage_controller.stop()
env.storage_controller.start()
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above.
env.storage_controller.consistency_check()