storage controller: tenant scheduling policy (#7262)

## Problem

In the event of bugs in scheduling or reconciliation, we need to be
able to switch them off at per-tenant granularity.

This is intended to mitigate risk of issues with
https://github.com/neondatabase/neon/pull/7181, which makes scheduling
more involved.

Closes: #7103

## Summary of changes

- Introduce a scheduling policy per tenant, with API to set it
- Refactor persistent.rs helpers for updating tenants to be more general
- Add tests
This commit is contained in:
John Spray
2024-03-28 14:19:25 +00:00
committed by GitHub
parent 5928f6709c
commit 6633332e67
10 changed files with 448 additions and 79 deletions

View File

@@ -2116,6 +2116,7 @@ class NeonStorageController(MetricsGetter):
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
tenant_config: Optional[Dict[Any, Any]] = None,
placement_policy: Optional[str] = None,
):
"""
Use this rather than pageserver_api() when you need to include shard parameters
@@ -2135,6 +2136,8 @@ class NeonStorageController(MetricsGetter):
for k, v in tenant_config.items():
body[k] = v
body["placement_policy"] = placement_policy
response = self.request(
"POST",
f"{self.env.storage_controller_api}/v1/tenant",
@@ -2193,6 +2196,34 @@ class NeonStorageController(MetricsGetter):
log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]):
    """
    PUT `body` as JSON to the storage controller's per-tenant policy endpoint.

    The request carries headers built for the ADMIN token scope. The response
    returned by self.request() is discarded.

    :param tenant_id: tenant whose policy is being updated (interpolated into the URL).
    :param body: JSON-serializable payload sent as the request body.
    """
    log.info(f"tenant_policy_update({tenant_id}, {body})")

    policy_url = f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy"
    admin_headers = self.headers(TokenScope.ADMIN)
    self.request("PUT", policy_url, json=body, headers=admin_headers)
def reconcile_all(self):
    """
    POST to the storage controller's debug `reconcile_all` endpoint.

    The request carries headers built for the ADMIN token scope.

    :return: the decoded JSON response body, which is logged as the number of
        shards that were waited for.
    """
    endpoint = f"{self.env.storage_controller_api}/debug/v1/reconcile_all"
    admin_headers = self.headers(TokenScope.ADMIN)

    response = self.request("POST", endpoint, headers=admin_headers)
    response.raise_for_status()

    n = response.json()
    log.info(f"reconcile_all waited for {n} shards")
    return n
def reconcile_until_idle(self, timeout_secs=30):
    """
    Call reconcile_all() repeatedly until it returns zero.

    :param timeout_secs: how long, in seconds, to keep retrying while
        reconcile_all() still returns a non-zero count.
    :raises RuntimeError: if a round returns a non-zero count after
        `timeout_secs` have elapsed since this method was entered.
    """
    # Monotonic clock: elapsed-time measurement must not be affected by
    # wall-clock adjustments made while we are waiting.
    deadline = time.monotonic() + timeout_secs

    while self.reconcile_all() > 0:
        # The deadline is only checked after a round that still reported work.
        # A round that returns zero means we are idle, so it ends the loop
        # successfully even if it finished after the deadline.
        if time.monotonic() > deadline:
            raise RuntimeError("Timeout in reconcile_until_idle")
def consistency_check(self):
"""
Throw an exception if the service finds any inconsistencies in its state

View File

@@ -1015,3 +1015,98 @@ def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
)
assert reconciles_after_restart == reconciles_before_restart
def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
    """
    Exercise the per-tenant scheduling policy as an emergency off-switch.

    With the pageserver stopped, attaching a tenant makes reconciles fail.
    Setting the tenant's scheduling policy to "Stop" must make further
    reconcile requests skip it without new errors, and setting it back to
    "Active" (with the pageserver running again) must let reconciliation succeed.
    """
    env = neon_env_builder.init_configs()
    env.start()

    tenant_id = TenantId.generate()
    controller = env.storage_controller

    expected_log_noise = [
        # Reconcile errors are provoked on purpose below
        ".*Reconcile error.*",
        # Emitted while a scheduling policy is in effect
        ".*Scheduling is disabled by policy.*",
        ".*Skipping reconcile for policy.*",
        # Emitted while the node is offline
        ".*Call to node .* management API .* failed",
    ]
    controller.allowed_errors.extend(expected_log_noise)

    # With the pageserver down, reconciles cannot complete
    env.pageserver.stop()
    controller.tenant_create(tenant_id, placement_policy="Detached")

    # Ask for the tenant to be attached: reconciles should now start failing
    controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}})

    def completed_reconciles(status: str) -> int:
        """Read the reconcile-complete counter for `status`, treating a missing value as 0."""
        value = controller.get_metric_value(
            "storage_controller_reconcile_complete_total", filter={"status": status}
        )
        return int(value or 0)

    def assert_completed_gt(status: str, floor) -> int:
        """Assert the counter for `status` exceeds `floor`, and return the counter."""
        count = completed_reconciles(status)
        assert count > floor
        return count

    error_count = wait_until(10, 1, lambda: assert_completed_gt("error", 0))

    # An explicit reconcile request should fail too, and bump the error counter
    with pytest.raises(StorageControllerApiException):
        controller.reconcile_all()
    error_count = wait_until(10, 1, lambda: assert_completed_gt("error", error_count))

    # Switch reconciles off for this tenant
    controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})

    # The tenant is now silently skipped: no exception, no additional errors
    controller.reconcile_all()
    assert completed_reconciles("error") == error_count

    # Bring the pageserver back and switch reconciles on again
    env.pageserver.start()
    controller.tenant_policy_update(tenant_id, {"scheduling": "Active"})

    # A reconcile should now complete successfully
    wait_until(10, 1, lambda: assert_completed_gt("ok", 0))

    # ... leaving the tenant attached on the pageserver
    shard_locations = env.pageserver.http_client().tenant_list_locations()["tenant_shards"]
    assert len(shard_locations) == 1