mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
storage controller: improved handling of node availability on restart (#6658)
- Automatically set a node's availability to Active if it is responsive in startup_reconcile - Impose a 5s timeout on the HTTP request that lists location conf, so that an unresponsive node can't hang it for minutes - Do several retries if the request fails with a retryable error, to be tolerant of concurrent pageserver & storage controller restarts - Add a readiness hook for use with k8s so that we can tell when the startup reconciliation is done and the service is fully ready to do work. - Add /metrics to the list of un-authenticated endpoints (this is unrelated, but we're touching the line in this PR already, and it fixes auth error spam in the deployed container.) - A test for the above. Closes: #6670
This commit is contained in:
@@ -1949,6 +1949,15 @@ class NeonAttachmentService:
|
||||
|
||||
return headers
|
||||
|
||||
def ready(self) -> bool:
    """
    Probe the attachment service's `/ready` endpoint.

    Returns True when the endpoint answers 200 and False when it answers 503.
    Any other status code raises RuntimeError.
    """
    resp = self.request("GET", f"{self.env.attachment_service_api}/ready")

    if resp.status_code == 200:
        return True

    if resp.status_code == 503:
        return False

    # Neither of the two statuses handled above: surface it rather than guessing.
    raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint")
|
||||
|
||||
def attach_hook_issue(
|
||||
self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
|
||||
) -> int:
|
||||
|
||||
@@ -128,6 +128,38 @@ def test_sharding_service_smoke(
|
||||
assert counts[env.pageservers[2].id] == tenant_shard_count // 2
|
||||
|
||||
|
||||
def test_node_status_after_restart(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Restart the attachment service while one of two pageservers is stopped.

    Checks that the readiness endpoint goes from not-ready to ready, that both
    nodes are still listed afterwards, and that a tenant can still be created.
    """
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()
    attachment_service = env.attachment_service

    # Both pageservers are registered before anything is stopped.
    assert len(attachment_service.node_list()) == 2

    # Stop one pageserver, then bounce the attachment service.
    env.pageservers[1].stop()

    attachment_service.stop()
    attachment_service.start()

    # Right after start the readiness check is expected to fail, because the
    # service is still trying to connect to the stopped node.
    assert attachment_service.ready() is False

    def is_ready():
        assert attachment_service.ready() is True

    # Poll until the readiness endpoint reports ready.
    wait_until(30, 1, is_ready)

    # The node list still has both entries after the restart.
    assert len(attachment_service.node_list()) == 2

    # Tenant creation should still succeed: the pageserver that stayed online
    # should have had its availability state set to Active.
    attachment_service.tenant_create(TenantId.generate())
|
||||
|
||||
|
||||
def test_sharding_service_passthrough(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
|
||||
Reference in New Issue
Block a user