mirror of https://github.com/neondatabase/neon.git
storage controller: improved handling of node availability on restart (#6658)
- Automatically set a node's availability to Active if it is responsive during startup_reconcile.
- Impose a 5s timeout on the HTTP request that lists location configs, so that an unresponsive node can't hang startup for minutes.
- Retry that request several times when it fails with a retryable error, to be tolerant of concurrent pageserver & storage controller restarts.
- Add a readiness hook for use with k8s, so that we can tell when the startup reconciliation is done and the service is fully ready to do work.
- Add /metrics to the list of un-authenticated endpoints (this is unrelated, but we're touching the line in this PR anyway, and it fixes auth error spam in the deployed container).
- Add a test for the above.

Closes: #6670
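For orientation, the core of the change is the pattern sketched below: wrap the pageserver management client in a reqwest client with a 5-second timeout, and drive the location-config listing through utils::backoff::retry with a classifier that only retries transport errors and 408/503/504 responses. This is a condensed sketch, not the committed code; it assumes the mgmt_api::Client::from_client and backoff::retry signatures shown in the diff below, and the function name, import paths, and log messages here are illustrative.

use std::time::Duration;

use pageserver_client::mgmt_api;
use reqwest::StatusCode; // assumed import; the diff uses `StatusCode` unqualified
use tokio_util::sync::CancellationToken;
use utils::backoff;

// Hypothetical helper illustrating the timeout + retry pattern introduced in the diff below.
async fn scan_node(base_url: String, jwt: Option<&str>, cancel: &CancellationToken) {
    // Client-side timeout: an unresponsive node costs at most ~5s per attempt instead
    // of hanging startup_reconcile for minutes.
    let http_client = reqwest::ClientBuilder::new()
        .timeout(Duration::from_secs(5))
        .build()
        .expect("Failed to construct HTTP client");
    let client = mgmt_api::Client::from_client(http_client, base_url, jwt);

    // Transport errors and 503/504/408 are retryable (the pageserver may be restarting
    // concurrently); any other API error is treated as fatal for this scan.
    fn is_fatal(e: &mgmt_api::Error) -> bool {
        use mgmt_api::Error::*;
        match e {
            ReceiveBody(_) | ReceiveErrorBody(_) => false,
            ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
            | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
            | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
            ApiError(_, _) => true,
        }
    }

    match backoff::retry(
        || client.list_location_config(),
        is_fatal,
        1, // retry tuning as used in the diff below
        5,
        "Location config listing",
        cancel,
    )
    .await
    {
        // None means the cancellation token fired, i.e. shutdown during startup_reconcile.
        None => tracing::info!("Shutdown during startup_reconcile"),
        Some(Err(e)) => tracing::warn!("Could not contact pageserver ({e})"),
        Some(Ok(listing)) => {
            tracing::info!("Received {} shard statuses", listing.tenant_shards.len())
        }
    }
}

Nodes whose scan succeeds are recorded in nodes_online and flipped to NodeAvailability::Active, and once the whole startup pass completes the new /ready endpoint starts returning 200, which is what the kubernetes readiness probe keys on.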
@@ -42,7 +42,7 @@ pub struct HttpState {
 impl HttpState {
     pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
-        let allowlist_routes = ["/status"]
+        let allowlist_routes = ["/status", "/ready", "/metrics"]
             .iter()
             .map(|v| v.parse().unwrap())
             .collect::<Vec<_>>();
@@ -325,6 +325,17 @@ async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError>
     json_response(StatusCode::OK, ())
 }
 
+/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
+/// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe.
+async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+    if state.service.startup_complete.is_ready() {
+        json_response(StatusCode::OK, ())
+    } else {
+        json_response(StatusCode::SERVICE_UNAVAILABLE, ())
+    }
+}
+
 impl From<ReconcileError> for ApiError {
     fn from(value: ReconcileError) -> Self {
         ApiError::Conflict(format!("Reconciliation error: {}", value))
@@ -380,6 +391,7 @@ pub fn make_router(
         .data(Arc::new(HttpState::new(service, auth)))
         // Non-prefixed generic endpoints (status, metrics)
         .get("/status", |r| request_span(r, handle_status))
+        .get("/ready", |r| request_span(r, handle_ready))
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
         .post("/upcall/v1/re-attach", |r| {
             request_span(r, handle_re_attach)
@@ -1,6 +1,6 @@
 use std::{
     cmp::Ordering,
-    collections::{BTreeMap, HashMap},
+    collections::{BTreeMap, HashMap, HashSet},
     str::FromStr,
     sync::Arc,
     time::{Duration, Instant},
@@ -31,6 +31,7 @@ use pageserver_api::{
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
 use utils::{
+    backoff,
     completion::Barrier,
     generation::Generation,
     http::error::ApiError,
@@ -150,31 +151,71 @@ impl Service {
         // indeterminate, same as in [`ObservedStateLocation`])
         let mut observed = HashMap::new();
-
-        let nodes = {
-            let locked = self.inner.read().unwrap();
-            locked.nodes.clone()
-        };
+        let mut nodes_online = HashSet::new();
+
+        // TODO: give Service a cancellation token for clean shutdown
+        let cancel = CancellationToken::new();
 
         // TODO: issue these requests concurrently
-        for node in nodes.values() {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+        {
+            let nodes = {
+                let locked = self.inner.read().unwrap();
+                locked.nodes.clone()
+            };
+            for node in nodes.values() {
+                let http_client = reqwest::ClientBuilder::new()
+                    .timeout(Duration::from_secs(5))
+                    .build()
+                    .expect("Failed to construct HTTP client");
+                let client = mgmt_api::Client::from_client(
+                    http_client,
+                    node.base_url(),
+                    self.config.jwt_token.as_deref(),
+                );
 
-            tracing::info!("Scanning shards on node {}...", node.id);
-            match client.list_location_config().await {
-                Err(e) => {
-                    tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
-                    // pageserver is being restarted at the same time as we are
+                fn is_fatal(e: &mgmt_api::Error) -> bool {
+                    use mgmt_api::Error::*;
+                    match e {
+                        ReceiveBody(_) | ReceiveErrorBody(_) => false,
+                        ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
+                        | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
+                        | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
+                        ApiError(_, _) => true,
+                    }
+                }
-                Ok(listing) => {
-                    tracing::info!(
-                        "Received {} shard statuses from pageserver {}, setting it to Active",
-                        listing.tenant_shards.len(),
-                        node.id
-                    );
-
-                    for (tenant_shard_id, conf_opt) in listing.tenant_shards {
-                        observed.insert(tenant_shard_id, (node.id, conf_opt));
+                let list_response = backoff::retry(
+                    || client.list_location_config(),
+                    is_fatal,
+                    1,
+                    5,
+                    "Location config listing",
+                    &cancel,
+                )
+                .await;
+                let Some(list_response) = list_response else {
+                    tracing::info!("Shutdown during startup_reconcile");
+                    return;
+                };
+
+                tracing::info!("Scanning shards on node {}...", node.id);
+                match list_response {
+                    Err(e) => {
+                        tracing::warn!("Could not contact pageserver {} ({e})", node.id);
+                        // TODO: be more tolerant, do some retries, in case
+                        // pageserver is being restarted at the same time as we are
+                    }
+                    Ok(listing) => {
+                        tracing::info!(
+                            "Received {} shard statuses from pageserver {}, setting it to Active",
+                            listing.tenant_shards.len(),
+                            node.id
+                        );
+                        nodes_online.insert(node.id);
+
+                        for (tenant_shard_id, conf_opt) in listing.tenant_shards {
+                            observed.insert(tenant_shard_id, (node.id, conf_opt));
+                        }
+                    }
+                }
+            }
         }
@@ -185,8 +226,19 @@ impl Service {
         let mut compute_notifications = Vec::new();
 
         // Populate intent and observed states for all tenants, based on reported state on pageservers
-        let shard_count = {
+        let (shard_count, nodes) = {
             let mut locked = self.inner.write().unwrap();
+
+            // Mark nodes online if they responded to us: nodes are offline by default after a restart.
+            let mut nodes = (*locked.nodes).clone();
+            for (node_id, node) in nodes.iter_mut() {
+                if nodes_online.contains(node_id) {
+                    node.availability = NodeAvailability::Active;
+                }
+            }
+            locked.nodes = Arc::new(nodes);
+            let nodes = locked.nodes.clone();
+
             for (tenant_shard_id, (node_id, observed_loc)) in observed {
                 let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
                     cleanup.push((tenant_shard_id, node_id));
@@ -218,7 +270,7 @@ impl Service {
                 }
             }
 
-            locked.tenants.len()
+            (locked.tenants.len(), nodes)
         };
 
         // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
@@ -279,9 +331,8 @@ impl Service {
         let stream = futures::stream::iter(compute_notifications.into_iter())
             .map(|(tenant_shard_id, node_id)| {
                 let compute_hook = compute_hook.clone();
+                let cancel = cancel.clone();
                 async move {
-                    // TODO: give Service a cancellation token for clean shutdown
-                    let cancel = CancellationToken::new();
                     if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                         tracing::error!(
                             tenant_shard_id=%tenant_shard_id,
@@ -387,7 +438,7 @@ impl Service {
             ))),
             config,
             persistence,
-            startup_complete,
+            startup_complete: startup_complete.clone(),
         });
 
         let result_task_this = this.clone();
@@ -984,6 +1035,10 @@ impl Service {
             }
         };
 
+        // TODO: if we timeout/fail on reconcile, we should still succeed this request,
+        // because otherwise a broken compute hook causes a feedback loop where
+        // location_config returns 500 and gets retried forever.
+
        if let Some(create_req) = maybe_create {
             let create_resp = self.tenant_create(create_req).await?;
             result.shards = create_resp
@@ -27,6 +27,11 @@ impl Barrier {
             b.wait().await
         }
     }
+
+    /// Return true if a call to wait() would complete immediately
+    pub fn is_ready(&self) -> bool {
+        futures::future::FutureExt::now_or_never(self.0.wait()).is_some()
+    }
 }
 
 impl PartialEq for Barrier {
@@ -56,10 +56,18 @@ pub enum ForceAwaitLogicalSize {
 
 impl Client {
     pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
+        Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
+    }
+
+    pub fn from_client(
+        client: reqwest::Client,
+        mgmt_api_endpoint: String,
+        jwt: Option<&str>,
+    ) -> Self {
         Self {
             mgmt_api_endpoint,
             authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
-            client: reqwest::Client::new(),
+            client,
         }
     }
@@ -1949,6 +1949,15 @@ class NeonAttachmentService:
 
         return headers
 
+    def ready(self) -> bool:
+        resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
+        if resp.status_code == 503:
+            return False
+        elif resp.status_code == 200:
+            return True
+        else:
+            raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint")
+
     def attach_hook_issue(
         self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
     ) -> int:
@@ -128,6 +128,38 @@ def test_sharding_service_smoke(
     assert counts[env.pageservers[2].id] == tenant_shard_count // 2
 
 
+def test_node_status_after_restart(
+    neon_env_builder: NeonEnvBuilder,
+):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # Initially we have two online pageservers
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 2
+
+    env.pageservers[1].stop()
+
+    env.attachment_service.stop()
+    env.attachment_service.start()
+
+    # Initially the readiness check should fail because we're trying to connect to the offline node
+    assert env.attachment_service.ready() is False
+
+    def is_ready():
+        assert env.attachment_service.ready() is True
+
+    wait_until(30, 1, is_ready)
+
+    # We loaded nodes from the database on restart
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 2
+
+    # We should still be able to create a tenant, because the pageserver which is still online
+    # should have had its availability state set to Active.
+    env.attachment_service.tenant_create(TenantId.generate())
+
+
 def test_sharding_service_passthrough(
     neon_env_builder: NeonEnvBuilder,
 ):