Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-06 21:12:55 +00:00
storage controller: be more tolerant of control plane blocking notifications (#7268)
## Problem

The control plane can deadlock if it calls into a function that requires reconciliation to complete while simultaneously refusing compute notification hook API calls.

## Summary of changes

- Fail faster in the notify path on 423 (Locked) errors: these were originally expected to be transient, but in practice a 423 more commonly means an operation is blocking on the current API call rather than something happening in the background.
- In ensure_attached, relax the condition for spawning a reconciler: rather than going straight to the general maybe_reconcile path, do a pre-check that skips reconciliation if the shard already appears to be attached. This avoids doing work when the tenant is attached but still dirty from a reconciliation point of view, e.g. due to a failed compute notification.
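The notify-path half of the change is easiest to read as a retry-classification decision: a 423 from the control plane now counts as fatal to the notification attempt, so the surrounding retry loop gives up immediately instead of sleeping and looping while the control plane may be waiting on this very reconcile. Below is a minimal, self-contained sketch of that pattern; the `NotifyError` enum, `retry_notify` helper, and delays are hypothetical stand-ins, not the storage controller's actual `NotifyError` or `backoff::retry`.

```rust
// Sketch only: hypothetical error classes and retry loop illustrating the
// "treat Busy as fatal" pattern applied by this commit.
#[derive(Debug)]
enum NotifyError {
    Busy,     // control plane returned 423 Locked: likely blocked on this very operation
    SlowDown, // control plane is overloaded: worth retrying after a delay
}

fn retry_notify<F>(mut attempt: F, max_retries: u32) -> Result<(), NotifyError>
where
    F: FnMut() -> Result<(), NotifyError>,
{
    let mut retries = 0;
    loop {
        match attempt() {
            Ok(()) => return Ok(()),
            // Fatal class: give up immediately so the calling reconciler can fail
            // and release whatever the control plane is waiting on.
            Err(e @ NotifyError::Busy) => return Err(e),
            // Transient class: back off and try again, up to a limit.
            Err(NotifyError::SlowDown) if retries < max_retries => {
                retries += 1;
                std::thread::sleep(std::time::Duration::from_millis(100 * u64::from(retries)));
            }
            Err(e) => return Err(e),
        }
    }
}

fn main() {
    // With Busy treated as fatal, a locked control plane surfaces an error after one attempt...
    assert!(matches!(
        retry_notify(|| Err(NotifyError::Busy), 3),
        Err(NotifyError::Busy)
    ));

    // ...while transient slow-downs are still retried until they succeed.
    let mut calls = 0;
    let outcome = retry_notify(
        || {
            calls += 1;
            if calls < 3 {
                Err(NotifyError::SlowDown)
            } else {
                Ok(())
            }
        },
        5,
    );
    assert!(outcome.is_ok());
}
```

In the diff below the same idea shows up twice: the StatusCode::LOCKED arm stops sleeping on BUSY_DELAY, and the backoff::retry predicate adds NotifyError::Busy to its set of fatal errors.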
@@ -14,7 +14,6 @@ use utils::{

 use crate::service::Config;

-const BUSY_DELAY: Duration = Duration::from_secs(1);
 const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

 pub(crate) const API_CONCURRENCY: usize = 32;
@@ -280,11 +279,10 @@ impl ComputeHook {
                 Err(NotifyError::SlowDown)
             }
             StatusCode::LOCKED => {
-                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
-                // is not appropriate
-                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
-                    .await
-                    .ok();
+                // We consider this fatal, because it's possible that the operation blocking the control plane
+                // is also the one that is waiting for this reconcile. We should let the reconciler calling
+                // this hook fail, to give the control plane a chance to un-lock.
+                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
                 Err(NotifyError::Busy)
             }
             StatusCode::SERVICE_UNAVAILABLE
@@ -306,7 +304,12 @@ impl ComputeHook {
         let client = reqwest::Client::new();
         backoff::retry(
             || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
-            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
+            |e| {
+                matches!(
+                    e,
+                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
+                )
+            },
             3,
             10,
             "Send compute notification",
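For completeness, here is a hedged sketch of the classification side, using bare u16 status codes instead of reqwest::StatusCode; the exact set of codes the real do_notify_iteration treats as transient may differ. The point is only that 423 (Locked) lands in the Busy class that the retry predicate above now treats as fatal.

```rust
// Sketch only: a status-code classification in the spirit of do_notify_iteration,
// not its actual mapping.
#[derive(Debug, PartialEq)]
enum NotifyOutcome {
    Done,
    SlowDown,   // transient overload: retried after a delay
    Busy,       // control plane holds a lock on the tenant: fail fast
    Fatal(u16), // anything else: give up
}

fn classify_response(status: u16) -> NotifyOutcome {
    match status {
        200..=299 => NotifyOutcome::Done,
        429 | 503 => NotifyOutcome::SlowDown,
        423 => NotifyOutcome::Busy,
        other => NotifyOutcome::Fatal(other),
    }
}

fn main() {
    assert_eq!(classify_response(200), NotifyOutcome::Done);
    assert_eq!(classify_response(423), NotifyOutcome::Busy);
    assert_eq!(classify_response(503), NotifyOutcome::SlowDown);
    assert_eq!(classify_response(404), NotifyOutcome::Fatal(404));
}
```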
@@ -3936,9 +3936,6 @@ impl Service {
     /// Helper for methods that will try and call pageserver APIs for
     /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
     /// is attached somewhere.
-    ///
-    /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
-    /// an attached policy. We should error out if it isn't.
     fn ensure_attached_schedule(
         &self,
         mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -3947,10 +3944,26 @@ impl Service {
         let mut waiters = Vec::new();
         let (nodes, tenants, scheduler) = locked.parts_mut();

-        for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+        for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
             shard.schedule(scheduler)?;

+            // The shard's policies may not result in an attached location being scheduled: this
+            // is an error because our caller needs it attached somewhere.
+            if shard.intent.get_attached().is_none() {
+                return Err(anyhow::anyhow!(
+                    "Tenant {tenant_id} not scheduled to be attached"
+                ));
+            };
+
+            if shard.stably_attached().is_some() {
+                // We do not require the shard to be totally up to date on reconciliation: we just require
+                // that it has been attached on the intended node. Other dirty state such as unattached secondary
+                // locations, or compute hook notifications can be ignored.
+                continue;
+            }
+
             if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
                 waiters.push(waiter);
             }
         }
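The ensure_attached half of the change hinges on the pre-check: if a shard is already attached on its intended node, there is no need to spawn a reconciler just because other state (a failed compute notification, an unattached secondary location) is dirty. A small standalone sketch of that idea, with a hypothetical ShardState standing in for the real TenantShard and its stably_attached() method:

```rust
// Sketch only: ShardState is a hypothetical stand-in for TenantShard; the real
// stably_attached() (roughly) reports the attached node once the scheduling intent
// and the state observed on pageservers agree.
type NodeId = u64;

struct ShardState {
    intent_attached: Option<NodeId>,   // where the scheduler wants the shard attached
    observed_attached: Option<NodeId>, // where pageservers report it attached
}

impl ShardState {
    // Attached "stably" only when intent and observed state agree on the node.
    fn stably_attached(&self) -> Option<NodeId> {
        match (self.intent_attached, self.observed_attached) {
            (Some(want), Some(have)) if want == have => Some(want),
            _ => None,
        }
    }
}

// ensure_attached only needs the shard attached somewhere: if it already is,
// skip spawning a reconciler even if other state is still dirty.
fn needs_reconcile_for_attach(shard: &ShardState) -> bool {
    shard.stably_attached().is_none()
}

fn main() {
    let attached_but_dirty = ShardState {
        intent_attached: Some(1),
        observed_attached: Some(1),
    };
    let not_yet_attached = ShardState {
        intent_attached: Some(2),
        observed_attached: None,
    };
    assert!(!needs_reconcile_for_attach(&attached_but_dirty));
    assert!(needs_reconcile_for_attach(&not_yet_attached));
}
```

Skipping reconciliation here is what keeps timeline creation from blocking on a compute notification that the control plane is currently refusing.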
@@ -433,10 +433,13 @@ def test_sharding_service_compute_hook(
     # Set up fake HTTP notify endpoint
     notifications = []

+    handle_params = {"status": 200}
+
     def handler(request: Request):
-        log.info(f"Notify request: {request}")
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
         notifications.append(request.json)
-        return Response(status=200)
+        return Response(status=status)

     httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
@@ -504,6 +507,24 @@ def test_sharding_service_compute_hook(

     wait_until(10, 1, received_split_notification)

+    # If the compute hook is unavailable, that should not block creating a tenant and
+    # creating a timeline. This simulates a control plane refusing to accept notifications
+    handle_params["status"] = 423
+    degraded_tenant_id = TenantId.generate()
+    degraded_timeline_id = TimelineId.generate()
+    env.storage_controller.tenant_create(degraded_tenant_id)
+    env.storage_controller.pageserver_api().timeline_create(
+        PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id
+    )
+
+    # Ensure we hit the handler error path
+    env.storage_controller.allowed_errors.append(
+        ".*Failed to notify compute of attached pageserver.*tenant busy.*"
+    )
+    env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*")
+    assert notifications[-1] is not None
+    assert notifications[-1]["tenant_id"] == str(degraded_tenant_id)
+
     env.storage_controller.consistency_check()