# storage controller: fix retries of compute hook notifications while a secondary node is offline (#10352)
## Problem

We would sometimes fail to retry compute notifications:

1. Try to send a notification; if we can't, set `compute_notify_failure`.
2. On the next reconcile, `reconcile()` fails for some other reason (e.g. it tried to talk to an offline node), so we fail the `result.is_ok() && must_notify` condition that guards the re-send, and the retry never happens.

Closes: https://github.com/neondatabase/cloud/issues/22612

## Summary of changes

- Clarify the meaning of the reconcile result: it should be `Ok(())` if configuring the attached location worked, even if secondary or detach locations cannot be reached.
- Skip trying to talk to secondaries if they're offline.
- Even if reconcile fails and we can't send the compute notification (we can't send it because we're not sure the location is really attached), make sure we save the `compute_notify_failure` flag so that subsequent reconciler runs will try again.
- Add a regression test for the above.
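For orientation before the diff: a minimal sketch of the failure mode and the fix, using simplified hypothetical types rather than the actual storage controller code (`must_notify` stands for "a notification is still owed from an earlier pass"):

```rust
// Minimal sketch of the failure mode, with simplified hypothetical types
// (not the actual storage controller code).
struct Reconciler {
    compute_notify_failure: bool, // "a compute notification is still owed"
}

impl Reconciler {
    fn reconcile(&mut self) -> Result<(), ()> {
        Err(()) // simulate an unrelated failure, e.g. an offline secondary node
    }
    fn compute_notify(&mut self) -> Result<(), ()> {
        Ok(())
    }
}

fn tick(r: &mut Reconciler, must_notify: bool) {
    let result = r.reconcile();
    if result.is_ok() && must_notify {
        r.compute_notify().ok();
    }
    // Before the fix there was no else-branch: when reconcile() failed for an
    // unrelated reason, must_notify was silently dropped and no later pass
    // retried the notification. The fix records the obligation instead:
    else if must_notify {
        r.compute_notify_failure = true;
    }
}

fn main() {
    let mut r = Reconciler { compute_notify_failure: false };
    tick(&mut r, true);
    assert!(r.compute_notify_failure); // the pending notification survives
}
```

The point is that the obligation to notify must survive a failed pass; gating the retry purely on `result.is_ok()` silently discharges it.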
```diff
@@ -696,6 +696,11 @@ impl Reconciler {
     /// First we apply special case handling (e.g. for live migrations), and then a
     /// general case reconciliation where we walk through the intent by pageserver
     /// and call out to the pageserver to apply the desired state.
+    ///
+    /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that
+    /// all locations for the tenant are in the expected state. When nodes that are to be detached
+    /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a
+    /// state where it still requires later reconciliation.
     pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
         // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
         self.maybe_refresh_observed().await?;
```
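To make the clarified contract concrete, here is a hedged caller-side sketch (hypothetical code, not part of this commit) of what may and may not be concluded from `Ok(())`:

```rust
// Hypothetical caller of Reconciler::reconcile(), illustrating the clarified
// contract: Ok(()) means the attached location is configured, so notifying
// the compute of its attachment is safe. It does NOT mean that secondary or
// to-be-detached locations were reached; the shard may still need another pass.
async fn reconcile_and_notify(reconciler: &mut Reconciler) -> Result<(), ReconcileError> {
    reconciler.reconcile().await?;
    // Safe: we would not be advertising a location that isn't really attached.
    reconciler.compute_notify().await.ok();
    // Not implied: all secondaries configured, all detaches completed.
    Ok(())
}
```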
```diff
@@ -784,10 +789,18 @@ impl Reconciler {
                     tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
                 }
                 _ => {
-                    // In all cases other than a matching observed configuration, we will
-                    // reconcile this location.
-                    tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
-                    changes.push((node.clone(), wanted_conf))
+                    // Only try and configure secondary locations on nodes that are available. This
+                    // allows the reconciler to "succeed" while some secondaries are offline (e.g. after
+                    // a node failure, where the failed node will have a secondary intent)
+                    if node.is_available() {
+                        tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+                        changes.push((node.clone(), wanted_conf))
+                    } else {
+                        tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable");
+                        self.observed
+                            .locations
+                            .insert(node.get_id(), ObservedStateLocation { conf: None });
+                    }
                 }
             }
         }
```
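The `conf: None` write in the else-branch does real work: it marks the skipped node's state as unknown, so a later pass cannot assume it matches the intent. A small sketch of that convention, with hypothetical simplified types:

```rust
use std::collections::HashMap;

type NodeId = u64;

// Hypothetical stand-ins for the controller's observed-state bookkeeping.
struct LocationConf; // whatever config we believe is applied on a node

struct ObservedStateLocation {
    // None means "we do not know what is on this node": a later reconcile
    // must re-check or re-configure it instead of assuming it is correct.
    conf: Option<LocationConf>,
}

fn skip_offline_secondary(
    observed: &mut HashMap<NodeId, ObservedStateLocation>,
    node: NodeId,
) {
    observed.insert(node, ObservedStateLocation { conf: None });
}

fn needs_reconcile(observed: &HashMap<NodeId, ObservedStateLocation>, node: NodeId) -> bool {
    // Unknown or missing state always requires another look.
    !matches!(
        observed.get(&node),
        Some(ObservedStateLocation { conf: Some(_) })
    )
}
```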
```diff
@@ -1122,10 +1122,15 @@ impl TenantShard {
         let result = reconciler.reconcile().await;
 
         // If we know we had a pending compute notification from some previous action, send a notification irrespective
-        // of whether the above reconcile() did any work
+        // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be
+        // sending a notification of a location that isn't really attached.
         if result.is_ok() && must_notify {
             // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
             reconciler.compute_notify().await.ok();
+        } else if must_notify {
+            // Carry this flag so that the reconciler's result will indicate that it still needs to retry
+            // the compute hook notification eventually.
+            reconciler.compute_notify_failure = true;
         }
 
         // Update result counter
```
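To see that carrying the flag makes the retry converge, here is a toy model of successive reconcile passes (only `compute_notify_failure` and `pending_compute_notification` are names taken from the diff; everything else is hypothetical):

```rust
// Toy model of the retry loop across reconcile passes.
struct PassResult {
    pending_compute_notification: bool,
}

fn run_pass(reconcile_ok: bool, notify_ok: bool, must_notify: bool) -> PassResult {
    let mut compute_notify_failure = false;
    if reconcile_ok && must_notify {
        // Attempt the notification; remember if it fails (e.g. HTTP 423).
        compute_notify_failure = !notify_ok;
    } else if must_notify {
        // Could not even attempt it: keep the obligation alive.
        compute_notify_failure = true;
    }
    PassResult {
        pending_compute_notification: compute_notify_failure,
    }
}

fn main() {
    // Pass 1: reconcile fails (offline node) -> obligation carried forward.
    let r1 = run_pass(false, false, true);
    assert!(r1.pending_compute_notification);
    // Pass 2: reconcile and notify succeed -> obligation discharged.
    let r2 = run_pass(true, true, r1.pending_compute_notification);
    assert!(!r2.pending_compute_notification);
}
```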
```diff
@@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook(
     env.storage_controller.consistency_check()
 
 
+@run_only_on_default_postgres("postgres behavior is not relevant")
+def test_storage_controller_compute_hook_retry(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address: ListenAddress,
+):
+    """
+    Test that when a reconciler can't do its compute hook notification, it will keep
+    trying until it succeeds.
+
+    Reproducer for https://github.com/neondatabase/cloud/issues/22612
+    """
+
+    neon_env_builder.num_pageservers = 2
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+
+    handle_params = {"status": 200}
+
+    notifications = []
+
+    def handler(request: Request):
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
+        notifications.append(request.json)
+        return Response(status=status)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    # Start running
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.create_tenant(tenant_id, placement_policy='{"Attached": 1}')
+
+    # Initial notification from tenant creation
+    assert len(notifications) == 1
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
+        "tenant_id": str(tenant_id),
+        "stripe_size": None,
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+        "preferred_az": DEFAULT_AZ_ID,
+    }
+    assert notifications[0] == expect
+
+    # Block notifications, and fail a node
+    handle_params["status"] = 423
+    env.pageservers[0].stop()
+    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
+    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
+
+    # Avoid waiting for heartbeats
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later,
+    # and we will check that that happens
+    notifications = []
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Try reconciling again, it should try notifying again
+    notifications = []
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # The describe API should indicate that a notification is pending
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Unblock notifications: reconcile should work now
+    handle_params["status"] = 200
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 1
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+    # Reconciler should be idle now that it succeeded in its compute notification
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 0
+    assert len(notifications) == 0
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+
 @run_only_on_default_postgres("this test doesn't start an endpoint")
 def test_storage_controller_compute_hook_revert(
     httpserver: HTTPServer,
```