Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-09 14:32:57 +00:00
storcon: deny external node configuration if an operation is ongoing (#8727)
Per #8674, disallow node configuration while drain/fill operations are ongoing. This is implemented by adding an HTTP-only wrapper, `Service::external_node_configure`, which checks for an ongoing operation before configuring.

Additionally:
- allow cancelling a drain/fill after a pageserver has restarted and transitioned to WarmingUp

Fixes: #8674
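For orientation, the core of the change is a check-then-delegate guard around node configuration: the HTTP entry point refuses to touch a node while a background drain/fill is registered, and otherwise hands off to the existing internal path. A minimal, self-contained sketch of that pattern, using simplified stand-in types rather than the storage controller's real `Service`, `Operation`, and `ApiError` definitions (assumes a tokio runtime):

use std::sync::RwLock;

// Simplified stand-ins for illustration; the real Operation variants carry node
// ids and the real error type is ApiError::PreconditionFailed.
#[derive(Clone, Copy, Debug)]
#[allow(dead_code)]
enum Operation {
    Drain,
    Fill,
}

struct Service {
    ongoing_operation: RwLock<Option<Operation>>,
}

impl Service {
    async fn node_configure(&self) -> Result<(), String> {
        // ... apply the requested availability/scheduling change ...
        Ok(())
    }

    /// HTTP-only entry point: refuse configuration while a background operation is registered.
    async fn external_node_configure(&self) -> Result<(), String> {
        {
            // Scoped block: the read guard is dropped before the await below.
            let locked = self.ongoing_operation.read().unwrap();
            if let Some(op) = *locked {
                return Err(format!(
                    "Ongoing background operation forbids configuring: {op:?}"
                ));
            }
        }
        self.node_configure().await
    }
}

#[tokio::main]
async fn main() {
    let svc = Service {
        ongoing_operation: RwLock::new(Some(Operation::Drain)),
    };
    // With a drain registered, configuration is rejected up front.
    assert!(svc.external_node_configure().await.is_err());
}

Note the inner block around the lock: the read guard on shared state is released before the delegate call is awaited, so the guard is never held across an await point.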
@@ -313,20 +313,17 @@ pub struct MetadataHealthUpdateRequest {
pub struct MetadataHealthUpdateResponse {}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListUnhealthyResponse {
    pub unhealthy_tenant_shards: Vec<TenantShardId>,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
    #[serde(with = "humantime_serde")]
    pub not_scrubbed_for: Duration,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedResponse {
    pub health_records: Vec<MetadataHealthRecord>,
}

@@ -500,7 +500,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
        StatusCode::OK,
        state
            .service
-            .node_configure(
+            .external_node_configure(
                config_req.node_id,
                config_req.availability.map(NodeAvailability::from),
                config_req.scheduling,

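Handler-side, the only change is routing through the new wrapper; when the wrapper refuses, the `ApiError::PreconditionFailed` value propagates out of `handle_node_configure`. This diff does not show how that error is rendered, but the test below matches on a "Precondition failed:" message, consistent with the usual mapping of such an error to HTTP 412. A purely illustrative sketch of that kind of error-to-status mapping (hypothetical `ApiError` subset, not the controller's actual definition):

use hyper::StatusCode;

// Hypothetical, trimmed-down error type for illustration only.
#[allow(dead_code)]
enum ApiError {
    NotFound(String),
    PreconditionFailed(String),
}

// Illustrative convention: a failed precondition surfaces as 412.
fn status_for(err: &ApiError) -> StatusCode {
    match err {
        ApiError::NotFound(_) => StatusCode::NOT_FOUND,
        ApiError::PreconditionFailed(_) => StatusCode::PRECONDITION_FAILED,
    }
}

fn main() {
    let err = ApiError::PreconditionFailed(
        "Ongoing background operation forbids configuring: drain".to_string(),
    );
    println!("{}", status_for(&err)); // prints "412 Precondition Failed"
}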
@@ -4912,6 +4912,26 @@ impl Service {
        Ok(())
    }

+    /// Wrapper around [`Self::node_configure`] which only allows changes while there is no ongoing
+    /// operation for HTTP api.
+    pub(crate) async fn external_node_configure(
+        &self,
+        node_id: NodeId,
+        availability: Option<NodeAvailability>,
+        scheduling: Option<NodeSchedulingPolicy>,
+    ) -> Result<(), ApiError> {
+        {
+            let locked = self.inner.read().unwrap();
+            if let Some(op) = locked.ongoing_operation.as_ref().map(|op| op.operation) {
+                return Err(ApiError::PreconditionFailed(
+                    format!("Ongoing background operation forbids configuring: {op}").into(),
+                ));
+            }
+        }
+
+        self.node_configure(node_id, availability, scheduling).await
+    }
+
    pub(crate) async fn start_node_drain(
        self: &Arc<Self>,
        node_id: NodeId,

@@ -5017,14 +5037,14 @@ impl Service {
    }

    pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
+        let node_available = {
            let locked = self.inner.read().unwrap();
            let nodes = &locked.nodes;
            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
                anyhow::anyhow!("Node {} not registered", node_id).into(),
            ))?;

-            (node.is_available(), node.get_scheduling())
+            node.is_available()
        };

        if !node_available {

@@ -5033,12 +5053,6 @@ impl Service {
            ));
        }

-        if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no drain in progress").into(),
-            ));
-        }
-
        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
            if let Operation::Drain(drain) = op_handler.operation {
                if drain.node_id == node_id {

@@ -5152,14 +5166,14 @@ impl Service {
    }

    pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
+        let node_available = {
            let locked = self.inner.read().unwrap();
            let nodes = &locked.nodes;
            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
                anyhow::anyhow!("Node {} not registered", node_id).into(),
            ))?;

-            (node.is_available(), node.get_scheduling())
+            node.is_available()
        };

        if !node_available {

@@ -5168,12 +5182,6 @@ impl Service {
            ));
        }

-        if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no fill in progress").into(),
-            ));
-        }
-
        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
            if let Operation::Fill(fill) = op_handler.operation {
                if fill.node_id == node_id {

@@ -5982,7 +5990,7 @@ impl Service {
                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
                .await;

-            failpoint_support::sleep_millis_async!("sleepy-drain-loop");
+            failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
        }

        while !waiters.is_empty() {

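The extra `&cancel` argument threads the operation's cancellation token into the injected sleep, so cancelling a drain no longer has to wait out the full failpoint delay. The macro itself is a neon-internal helper; a generic sketch of the underlying pattern with tokio and `tokio_util::sync::CancellationToken` (illustrative, not the macro's actual implementation):

use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Sleep for `millis`, but return early if `cancel` fires: the general shape
// of a cancellation-aware injected delay.
async fn cancellable_sleep(millis: u64, cancel: &CancellationToken) {
    tokio::select! {
        _ = tokio::time::sleep(Duration::from_millis(millis)) => {}
        _ = cancel.cancelled() => {}
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let token = cancel.clone();
    // Cancel shortly after starting: the nominal 10s sleep returns almost immediately.
    tokio::spawn(async move {
        tokio::time::sleep(Duration::from_millis(50)).await;
        token.cancel();
    });
    cancellable_sleep(10_000, &cancel).await;
    println!("sleep ended early after cancellation");
}

This matters for the test below, which injects a 10-second sleep per drain-loop iteration and then expects `cancel_node_drain` to take effect promptly.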
@@ -2091,3 +2091,47 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
        )
        == 0
    )
+
+
+def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
+    # single unsharded tenant, two locations
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_start()
+
+    env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
+    env.storage_controller.reconcile_until_idle()
+
+    attached_id = int(env.storage_controller.locate(env.initial_tenant)[0]["node_id"])
+    attached = next((ps for ps in env.pageservers if ps.id == attached_id))
+
+    def attached_is_draining():
+        details = env.storage_controller.node_status(attached.id)
+        assert details["scheduling"] == "Draining"
+
+    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)"))
+    env.storage_controller.node_drain(attached.id)
+
+    wait_until(10, 0.5, attached_is_draining)
+
+    attached.restart()
+
+    # we are unable to reconfigure the node while the operation is still ongoing
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"availability": "Offline"})
+
+    env.storage_controller.cancel_node_drain(attached.id)
+
+    def reconfigure_node_again():
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+
+    # allow for a small delay between actually having cancelled and being able to reconfigure again
+    wait_until(4, 0.5, reconfigure_node_again)