storcon: deny external node configuration if an operation is ongoing (#8727)

Per #8674, disallow node configuration while drain/fill are ongoing.
Implement it by adding an HTTP-only wrapper,
`Service::external_node_configure`, which checks for an ongoing operation
before configuring.

Additionally:
- allow cancelling drain/fill after a pageserver has restarted and
transitioned to WarmingUp

Fixes: #8674
This commit is contained in:
Joonas Koivunen
2024-08-15 12:54:05 +03:00
committed by GitHub
parent a9c28be7d0
commit d9a57aeed9
4 changed files with 70 additions and 21 deletions

View File

@@ -500,7 +500,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
StatusCode::OK,
state
.service
.node_configure(
.external_node_configure(
config_req.node_id,
config_req.availability.map(NodeAvailability::from),
config_req.scheduling,

View File

@@ -4912,6 +4912,26 @@ impl Service {
Ok(())
}
/// HTTP-facing entry point for node configuration: refuses the request while a background
/// operation is registered, and otherwise forwards to [`Self::node_configure`].
///
/// Returns [`ApiError::PreconditionFailed`] naming the ongoing operation when one exists;
/// in every other case the result is whatever [`Self::node_configure`] returns.
///
/// The read lock on `inner` is dropped before delegating, so the check and the
/// configuration change are not performed atomically.
pub(crate) async fn external_node_configure(
    &self,
    node_id: NodeId,
    availability: Option<NodeAvailability>,
    scheduling: Option<NodeSchedulingPolicy>,
) -> Result<(), ApiError> {
    // Copy the operation descriptor out; the guard is a temporary of this statement, so
    // it is gone before the `.await` below.
    let ongoing = self
        .inner
        .read()
        .unwrap()
        .ongoing_operation
        .as_ref()
        .map(|handler| handler.operation);

    match ongoing {
        Some(op) => Err(ApiError::PreconditionFailed(
            format!("Ongoing background operation forbids configuring: {op}").into(),
        )),
        None => self.node_configure(node_id, availability, scheduling).await,
    }
}
pub(crate) async fn start_node_drain(
self: &Arc<Self>,
node_id: NodeId,
@@ -5017,14 +5037,14 @@ impl Service {
}
pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
let (node_available, node_policy) = {
let node_available = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
(node.is_available(), node.get_scheduling())
node.is_available()
};
if !node_available {
@@ -5033,12 +5053,6 @@ impl Service {
));
}
if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} has no drain in progress").into(),
));
}
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
if let Operation::Drain(drain) = op_handler.operation {
if drain.node_id == node_id {
@@ -5152,14 +5166,14 @@ impl Service {
}
pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
let (node_available, node_policy) = {
let node_available = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
(node.is_available(), node.get_scheduling())
node.is_available()
};
if !node_available {
@@ -5168,12 +5182,6 @@ impl Service {
));
}
if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} has no fill in progress").into(),
));
}
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
if let Operation::Fill(fill) = op_handler.operation {
if fill.node_id == node_id {
@@ -5982,7 +5990,7 @@ impl Service {
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
failpoint_support::sleep_millis_async!("sleepy-drain-loop");
failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
}
while !waiters.is_empty() {