storage controller: misc fixes (#7036)

## Problem

Collection of small changes, batched together to reduce CI overhead.

## Summary of changes

- Layer download log messages now include the layer size: this is useful when
  watching a pageserver hydrate its on-disk cache in the log.
- The controller's migrate API could put an invalid NodeId into TenantState;
  the target node is now validated before it is stored.
- Scheduling errors during tenant creation could result in creating some
  shards and not others; errors are now deferred until all shards have been
  created (see the sketch after this list).
- The consistency check could give hard-to-understand failures in tests if a
  reconcile was in progress: explicitly fail the check if reconciles are in
  progress instead.
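
The third fix is an instance of a general pattern: record the first failure inside the loop instead of returning early, then surface it once every item exists. Below is a minimal standalone sketch of that pattern; `Shard`, `Scheduler`, and `ScheduleError` are hypothetical stand-ins, not the controller's real types.

```rust
struct Scheduler {
    available_nodes: usize,
}

struct Shard {
    id: u32,
    scheduled: bool,
}

#[derive(Debug)]
struct ScheduleError(String);

impl Scheduler {
    fn schedule(&self, shard: &mut Shard) -> Result<(), ScheduleError> {
        if self.available_nodes == 0 {
            Err(ScheduleError(format!("no nodes for shard {}", shard.id)))
        } else {
            shard.scheduled = true;
            Ok(())
        }
    }
}

fn create_all_shards(
    scheduler: &Scheduler,
    shards: &mut Vec<Shard>,
    count: u32,
) -> Result<(), ScheduleError> {
    // Remember the first scheduling failure instead of returning early:
    // every shard is still inserted, so the tenant is never half-created.
    let mut schedule_error = None;
    for id in 0..count {
        let mut shard = Shard { id, scheduled: false };
        if let Err(e) = scheduler.schedule(&mut shard) {
            schedule_error = Some(e);
        }
        shards.push(shard);
    }
    // Surface the error only after all shards exist, mirroring the diff below.
    match schedule_error {
        Some(e) => Err(e),
        None => Ok(()),
    }
}
```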
Author: John Spray
Date: 2024-03-06 16:47:32 +00:00 (committed via GitHub)
Commit: a9a4a76d13 (parent 5dc2088cf3)

2 changed files with 47 additions and 19 deletions


@@ -1159,9 +1159,12 @@ impl Service {
         let (waiters, response_shards) = {
             let mut locked = self.inner.write().unwrap();
-            let (_nodes, tenants, scheduler) = locked.parts_mut();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let (nodes, tenants, scheduler) = locked.parts_mut();

             let mut response_shards = Vec::new();
+            let mut schedule_error = None;

             for tenant_shard_id in create_ids {
                 tracing::info!("Creating shard {tenant_shard_id}...");
@@ -1198,23 +1201,20 @@
                         continue;
                     }
                     Entry::Vacant(entry) => {
-                        let mut state = TenantState::new(
+                        let state = entry.insert(TenantState::new(
                             tenant_shard_id,
                             ShardIdentity::from_params(
                                 tenant_shard_id.shard_number,
                                 &create_req.shard_parameters,
                             ),
                             placement_policy.clone(),
-                        );
+                        ));

                         state.generation = initial_generation;
                         state.config = create_req.config.clone();

-                        state.schedule(scheduler).map_err(|e| {
-                            ApiError::Conflict(format!(
-                                "Failed to schedule shard {tenant_shard_id}: {e}"
-                            ))
-                        })?;
+                        if let Err(e) = state.schedule(scheduler) {
+                            schedule_error = Some(e);
+                        }

                         // Only include shards in result if we are attaching: the purpose
                         // of the response is to tell the caller where the shards are attached.
@@ -1228,24 +1228,27 @@
                                 generation: generation.into().unwrap(),
                             });
                         }
-
-                        entry.insert(state)
                     }
                 };
             }

-            // Take a snapshot of pageservers
-            let pageservers = locked.nodes.clone();
+            // If we failed to schedule shards, then they are still created in the controller,
+            // but we return an error to the requester to avoid a silent failure when someone
+            // tries to e.g. create a tenant whose placement policy requires more nodes than
+            // are present in the system. We do this here rather than in the above loop, to
+            // avoid situations where we only create a subset of shards in the tenant.
+            if let Some(e) = schedule_error {
+                return Err(ApiError::Conflict(format!(
+                    "Failed to schedule shard(s): {e}"
+                )));
+            }

-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();

-            let waiters = locked
-                .tenants
+            let waiters = tenants
                 .range_mut(TenantShardId::tenant_range(tenant_id))
                 .filter_map(|(_shard_id, shard)| {
                     shard.maybe_reconcile(
                         result_tx.clone(),
-                        &pageservers,
+                        nodes,
                         &compute_hook,
                         &self.config,
                         &self.persistence,
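
A side note on this hunk: `result_tx` and `compute_hook` are cloned out of the lock guard before `parts_mut()` takes its mutable borrows, so both handles remain usable inside the `range_mut` closure. A minimal sketch of that borrow pattern, with hypothetical types standing in for the guarded controller state:

```rust
use std::collections::BTreeMap;

// Hypothetical stand-ins for the guarded controller state.
struct Inner {
    result_tx: std::sync::mpsc::Sender<u32>,
    tenants: BTreeMap<u32, String>,
}

fn reconcile_all(inner: &mut Inner) {
    // Clone the shared handle first: the cheap clone ends its borrow of
    // `inner`, so the mutable iteration below does not conflict with it.
    let result_tx = inner.result_tx.clone();
    for (id, _shard) in inner.tenants.iter_mut() {
        // Each iteration can hand out its own clone of the sender.
        result_tx.send(*id).ok();
    }
}
```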
@@ -2516,6 +2519,19 @@
             let compute_hook = locked.compute_hook.clone();
             let (nodes, tenants, scheduler) = locked.parts_mut();

+            let Some(node) = nodes.get(&migrate_req.node_id) else {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "Node {} not found",
+                    migrate_req.node_id
+                )));
+            };
+
+            if node.availability != NodeAvailability::Active {
+                // Warn but proceed: the caller may intend to manually adjust the placement of
+                // a shard even if the node is down, e.g. if intervening during an incident.
+                tracing::warn!("Migrating to an unavailable node ({})", node.id);
+            }
+
             let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
                 return Err(ApiError::NotFound(
                     anyhow::anyhow!("Tenant shard not found").into(),
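
The migrate fix above is a straightforward `let ... else` guard: look the node up before anything about it is stored. A minimal sketch of the same shape, with hypothetical simplified types (the real controller's `NodeId` and `Node` are richer):

```rust
use std::collections::HashMap;

type NodeId = u64;

#[derive(Debug, PartialEq)]
enum NodeAvailability {
    Active,
    Offline,
}

struct Node {
    id: NodeId,
    availability: NodeAvailability,
}

fn validate_migrate_target(
    nodes: &HashMap<NodeId, Node>,
    target: NodeId,
) -> Result<&Node, String> {
    // Reject unknown node ids up front so an invalid id can never be
    // stored into tenant state.
    let Some(node) = nodes.get(&target) else {
        return Err(format!("Node {target} not found"));
    };
    if node.availability != NodeAvailability::Active {
        // Warn but proceed, matching the diff: operators may deliberately
        // target a down node during an incident.
        eprintln!("Migrating to an unavailable node ({})", node.id);
    }
    Ok(node)
}
```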
@@ -2645,6 +2661,18 @@
                 .map(|t| t.to_persistent())
                 .collect::<Vec<_>>();

+            // This method can only validate the state of an idle system: if a reconcile is in
+            // progress, fail out early to avoid giving false errors on state that won't match
+            // between database and memory until a ReconcileResult is processed.
+            for t in locked.tenants.values() {
+                if t.reconciler.is_some() {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard {} reconciliation in progress",
+                        t.tenant_shard_id
+                    )));
+                }
+            }
+
             (expect_nodes, expect_shards)
         };
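
The guard above is easy to model in isolation: while a shard has a reconciler in flight, in-memory state legitimately diverges from the database, so the check refuses to run rather than reporting confusing mismatches. A sketch with hypothetical types:

```rust
// Hypothetical minimal model of the consistency-check guard.
struct TenantShard {
    id: u32,
    reconciler_active: bool,
}

fn consistency_check(shards: &[TenantShard]) -> Result<(), String> {
    // Fail fast on any in-flight reconcile instead of comparing state
    // that is expected to be temporarily inconsistent.
    for shard in shards {
        if shard.reconciler_active {
            return Err(format!("Shard {} reconciliation in progress", shard.id));
        }
    }
    // ... compare in-memory state against the database here ...
    Ok(())
}
```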


@@ -976,7 +976,7 @@ impl LayerInner {
             }

             self.consecutive_failures.store(0, Ordering::Relaxed);
-            tracing::info!("on-demand download successful");
+            tracing::info!(size=%self.desc.file_size, "on-demand download successful");

             Ok(permit)
         }
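
The log change in this last hunk uses tracing's structured fields: the `%` sigil records the field via its `Display` implementation, so the size appears as a key-value pair rather than being interpolated into the message. A minimal runnable sketch, assuming the `tracing` and `tracing-subscriber` crates:

```rust
// Cargo.toml (assumed): tracing = "0.1", tracing-subscriber = "0.3"
fn main() {
    tracing_subscriber::fmt().init();

    let file_size: u64 = 26_214_400;
    // `%` captures the field via Display, so the line renders as e.g.
    // "INFO on-demand download successful size=26214400"
    tracing::info!(size = %file_size, "on-demand download successful");
}
```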