storage controller: make direct tenant creation more robust (#7247)
## Problem

- Creations were not idempotent (a retry could fail with a unique key violation).
- Creations waited for reconciliation, which the cloud control plane blocks while an operation is in flight.

## Summary of changes

- Handle a unique key constraint violation as an OK situation: if we're creating the same tenant ID and shard count, it's reasonable to assume this is a duplicate creation.
- Make the wait for reconciliation during creation tolerate failures: this is similar to location_conf, where the cloud control plane blocks our notification calls until it is done calling into our API (in future this constraint is expected to relax as the cloud control plane learns to run multiple operations concurrently for a tenant).
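To make the intent of the two changes concrete, here is a minimal, self-contained Rust sketch. It is not the storage controller's actual code: `DatabaseError`, `ApiError`, and the plain `Result` parameters are simplified stand-ins for the real diesel and API types. It shows a unique-key violation on insert being reported as success (an idempotent retry), and a failed or timed-out reconcile wait being logged and tolerated rather than failing the request.

```rust
use std::time::Duration;

// Simplified stand-ins for the real database and API error types.
#[derive(Debug)]
enum DatabaseError {
    UniqueViolation,
    Other(String),
}

#[derive(Debug)]
struct ApiError(String);

// Treat a unique-key violation as a duplicate of an earlier, identical creation
// and report success; surface any other database error to the caller.
fn classify_insert_result(result: Result<(), DatabaseError>) -> Result<(), ApiError> {
    match result {
        Ok(()) => Ok(()),
        Err(DatabaseError::UniqueViolation) => {
            println!("tenant shards already present, proceeding with idempotent creation");
            Ok(())
        }
        Err(other) => Err(ApiError(format!("database error: {other:?}"))),
    }
}

// Tolerate a reconcile that did not finish in time: log a warning and carry on,
// since reconciliation keeps being retried in the background.
fn tolerate_reconcile_result(wait_result: Result<(), String>, timeout: Duration) {
    if let Err(e) = wait_result {
        println!("reconcile not done within {timeout:?}, retrying in background: {e}");
    }
}

fn main() {
    // A duplicate creation maps to success rather than an error response.
    assert!(classify_insert_result(Err(DatabaseError::UniqueViolation)).is_ok());
    // A genuine database failure is still an error.
    assert!(classify_insert_result(Err(DatabaseError::Other("connection lost".into()))).is_err());
    // A blocked reconcile does not fail the creation request.
    tolerate_reconcile_result(Err("control plane busy".into()), Duration::from_secs(10));
}
```

The duplicate-key case is safe to treat as success because the shard count is part of the unique key: a violation therefore implies the retry asked for the same tenant ID and shard count as the original creation.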
```diff
@@ -1523,6 +1523,8 @@ impl Service {
         &self,
         create_req: TenantCreateRequest,
     ) -> Result<TenantCreateResponse, ApiError> {
+        let tenant_id = create_req.new_tenant_id.tenant_id;
+
         // Exclude any concurrent attempts to create/access the same tenant ID
         let _tenant_lock = self
             .tenant_op_locks
@@ -1531,7 +1533,12 @@ impl Service {
 
         let (response, waiters) = self.do_tenant_create(create_req).await?;
 
-        self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
+            // accept compute notifications while it is in the process of creating. Reconciliation will
+            // be retried in the background.
+            tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
+        }
         Ok(response)
     }
 
@@ -1610,13 +1617,25 @@ impl Service {
                 splitting: SplitState::default(),
             })
             .collect();
-        self.persistence
+
+        match self
+            .persistence
             .insert_tenant_shards(persist_tenant_shards)
             .await
-            .map_err(|e| {
-                // TODO: distinguish primary key constraint (idempotent, OK), from other errors
-                ApiError::InternalServerError(anyhow::anyhow!(e))
-            })?;
+        {
+            Ok(_) => {}
+            Err(DatabaseError::Query(diesel::result::Error::DatabaseError(
+                DatabaseErrorKind::UniqueViolation,
+                _,
+            ))) => {
+                // Unique key violation: this is probably a retry. Because the shard count is part of the unique key,
+                // if we see a unique key violation it means that the creation request's shard count matches the previous
+                // creation's shard count.
+                tracing::info!("Tenant shards already present in database, proceeding with idempotent creation...");
+            }
+            // Any other database error is unexpected and a bug.
+            Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
+        };
 
         let (waiters, response_shards) = {
             let mut locked = self.inner.write().unwrap();
```
```diff
@@ -2126,6 +2126,8 @@ class NeonStorageController(MetricsGetter):
         shard_params = {"count": shard_count}
         if shard_stripe_size is not None:
             shard_params["stripe_size"] = shard_stripe_size
+        else:
+            shard_params["stripe_size"] = 32768
 
         body["shard_parameters"] = shard_params
 
@@ -2139,6 +2141,7 @@ class NeonStorageController(MetricsGetter):
             json=body,
             headers=self.headers(TokenScope.PAGE_SERVER_API),
         )
         response.raise_for_status()
+        log.info(f"tenant_create success: {response.json()}")
 
     def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
```
```diff
@@ -89,6 +89,11 @@ def test_sharding_service_smoke(
     for tid in tenant_ids:
         env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
 
+    # Repeating a creation should be idempotent (we are just testing it doesn't return an error)
+    env.storage_controller.tenant_create(
+        tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant
+    )
+
     for node_id, count in get_node_shard_counts(env, tenant_ids).items():
         # we used a multiple of pageservers for the total shard count,
         # so expect equal number on all pageservers
```