storcon: reproduce shard split issue (#11290)

## Problem

Issue https://github.com/neondatabase/neon/issues/11254 describes a case where a restart during a shard split can result in a bad end state in the database.

## Summary of changes

- Add a reproducer for the issue
- Tighten an existing safety check around updated row counts in `complete_shard_split`
2026-01-07 13:32:57 +00:00 · 2025-03-21 08:48:56 +00:00
parent 0d99609870
commit 76088c16d2
4 changed files with 155 additions and 3 deletions
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -997,10 +997,11 @@ impl Persistence {
                // Clear sharding flag
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(new_shard_count.literal() as i32))
                    .set((splitting.eq(0),))
                    .execute(conn)
                    .await?;
-                debug_assert!(updated > 0);
+                assert!(updated == new_shard_count.count() as usize);

                Ok(())
            })
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5456,6 +5456,8 @@ impl Service {
            }
        }

+        pausable_failpoint!("shard-split-pre-complete");
+
        // TODO: if the pageserver restarted concurrently with our split API call,
        // the actual generation of the child shard might differ from the generation
        // we expect it to have.  In order for our in-database generation to end up