mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 21:42:56 +00:00
storcon: safety check when completing shard split (#11256)
## Problem

There is a rare race between controller graceful deployment and shard splitting, in which we may incorrectly both abort _and_ complete the split (on different pods), thereby leaving no shards at all in the database.

Related: #11254

## Summary of changes

- In `complete_shard_split`, refuse to delete anything if the child shards are not found.
This commit is contained in:
@@ -967,10 +967,26 @@ impl Persistence {
|
||||
&self,
|
||||
split_tenant_id: TenantId,
|
||||
old_shard_count: ShardCount,
|
||||
new_shard_count: ShardCount,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| {
|
||||
Box::pin(async move {
|
||||
// Sanity: child shards must still exist, as we're deleting parent shards
|
||||
let child_shards_query = tenant_shards
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32));
|
||||
let child_shards = child_shards_query
|
||||
.load::<TenantShardPersistence>(conn)
|
||||
.await?;
|
||||
if child_shards.len() != new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected child shard count {} while completing split to \
|
||||
count {new_shard_count:?} on tenant {split_tenant_id}",
|
||||
child_shards.len()
|
||||
)));
|
||||
}
|
||||
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
|
||||
@@ -5753,7 +5753,7 @@ impl Service {
|
||||
// it doesn't match, but that requires more retry logic on this side)
|
||||
|
||||
self.persistence
|
||||
.complete_shard_split(tenant_id, old_shard_count)
|
||||
.complete_shard_split(tenant_id, old_shard_count, new_shard_count)
|
||||
.await?;
|
||||
|
||||
fail::fail_point!("shard-split-post-complete", |_| Err(
|
||||
|
||||
Reference in New Issue
Block a user