From da22557383ce590f60a8c3f8ce8bc43d09dcc470 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Fri, 8 Mar 2024 17:14:30 +0000
Subject: [PATCH] pageserver: fix leaving tenant in bad state on split failure

---
 pageserver/src/tenant/mgr.rs | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index fc08b3c82e..efcbe7c83b 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1440,6 +1440,31 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
         new_shard_count: ShardCount,
         ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<TenantShardId>> {
+        let r = self
+            .do_shard_split(tenant_shard_id, new_shard_count, ctx)
+            .await;
+        if r.is_err() {
+            // Shard splitting might have left the original shard in a partially shut down state (it
+            // stops the shard's remote timeline client). Reset it to ensure we leave things in
+            // a working state.
+            if self.get(tenant_shard_id).is_some() {
+                tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
+                if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
+                    // Log this error because our return value will still be the original error, not this one.
+                    tracing::warn!("Failed to reset {tenant_shard_id}: {e}");
+                }
+            }
+        }
+
+        r
+    }
+
+    pub(crate) async fn do_shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        new_shard_count: ShardCount,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Vec<TenantShardId>> {
         let tenant = get_tenant(tenant_shard_id, true)?;