From 6313f1fa7a36a91a83158a381bd850f0147cb772 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 26 Mar 2024 09:56:47 +0000
Subject: [PATCH] tests: tolerate transient unavailability in
 test_sharding_split_failures (#7223)

## Problem

While most forms of split rollback don't interrupt clients, a couple of
cases do. The interruption is brief -- it lasts only as long as the
controller takes to kick off Reconcilers during the async abort of the
split -- so it's operationally fine, but it can trip up a test.

- #7148

## Summary of changes

- Relax test check to require that the tenant is eventually available
after split failure, rather than immediately. In the vast majority of
cases this will pass on the first iteration.
---
 test_runner/regress/test_sharding.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index e6318aff68..9aebf16c68 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -874,11 +874,17 @@ def test_sharding_split_failures(
         workload.validate()
 
     if failure.expect_available():
-        # Even though the split failed partway through, this should not have interrupted
-        # clients.  Disable waiting for pageservers in the workload helper, because our
-        # failpoints may prevent API access.
-        # This only applies for failure modes that leave pageserver page_service API available.
-        workload.churn_rows(10, upload=False, ingest=False)
+        # Even though the split failed partway through, this should not leave the tenant in
+        # an unavailable state.
+        # - Disable waiting for pageservers in the workload helper, because our
+        #   failpoints may prevent API access. This only applies for failure modes that
+        #   leave pageserver page_service API available.
+        # - This is a wait_until because clients may see transient errors in some split error cases,
+        #   e.g. while waiting for the storage controller to re-attach a parent shard if we failed
+        #   inside the pageserver and the storage controller responds by detaching children and attaching
+        #   parents concurrently (https://github.com/neondatabase/neon/issues/7148)
+        wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False))  # type: ignore
+
         workload.validate()
 
     if failure.fails_forward(env):