make test_tenant_detach_smoke fail reproducibly

Add a failpoint that triggers the race condition. Skip the test until we land the fix from https://github.com/neondatabase/neon/pull/2851 as part of https://github.com/neondatabase/neon/pull/2785
2025-12-23 06:09:59 +00:00 · 2022-11-17 10:53:06 -05:00
parent d783889a1f
commit f564dff0e3
3 changed files with 48 additions and 15 deletions
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -48,6 +48,25 @@ pub mod nonblock;
 // Default signal handling
 pub mod signals;

+/// Async sleep failpoint: `fail::cfg("$name", "return(2000)")` makes the call site sleep for 2000 ms.
+#[macro_export]
+macro_rules! failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        let should_sleep: Option<std::time::Duration> = (|| { // closure confines fail_point!'s early return
+            fail::fail_point!($name, |v: Option<_>| {
+                let millis = v.unwrap().parse::<u64>().unwrap(); // panics on a missing or non-u64 value
+                Some(std::time::Duration::from_millis(millis)) // full path: callers need not import Duration
+            });
+            None
+        })();
+        if let Some(d) = should_sleep {
+            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
+            tokio::time::sleep(d).await; // requires the macro to be invoked in an async context
+            tracing::info!("failpoint {:?}: sleep done", $name);
+        }
+    }};
+}
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1010,6 +1010,10 @@ impl Tenant {

        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;

+        utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+
+        info!("starting on {} timelines", gc_timelines.len());
+
        // Perform GC for each timeline.
        //
        // Note that we don't hold the GC lock here because we don't want
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,3 +1,4 @@
+import time
 from threading import Thread

 import pytest
@@ -11,11 +12,21 @@ def do_gc_target(
 ):
    """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
    try:
+        log.info("sending gc http request")
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
    except Exception as e:
        log.error("do_gc failed: %s", e)
+    finally:
+        log.info("gc http thread returning")


+@pytest.mark.skip(
+    reason="""
+Commit 'make test_tenant_detach_smoke fail reproducibly' adds failpoint to make this test fail reproducibly.
+Fix in https://github.com/neondatabase/neon/pull/2851 will come as part of
+https://github.com/neondatabase/neon/pull/2785 .
+"""
+)
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
@@ -51,7 +62,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        ]
    )

-    # gc should not try to even start
+    # gc should not even try to start on a timeline that doesn't exist
    with pytest.raises(
        expected_exception=PageserverApiException, match="gc target timeline does not exist"
    ):
@@ -61,25 +72,24 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        # the error will be printed to the log too
    env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")

-    # try to concurrently run gc and detach
+    # Detach while running manual GC.
+    # It should wait for manual GC to finish (right now it doesn't, which is why this test fails sometimes).
+    pageserver_http.configure_failpoints(
+        ("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
+    )
    gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
    gc_thread.start()
+    time.sleep(1)
+    # By now the gc task has been spawned but is still sleeping for another second due to the failpoint.

-    last_error = None
-    for i in range(3):
-        try:
-            pageserver_http.tenant_detach(tenant_id)
-        except Exception as e:
-            last_error = e
-            log.error(f"try {i} error detaching tenant: {e}")
-            continue
-        else:
-            break
-    # else is called if the loop finished without reaching "break"
-    else:
-        pytest.fail(f"could not detach tenant: {last_error}")
+    log.info("detaching tenant")
+    pageserver_http.tenant_detach(tenant_id)
+    log.info("tenant detached without error")

+    log.info("wait for gc thread to return")
    gc_thread.join(timeout=10)
+    assert not gc_thread.is_alive()
+    log.info("gc thread returned")

    # check that nothing is left on disk for deleted tenant
    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()