diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 81722085d8..e1c9a373e5 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -50,6 +50,25 @@ pub mod signals; pub mod fs_ext; +/// use with fail::cfg("$name", "return(2000)") +#[macro_export] +macro_rules! failpoint_sleep_millis_async { + ($name:literal) => {{ + let should_sleep: Option = (|| { + fail::fail_point!($name, |v: Option<_>| { + let millis = v.unwrap().parse::().unwrap(); + Some(Duration::from_millis(millis)) + }); + None + })(); + if let Some(d) = should_sleep { + tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d); + tokio::time::sleep(d).await; + tracing::info!("failpoint {:?}: sleep done", $name); + } + }}; +} + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 64dc07212e..81db7d593b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1609,6 +1609,10 @@ impl Tenant { let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?; + utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); + + info!("starting on {} timelines", gc_timelines.len()); + // Perform GC for each timeline. // // Note that we don't hold the GC lock here because we don't want diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index f66bacc4f7..d20860f25a 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -4,6 +4,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PageserverHttpClient from fixtures.types import TenantId, TimelineId +import time def do_gc_target( @@ -11,9 +12,12 @@ def do_gc_target( ): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: + log.info("sending gc http request") pageserver_http.timeline_gc(tenant_id, timeline_id, 0) except Exception as e: log.error("do_gc failed: %s", e) + finally: + log.info("gc http thread returning") def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): @@ -48,7 +52,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): ] ) - # gc should not try to even start + # gc should not try to even start on a timeline that doesn't exist with pytest.raises( expected_exception=PageserverApiException, match="gc target timeline does not exist" ): @@ -58,25 +62,24 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") - # try to concurrently run gc and detach + # Detach while running manual GC. + # It should wait for manual GC to finish (right now it doesn't that's why this test fails sometimes) + pageserver_http.configure_failpoints( + ("gc_iteration_internal_after_getting_gc_timelines", "return(2000)") + ) gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id)) gc_thread.start() + time.sleep(1) + # By now the gc task is spawned but in sleep for another second due to the failpoint. - last_error = None - for i in range(3): - try: - pageserver_http.tenant_detach(tenant_id) - except Exception as e: - last_error = e - log.error(f"try {i} error detaching tenant: {e}") - continue - else: - break - # else is called if the loop finished without reaching "break" - else: - pytest.fail(f"could not detach tenant: {last_error}") + log.info("detaching tenant") + pageserver_http.tenant_detach(tenant_id) + log.info("tenant detached without error") + log.info("wait for gc thread to return") gc_thread.join(timeout=10) + assert not gc_thread.is_alive() + log.info("gc thread returned") # check that nothing is left on disk for deleted tenant assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()