make test_tenant_detach_smoke fail reproducibly

Add failpoint that triggers the race condition.
This commit is contained in:
Christian Schwarz
2022-11-17 10:53:06 -05:00
committed by Dmitry Rodionov
parent 9d273c840a
commit 919f2b261a
3 changed files with 41 additions and 15 deletions

View File

@@ -50,6 +50,25 @@ pub mod signals;
pub mod fs_ext;
/// use with fail::cfg("$name", "return(2000)")
#[macro_export]
macro_rules! failpoint_sleep_millis_async {
($name:literal) => {{
let should_sleep: Option<std::time::Duration> = (|| {
fail::fail_point!($name, |v: Option<_>| {
let millis = v.unwrap().parse::<u64>().unwrap();
Some(Duration::from_millis(millis))
});
None
})();
if let Some(d) = should_sleep {
tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
tokio::time::sleep(d).await;
tracing::info!("failpoint {:?}: sleep done", $name);
}
}};
}
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -1609,6 +1609,10 @@ impl Tenant {
let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
info!("starting on {} timelines", gc_timelines.len());
// Perform GC for each timeline.
//
// Note that we don't hold the GC lock here because we don't want

View File

@@ -4,6 +4,7 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PageserverHttpClient
from fixtures.types import TenantId, TimelineId
import time
def do_gc_target(
@@ -11,9 +12,12 @@ def do_gc_target(
):
"""Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
try:
log.info("sending gc http request")
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
except Exception as e:
log.error("do_gc failed: %s", e)
finally:
log.info("gc http thread returning")
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
@@ -48,7 +52,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
]
)
# gc should not try to even start
# gc should not try to even start on a timeline that doesn't exist
with pytest.raises(
expected_exception=PageserverApiException, match="gc target timeline does not exist"
):
@@ -58,25 +62,24 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
# the error will be printed to the log too
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
# try to concurrently run gc and detach
# Detach while running manual GC.
# It should wait for manual GC to finish (right now it doesn't that's why this test fails sometimes)
pageserver_http.configure_failpoints(
("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
)
gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
gc_thread.start()
time.sleep(1)
# By now the gc task is spawned but in sleep for another second due to the failpoint.
last_error = None
for i in range(3):
try:
pageserver_http.tenant_detach(tenant_id)
except Exception as e:
last_error = e
log.error(f"try {i} error detaching tenant: {e}")
continue
else:
break
# else is called if the loop finished without reaching "break"
else:
pytest.fail(f"could not detach tenant: {last_error}")
log.info("detaching tenant")
pageserver_http.tenant_detach(tenant_id)
log.info("tenant detached without error")
log.info("wait for gc thread to return")
gc_thread.join(timeout=10)
assert not gc_thread.is_alive()
log.info("gc thread returned")
# check that nothing is left on disk for deleted tenant
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()