mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 15:10:44 +00:00
make test_tenant_detach_smoke fail reproducibly
Add failpoint that triggers the race condition.
This commit is contained in:
committed by
Dmitry Rodionov
parent
9d273c840a
commit
919f2b261a
@@ -50,6 +50,25 @@ pub mod signals;
|
||||
|
||||
pub mod fs_ext;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
#[macro_export]
|
||||
macro_rules! failpoint_sleep_millis_async {
|
||||
($name:literal) => {{
|
||||
let should_sleep: Option<std::time::Duration> = (|| {
|
||||
fail::fail_point!($name, |v: Option<_>| {
|
||||
let millis = v.unwrap().parse::<u64>().unwrap();
|
||||
Some(Duration::from_millis(millis))
|
||||
});
|
||||
None
|
||||
})();
|
||||
if let Some(d) = should_sleep {
|
||||
tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
|
||||
tokio::time::sleep(d).await;
|
||||
tracing::info!("failpoint {:?}: sleep done", $name);
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -1609,6 +1609,10 @@ impl Tenant {
|
||||
|
||||
let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
|
||||
|
||||
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
|
||||
|
||||
info!("starting on {} timelines", gc_timelines.len());
|
||||
|
||||
// Perform GC for each timeline.
|
||||
//
|
||||
// Note that we don't hold the GC lock here because we don't want
|
||||
|
||||
@@ -4,6 +4,7 @@ import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PageserverHttpClient
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
import time
|
||||
|
||||
|
||||
def do_gc_target(
|
||||
@@ -11,9 +12,12 @@ def do_gc_target(
|
||||
):
|
||||
"""Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
|
||||
try:
|
||||
log.info("sending gc http request")
|
||||
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
|
||||
except Exception as e:
|
||||
log.error("do_gc failed: %s", e)
|
||||
finally:
|
||||
log.info("gc http thread returning")
|
||||
|
||||
|
||||
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -48,7 +52,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
]
|
||||
)
|
||||
|
||||
# gc should not try to even start
|
||||
# gc should not try to even start on a timeline that doesn't exist
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException, match="gc target timeline does not exist"
|
||||
):
|
||||
@@ -58,25 +62,24 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
# the error will be printed to the log too
|
||||
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
|
||||
|
||||
# try to concurrently run gc and detach
|
||||
# Detach while running manual GC.
|
||||
# It should wait for manual GC to finish (right now it doesn't that's why this test fails sometimes)
|
||||
pageserver_http.configure_failpoints(
|
||||
("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
|
||||
)
|
||||
gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
|
||||
gc_thread.start()
|
||||
time.sleep(1)
|
||||
# By now the gc task is spawned but in sleep for another second due to the failpoint.
|
||||
|
||||
last_error = None
|
||||
for i in range(3):
|
||||
try:
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
log.error(f"try {i} error detaching tenant: {e}")
|
||||
continue
|
||||
else:
|
||||
break
|
||||
# else is called if the loop finished without reaching "break"
|
||||
else:
|
||||
pytest.fail(f"could not detach tenant: {last_error}")
|
||||
log.info("detaching tenant")
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
log.info("tenant detached without error")
|
||||
|
||||
log.info("wait for gc thread to return")
|
||||
gc_thread.join(timeout=10)
|
||||
assert not gc_thread.is_alive()
|
||||
log.info("gc thread returned")
|
||||
|
||||
# check that nothing is left on disk for deleted tenant
|
||||
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
|
||||
|
||||
Reference in New Issue
Block a user