fix: use local random state in FTS test fixtures to prevent flaky failures (#2532)

## Summary
Fixes intermittent CI failures in `test_search_fts[False]` where boolean
FTS queries were returning fewer results than expected due to
non-deterministic test data generation.

## Problem
The test was using global `random` and `np.random` without seeding,
causing the boolean query `MatchQuery("puppy", "text") &
MatchQuery("runs", "text")` to sometimes return only 3 results instead
of the expected 5, leading to `AssertionError: assert 3 == 5`.

## Solution
- Replace global random calls with local `random.Random(42)` and
`np.random.RandomState(42)` objects in test fixtures
- Ensure deterministic test data while maintaining test isolation
- No impact on other tests since random state is scoped to fixtures only

## Test Results
- `test_search_fts[False]` now passes consistently
- All other FTS tests continue to pass
- No regression in other test suites (verified with `test_basic`)
- Maintains existing test behavior and coverage
This commit is contained in:
Tristan Zajonc
2025-07-24 11:30:02 -07:00
committed by GitHub
parent c2aa03615a
commit 81afd8a42f

View File

@@ -33,8 +33,11 @@ tantivy = pytest.importorskip("tantivy")
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
# Use local random state to avoid affecting other tests
rng = np.random.RandomState(42)
local_random = random.Random(42)
db = ldb.connect(tmp_path)
vectors = [np.random.randn(128) for _ in range(100)]
vectors = [rng.randn(128) for _ in range(100)]
text_nouns = ("puppy", "car")
text2_nouns = ("rabbit", "girl", "monkey")
@@ -44,10 +47,10 @@ def table(tmp_path) -> ldb.table.LanceTable:
text = [
" ".join(
[
text_nouns[random.randrange(0, len(text_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
text_nouns[local_random.randrange(0, len(text_nouns))],
verbs[local_random.randrange(0, 5)],
adv[local_random.randrange(0, 5)],
adj[local_random.randrange(0, 5)],
]
)
for _ in range(100)
@@ -55,15 +58,15 @@ def table(tmp_path) -> ldb.table.LanceTable:
text2 = [
" ".join(
[
text2_nouns[random.randrange(0, len(text2_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
text2_nouns[local_random.randrange(0, len(text2_nouns))],
verbs[local_random.randrange(0, 5)],
adv[local_random.randrange(0, 5)],
adj[local_random.randrange(0, 5)],
]
)
for _ in range(100)
]
count = [random.randint(1, 10000) for _ in range(100)]
count = [local_random.randint(1, 10000) for _ in range(100)]
table = db.create_table(
"test",
data=pd.DataFrame(
@@ -82,8 +85,11 @@ def table(tmp_path) -> ldb.table.LanceTable:
@pytest.fixture
async def async_table(tmp_path) -> ldb.table.AsyncTable:
# Use local random state to avoid affecting other tests
rng = np.random.RandomState(42)
local_random = random.Random(42)
db = await ldb.connect_async(tmp_path)
vectors = [np.random.randn(128) for _ in range(100)]
vectors = [rng.randn(128) for _ in range(100)]
text_nouns = ("puppy", "car")
text2_nouns = ("rabbit", "girl", "monkey")
@@ -93,10 +99,10 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
text = [
" ".join(
[
text_nouns[random.randrange(0, len(text_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
text_nouns[local_random.randrange(0, len(text_nouns))],
verbs[local_random.randrange(0, 5)],
adv[local_random.randrange(0, 5)],
adj[local_random.randrange(0, 5)],
]
)
for _ in range(100)
@@ -104,15 +110,15 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
text2 = [
" ".join(
[
text2_nouns[random.randrange(0, len(text2_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
text2_nouns[local_random.randrange(0, len(text2_nouns))],
verbs[local_random.randrange(0, 5)],
adv[local_random.randrange(0, 5)],
adj[local_random.randrange(0, 5)],
]
)
for _ in range(100)
]
count = [random.randint(1, 10000) for _ in range(100)]
count = [local_random.randint(1, 10000) for _ in range(100)]
table = await db.create_table(
"test",
data=pd.DataFrame(