fix: specify column to search for FTS (#1572)

Previously the `fts_columns` parameter was ignored. Because FTS search
currently supports only a single column, ignoring it could lead to an
error when multiple columns are indexed for FTS.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2024-08-29 23:43:46 +08:00
committed by GitHub
parent bfe8fccfab
commit 1521435193
5 changed files with 136 additions and 21 deletions

View File

@@ -29,14 +29,26 @@ def table(tmp_path) -> ldb.table.LanceTable:
db = ldb.connect(tmp_path)
vectors = [np.random.randn(128) for _ in range(100)]
nouns = ("puppy", "car", "rabbit", "girl", "monkey")
text_nouns = ("puppy", "car")
text2_nouns = ("rabbit", "girl", "monkey")
verbs = ("runs", "hits", "jumps", "drives", "barfs")
adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
adj = ("adorable", "clueless", "dirty", "odd", "stupid")
text = [
" ".join(
[
nouns[random.randrange(0, 5)],
text_nouns[random.randrange(0, len(text_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
]
)
for _ in range(100)
]
text2 = [
" ".join(
[
text2_nouns[random.randrange(0, len(text2_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
@@ -52,7 +64,7 @@ def table(tmp_path) -> ldb.table.LanceTable:
"vector": vectors,
"id": [i % 2 for i in range(100)],
"text": text,
"text2": text,
"text2": text2,
"nested": [{"text": t} for t in text],
"count": count,
}
@@ -66,14 +78,26 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
db = await ldb.connect_async(tmp_path)
vectors = [np.random.randn(128) for _ in range(100)]
nouns = ("puppy", "car", "rabbit", "girl", "monkey")
text_nouns = ("puppy", "car")
text2_nouns = ("rabbit", "girl", "monkey")
verbs = ("runs", "hits", "jumps", "drives", "barfs")
adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
adj = ("adorable", "clueless", "dirty", "odd", "stupid")
text = [
" ".join(
[
nouns[random.randrange(0, 5)],
text_nouns[random.randrange(0, len(text_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
]
)
for _ in range(100)
]
text2 = [
" ".join(
[
text2_nouns[random.randrange(0, len(text2_nouns))],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
@@ -89,7 +113,7 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
"vector": vectors,
"id": [i % 2 for i in range(100)],
"text": text,
"text2": text,
"text2": text2,
"nested": [{"text": t} for t in text],
"count": count,
}
@@ -142,12 +166,81 @@ def test_search_fts(table, use_tantivy):
assert len(results) == 5
def test_search_fts_specify_column(table):
    """Check that native FTS search honours the ``fts_columns`` argument.

    Two native (non-tantivy) FTS indices are created, on ``text`` and
    ``text2``. Searching a single named column must return results, while
    naming several columns, or naming none while two indices exist, is
    expected to raise.
    """
    table.create_fts_index("text", use_tantivy=False)
    table.create_fts_index("text2", use_tantivy=False)

    results = table.search("puppy", fts_columns="text").limit(5).to_list()
    assert len(results) == 5

    results = table.search("rabbit", fts_columns="text2").limit(5).to_list()
    assert len(results) == 5

    # ``pytest.raises`` replaces the ``try / assert False / except Exception``
    # pattern: AssertionError is itself an Exception, so that pattern swallowed
    # its own failure and the check could never fail.
    # NOTE(review): the concrete exception type is not visible from this test;
    # narrow ``Exception`` once it is confirmed.
    with pytest.raises(Exception):
        # we can only specify one column for now
        table.search("puppy", fts_columns=["text", "text2"]).limit(5).to_list()

    with pytest.raises(Exception):
        # have to specify a column because we have two fts indices
        table.search("puppy").limit(5).to_list()
@pytest.mark.asyncio
async def test_search_fts_async(async_table):
    """Run a basic full-text query through the async API.

    An FTS index is created on ``text`` and a limit-5 query for "puppy" is
    expected to return exactly five rows.
    """
    # The fixture is awaited here to obtain the table itself.
    tbl = await async_table
    await tbl.create_index("text", config=FTS())

    text_query = tbl.query().nearest_to_text("puppy").limit(5)
    hits = await text_query.to_list()
    assert len(hits) == 5
@pytest.mark.asyncio
async def test_search_fts_specify_column_async(async_table):
    """Check that the async FTS query honours the ``columns`` argument.

    FTS indices are created on both ``text`` and ``text2``. Querying a single
    named column must return results, while naming several columns, or naming
    none while two indices exist, is expected to raise.
    """
    async_table = await async_table
    await async_table.create_index("text", config=FTS())
    await async_table.create_index("text2", config=FTS())

    results = (
        await async_table.query()
        .nearest_to_text("puppy", columns="text")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5

    results = (
        await async_table.query()
        .nearest_to_text("rabbit", columns="text2")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5

    # ``pytest.raises`` replaces the ``try / assert False / except Exception``
    # pattern: AssertionError is itself an Exception, so that pattern swallowed
    # its own failure and the check could never fail.
    # NOTE(review): the concrete exception type is not visible from this test;
    # narrow ``Exception`` once it is confirmed.
    with pytest.raises(Exception):
        # we can only specify one column for now; pass two columns so the
        # multi-column path is exercised (a single column is a valid query).
        await (
            async_table.query()
            .nearest_to_text("rabbit", columns=["text", "text2"])
            .limit(5)
            .to_list()
        )

    with pytest.raises(Exception):
        # have to specify a column because we have two fts indices
        await async_table.query().nearest_to_text("puppy").limit(5).to_list()
def test_search_ordering_field_index_table(tmp_path, table):
table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
rows = (