# Files
# lancedb/python/python/tests/test_fts.py
#
# 1057 lines
# 32 KiB
# Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import shutil
from unittest import mock
from pathlib import Path
import zipfile
import lancedb as ldb
from lancedb.db import DBConnection
from lancedb.index import FTS
from lancedb.query import (
BoostQuery,
MatchQuery,
MultiMatchQuery,
PhraseQuery,
BooleanQuery,
Occur,
LanceFtsQueryBuilder,
)
import numpy as np
import pyarrow as pa
import pandas as pd
import pytest
import pytest_asyncio
from utils import exception_output
# Directory of language models bundled with the test suite (used by the
# jieba/lindera tokenizer fixtures below).
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    """Build a deterministic 100-row table with two text columns, a nested
    text field, a 128-d vector column, and a numeric ``count`` column."""
    # Seeded local generators so this fixture never perturbs global
    # random state used by other tests.
    rng = np.random.RandomState(42)
    local_random = random.Random(42)
    db = ldb.connect(tmp_path)

    text_nouns = ("puppy", "car")
    text2_nouns = ("rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    def sentence(nouns):
        # noun + verb + adverb + adjective, each drawn uniformly.
        # Random.choice(seq) consumes the same randomness as
        # seq[randrange(0, len(seq))], so the generated data is unchanged.
        return " ".join(
            local_random.choice(words) for words in (nouns, verbs, adv, adj)
        )

    vectors = [rng.randn(128) for _ in range(100)]
    text = [sentence(text_nouns) for _ in range(100)]
    text2 = [sentence(text2_nouns) for _ in range(100)]
    count = [local_random.randint(1, 10000) for _ in range(100)]
    frame = pd.DataFrame(
        {
            "vector": vectors,
            "id": [i % 2 for i in range(100)],
            "text": text,
            "text2": text2,
            "nested": [{"text": t} for t in text],
            "count": count,
        }
    )
    return db.create_table("test", data=frame)
@pytest.fixture
def language_model_home(monkeypatch, tmp_path):
    """Copy the bundled test language models into a temp directory and
    point ``LANCE_LANGUAGE_MODEL_HOME`` at the copy."""
    destination = tmp_path / "language-models"
    shutil.copytree(TEST_LANGUAGE_MODEL_HOME, destination)
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(destination))
    return destination
@pytest.fixture
def lindera_ipadic(language_model_home):
    """Unpack the bundled Lindera ipadic dictionary and write a config.yml
    pointing at it; remove the extracted files again on teardown."""
    model_path = language_model_home / "lindera" / "ipadic"
    extracted_model = model_path / "main"
    config_path = model_path / "config.yml"
    # Start from a clean slate in case a previous run left files behind.
    if extracted_model.exists():
        shutil.rmtree(extracted_model)
    with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
        zip_ref.extractall(model_path)
    # NOTE(review): the YAML nesting depends on the leading spaces inside
    # these string literals — verify they match the tokenizer's expected
    # config layout (mode/dictionary under segmenter, path under dictionary).
    config_path.write_text(
        "segmenter:\n"
        ' mode: "normal"\n'
        " dictionary:\n"
        f' path: "{extracted_model.resolve().as_posix()}"\n',
        encoding="utf-8",
    )
    try:
        yield
    finally:
        # Always clean up, even if the test using the fixture failed.
        if extracted_model.exists():
            shutil.rmtree(extracted_model)
@pytest_asyncio.fixture
async def async_table(tmp_path) -> ldb.table.AsyncTable:
    """Async twin of the ``table`` fixture: same deterministic 100-row data
    created through the async connection API."""
    # Seeded local generators so this fixture never perturbs global
    # random state used by other tests.
    rng = np.random.RandomState(42)
    local_random = random.Random(42)
    db = await ldb.connect_async(tmp_path)

    text_nouns = ("puppy", "car")
    text2_nouns = ("rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    def sentence(nouns):
        # noun + verb + adverb + adjective, each drawn uniformly.
        # Random.choice(seq) consumes the same randomness as
        # seq[randrange(0, len(seq))], so the generated data is unchanged.
        return " ".join(
            local_random.choice(words) for words in (nouns, verbs, adv, adj)
        )

    vectors = [rng.randn(128) for _ in range(100)]
    text = [sentence(text_nouns) for _ in range(100)]
    text2 = [sentence(text2_nouns) for _ in range(100)]
    count = [local_random.randint(1, 10000) for _ in range(100)]
    frame = pd.DataFrame(
        {
            "vector": vectors,
            "id": [i % 2 for i in range(100)],
            "text": text,
            "text2": text2,
            "nested": [{"text": t} for t in text],
            "count": count,
        }
    )
    return await db.create_table("test", data=frame)
@pytest.mark.parametrize(
    ("kwargs", "match"),
    [
        # Each removed tantivy-era keyword must be rejected with a
        # dedicated, descriptive error message.
        pytest.param(
            {"use_tantivy": True},
            "Tantivy-based FTS has been removed",
        ),
        pytest.param(
            {"ordering_field_names": ["count"]},
            "ordering_field_names was only supported",
        ),
        pytest.param(
            {"writer_heap_size": 128},
            "writer_heap_size was only supported",
        ),
    ],
)
def test_reject_removed_tantivy_parameters(table, kwargs, match):
    """Removed tantivy-only parameters raise ValueError with a clear message."""
    with pytest.raises(ValueError, match=match):
        table.create_fts_index("text", **kwargs)
def test_reject_legacy_tantivy_index(table):
    """A leftover legacy tantivy index directory must block both searching
    and creating a new FTS index."""
    index_dir, _, _ = table._get_fts_index_path()
    os.makedirs(index_dir, exist_ok=True)

    expected = "Legacy Tantivy FTS index detected"
    with pytest.raises(ValueError, match=expected):
        table.search("puppy").limit(5).to_list()
    with pytest.raises(ValueError, match=expected):
        table.create_fts_index("text")
@pytest.mark.parametrize("with_position", [True, False])
def test_create_inverted_index(table, with_position):
    """An FTS index created with a custom name must appear in list_indices,
    regardless of whether positions are recorded."""
    table.create_fts_index(
        "text",
        with_position=with_position,
        name="custom_fts_index",
    )
    fts_names = [
        idx.name for idx in table.list_indices() if idx.index_type == "FTS"
    ]
    assert "custom_fts_index" in fts_names
def test_search_fts(table):
    """Exercise plain, structured, boost, multi-match and boolean FTS queries."""
    table.create_fts_index("text")

    def run(query):
        # Shared query shape: project id+text, cap at 5 hits.
        return table.search(query).select(["id", "text"]).limit(5).to_list()

    results = run("puppy")
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # With no explicit limit the default of 10 applies.
    results = table.search("puppy").select(["id", "text"]).to_list()
    assert len(results) == 10

    # Structured match query.
    assert len(run(MatchQuery("puppy", "text"))) == 5

    # Boost query: prefer "puppy" hits, demote "runs" hits.
    boosted = BoostQuery(
        MatchQuery("puppy", "text"),
        MatchQuery("runs", "text"),
    )
    assert len(run(boosted)) == 5

    # Multi-match across both indexed text columns.
    table.create_fts_index("text2")
    results = run(MultiMatchQuery("puppy", ["text", "text2"]))
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # Boolean AND query: every hit must contain both terms.
    results = run(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score
    for row in results:
        assert "puppy" in row["text"]
        assert "runs" in row["text"]
@pytest.mark.asyncio
async def test_fts_select_async(async_table):
    """select() must work with every FTS query form on the async API."""
    tbl = async_table
    await tbl.create_index("text", config=FTS())
    await tbl.create_index("text2", config=FTS())

    async def run(query):
        # Shared query shape: project id+text, cap at 5 hits.
        return (
            await tbl.query()
            .nearest_to_text(query)
            .select(["id", "text"])
            .limit(5)
            .to_list()
        )

    query_forms = (
        "puppy",  # plain string
        MatchQuery("puppy", "text"),  # structured match
        BoostQuery(  # boost
            MatchQuery("puppy", "text"),
            MatchQuery("runs", "text"),
        ),
        MultiMatchQuery("puppy", ["text", "text2"]),  # multi-match
    )
    for query in query_forms:
        results = await run(query)
        assert len(results) == 5
        assert len(results[0]) == 3  # id, text, _score

    # The search() entry point must behave the same way.
    results = (
        await (await tbl.search(MatchQuery("puppy", "text")))
        .select(["id", "text"])
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score
def test_search_fts_phrase_query(table):
    """Phrase queries require an index built with positions recorded."""
    table.create_fts_index("text", with_position=False)
    # Without positions a phrase query must fail.  (The previous
    # try/`assert False`/except Exception pattern swallowed the
    # AssertionError — AssertionError is an Exception — so this branch
    # could never actually fail the test.)
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    table.create_fts_index("text", with_position=True, replace=True)
    results = table.search("puppy").limit(100).to_list()

    # Quoted strings are parsed as phrase queries.
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Explicit .phrase_query() on the builder.
    phrase_results = table.search("puppy runs").phrase_query().limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Structured PhraseQuery object.
    phrase_results = (
        table.search(PhraseQuery("puppy runs", "text")).limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    """Async: phrase queries require an index built with positions."""
    await async_table.create_index("text", config=FTS(with_position=False))
    # Without positions a phrase query must fail.  The previous
    # try/`assert False`/except Exception pattern swallowed the
    # AssertionError so it could never fail; it also passed the unquoted
    # string "puppy runs" (a plain match query, not a phrase query).
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()

    await async_table.create_index("text", config=FTS(with_position=True))
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()

    # Quoted strings are parsed as phrase queries.
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Structured PhraseQuery object.
    phrase_results = (
        await async_table.query()
        .nearest_to_text(PhraseQuery("puppy runs", "text"))
        .limit(100)
        .to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
def test_search_fts_specify_column(table):
    """With two FTS indices the caller must pick exactly one column."""
    table.create_fts_index("text")
    table.create_fts_index("text2")

    results = table.search("puppy", fts_columns="text").limit(5).to_list()
    assert len(results) == 5
    results = table.search("rabbit", fts_columns="text2").limit(5).to_list()
    assert len(results) == 5

    # Only a single column may be specified for now.  (The previous
    # try/`assert False`/except Exception pattern swallowed the
    # AssertionError, so this check could never actually fail.)
    with pytest.raises(Exception):
        table.search("puppy", fts_columns=["text", "text2"]).limit(5).to_list()

    # A column is required because two FTS indices exist.
    with pytest.raises(Exception):
        table.search("puppy").limit(5).to_list()
@pytest.mark.asyncio
async def test_search_fts_async(async_table):
    """Async FTS search with pre- and post-filtering on a numeric column."""
    await async_table.create_index("text", config=FTS())

    results = await async_table.query().nearest_to_text("puppy").limit(5).to_list()
    assert len(results) == 5

    # Ground truth: rows that satisfy both the filter and the text match,
    # capped by the query limit.
    matching = await async_table.count_rows(
        "count > 5000 and contains(text, 'puppy')"
    )
    expected_count = min(matching, 10)

    # Pre-filter (the default): the filter is applied before the search.
    prefiltered = await (
        async_table.query()
        .nearest_to_text("puppy")
        .where("count > 5000")
        .limit(10)
        .to_list()
    )
    assert len(prefiltered) == expected_count

    # Post-filter: filtering after the search may drop hits, so the count
    # can only be smaller or equal.
    postfiltered = await (
        async_table.query()
        .nearest_to_text("puppy")
        .where("count > 5000")
        .limit(10)
        .postfilter()
        .to_list()
    )
    assert len(postfiltered) <= expected_count
@pytest.mark.asyncio
async def test_search_fts_specify_column_async(async_table):
    """Async: with two FTS indices the caller must pick exactly one column."""
    await async_table.create_index("text", config=FTS())
    await async_table.create_index("text2", config=FTS())

    results = (
        await async_table.query()
        .nearest_to_text("puppy", columns="text")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    results = (
        await async_table.query()
        .nearest_to_text("rabbit", columns="text2")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5

    # Only a single column may be specified for now.  The original test
    # repeated the valid single-column query here (so nothing raised) and
    # its try/`assert False`/except Exception pattern swallowed the
    # resulting AssertionError — the multi-column error path was never
    # exercised.  Pass two columns and require the failure explicitly.
    with pytest.raises(Exception):
        await (
            async_table.query()
            .nearest_to_text("rabbit", columns=["text", "text2"])
            .limit(5)
            .to_list()
        )

    # A column is required because two FTS indices exist.
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text("puppy").limit(5).to_list()
def test_create_index_from_table(tmp_path, table):
    """The FTS index searches, tolerates new rows, and supports replace=True."""
    table.create_fts_index("text")
    df = table.search("puppy").limit(5).select(["text"]).to_pandas()
    assert len(df) <= 5
    assert "text" in df.columns

    # Add a row that is not yet covered by the index.
    new_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": "gorilla",
        "text2": "gorilla",
        "nested": {"text": "gorilla"},
        "count": 10,
    }
    table.add([new_row])

    # Re-creating without replace must fail; replace=True rebuilds the
    # index so the new row becomes searchable.
    with pytest.raises(Exception, match="already exists"):
        table.create_fts_index("text")
    table.create_fts_index("text", replace=True)
    assert len(table.search("gorilla").limit(1).to_pandas()) == 1
def test_create_index_multiple_columns(tmp_path, table):
    """Passing a list of columns to create_fts_index is rejected."""
    expected_error = "Native FTS indexes can only be created"
    with pytest.raises(ValueError, match=expected_error):
        table.create_fts_index(["text", "text2"])
def test_nested_schema(tmp_path, table):
    """FTS indices can only be built on top-level (non-nested) fields."""
    expected_error = "top-level fields"
    with pytest.raises(ValueError, match=expected_error):
        table.create_fts_index("nested.text")
def test_search_index_with_filter(table):
    """Filtered FTS search must give identical results with and without duckdb."""
    table.create_fts_index("text")
    real_import = __import__

    def no_duckdb(name, *args):
        # Simulate an environment where duckdb is not installed.
        if name == "duckdb":
            raise ImportError
        return real_import(name, *args)

    # Without duckdb.
    with mock.patch("builtins.__import__", side_effect=no_duckdb):
        builder = table.search("puppy").where("id=1").limit(10)
        # The result schema equals the table schema once _score is dropped.
        assert builder.to_arrow().drop("_score").schema.equals(table.schema)
        rows_without_duckdb = builder.to_list()
        for row in rows_without_duckdb:
            assert row["id"] == 1

    # With duckdb available the results must be identical.
    rows_with_duckdb = table.search("puppy").where("id=1").limit(10).to_list()
    for row in rows_with_duckdb:
        assert row["id"] == 1
    assert rows_without_duckdb == rows_with_duckdb

    # Row ids can be requested alongside the filter.
    rows = table.search("puppy").where("id=1").with_row_id(True).limit(10).to_list()
    for row in rows:
        assert row["id"] == 1
        assert row["_rowid"] is not None
def test_null_input(table):
    """Index creation must tolerate rows whose text columns are NULL."""
    null_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": None,
        "text2": None,
        "nested": {"text": None},
        "count": 7,
    }
    table.add([null_row])
    # Should not raise despite the NULL text value.
    table.create_fts_index("text")
def test_syntax(table):
    """FTS query-syntax edge cases (regression for lancedb#769).

    None of these searches should raise; results are deliberately not
    asserted.
    """
    table.create_fts_index("text")
    # A trailing operator must not break parsing.
    table.search("they could have been dogs OR").limit(10).to_list()

    # Terms queries with quoting and grouping.
    table.search('"they could have been dogs" OR cats').limit(10).to_list()
    table.search("(they AND could) OR (have AND been AND dogs) OR cats").limit(
        10
    ).to_list()

    # Phrase queries need positions in the index.
    table.create_fts_index("text", with_position=True, replace=True)
    table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
    table.search('"they could have been dogs OR cats"').limit(10).to_list()
    table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
        10
    ).to_list()
    # Embedded double quotes inside an explicit phrase query.
    # (An accidental byte-for-byte duplicate of this statement was removed.)
    table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
        10
    ).to_list()
def test_language(mem_db: DBConnection):
    """French analyzer: stemming, ascii folding and stop-word removal."""
    sentences = [
        "Il n'y a que trois routes qui traversent la ville.",
        "Je veux prendre la route vers l'est.",
        "Je te retrouve au café au bout de la route.",
    ]
    table = mem_db.create_table("test", data=[{"text": s} for s in sentences])

    # Unsupported languages are rejected with the full supported list.
    with pytest.raises(ValueError) as e:
        table.create_fts_index("text", language="klingon")
    assert exception_output(e) == (
        "ValueError: LanceDB does not support the requested language: 'klingon'\n"
        "Supported languages: Arabic, Danish, Dutch, English, Finnish, French, "
        "German, Greek, Hungarian, Italian, Norwegian, Portuguese, Romanian, "
        "Russian, Spanish, Swedish, Tamil, Turkish"
    )

    table.create_fts_index(
        "text",
        language="French",
        stem=True,
        ascii_folding=True,
        remove_stop_words=True,
    )

    def hits(term):
        return table.search(term, query_type="fts").limit(5).to_list()

    # Stemming maps "routes" and "route" to the same root.
    assert len(hits("route")) == 3
    # Ascii folding lets "cafe" match "café".
    assert len(hits("cafe")) == 1
    # "la" is a stop word, so it matches nothing.
    assert len(hits("la")) == 0
def test_fts_on_list(mem_db: DBConnection):
    """FTS over list-of-string columns, including phrase matching."""
    data = pa.table(
        {
            "text": [
                ["lance database", "the", "search"],
                ["lance database"],
                ["lance", "search"],
                ["database", "search"],
                ["unrelated", "doc"],
            ],
            "vector": [
                [1.0, 2.0, 3.0],
                [4.0, 5.0, 6.0],
                [7.0, 8.0, 9.0],
                [10.0, 11.0, 12.0],
                [13.0, 14.0, 15.0],
            ],
        }
    )
    table = mem_db.create_table("test", data=data)
    # Positions are required for the phrase query below.
    table.create_fts_index("text", with_position=True)

    # Term match: three rows contain a "lance" token.
    assert len(table.search("lance").limit(5).to_list()) == 3
    # Phrase match: only two rows contain "lance database" as a phrase.
    phrase_hits = (
        table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
    )
    assert len(phrase_hits) == 2
def test_fts_ngram(mem_db: DBConnection):
    """Ngram tokenizer defaults plus the ngram_min_length/prefix_only knobs."""
    data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
    table = mem_db.create_table("test", data=data)

    def texts(term):
        # The set of document texts matched by an fts search for `term`.
        rows = table.search(term, query_type="fts").limit(10).to_list()
        return set(row["text"] for row in rows)

    table.create_fts_index("text", base_tokenizer="ngram")
    assert texts("lan") == {"lance database", "lance is cool"}
    assert texts("nce") == {"lance database", "lance is cool"}  # spellchecker:disable-line
    # The default min_ngram_length is 3, so a 2-char term matches nothing.
    assert texts("la") == set()

    # Rebuild with shorter ngrams but prefixes only.
    table.create_fts_index(
        "text",
        base_tokenizer="ngram",
        replace=True,
        ngram_min_length=2,
        prefix_only=True,
    )
    assert texts("lan") == {"lance database", "lance is cool"}
    # "nce" is not a prefix of any word, so it no longer matches.
    assert texts("nce") == set()  # spellchecker:disable-line
    # "la" is now long enough and is a prefix of "lance".
    assert texts("la") == {"lance database", "lance is cool"}
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
    """Jieba tokenization: the query term matches only the row containing it."""
    data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
    table = mem_db.create_table("test_jieba", data=data)
    # Disable the latin-oriented filters; only tokenization is under test.
    table.create_fts_index(
        "text",
        base_tokenizer="jieba/default",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )
    hits = table.search("我们", query_type="fts").limit(10).to_list()
    assert [row["text"] for row in hits] == ["我们都有光明的前途"]
def test_fts_jieba_missing_language_model_note(
    mem_db: DBConnection, monkeypatch, tmp_path
):
    """A missing model home must produce an actionable error message."""
    missing_root = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
    table = mem_db.create_table(
        "test_missing_jieba_model",
        data=pa.table({"text": ["我们都有光明的前途"]}),
    )
    with pytest.raises((ValueError, RuntimeError)) as e:
        table.create_fts_index(
            "text",
            base_tokenizer="jieba/default",
            stem=False,
            remove_stop_words=False,
            ascii_folding=False,
        )
    message = exception_output(e)
    # The error must name the bad path, the env var, and the tokenizer.
    for fragment in (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    ):
        assert fragment in message
@pytest.mark.asyncio
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
    """Async: a missing model home must produce an actionable error message."""
    missing_root = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
    db = await ldb.connect_async(tmp_path / "async-db")
    table = await db.create_table(
        "test_missing_jieba_model_async",
        data=pa.table({"text": ["我们都有光明的前途"]}),
    )
    with pytest.raises((ValueError, RuntimeError)) as e:
        await table.create_index(
            "text",
            config=FTS(
                base_tokenizer="jieba/default",
                stem=False,
                remove_stop_words=False,
                ascii_folding=False,
            ),
        )
    message = exception_output(e)
    # The error must name the bad path, the env var, and the tokenizer.
    for fragment in (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    ):
        assert fragment in message
def test_fts_lindera_tokenizer(
    mem_db: DBConnection, language_model_home, lindera_ipadic
):
    """Lindera/ipadic tokenization: 成田 matches only 成田国際空港."""
    data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
    table = mem_db.create_table("test_lindera", data=data)
    # Disable the latin-oriented filters; only tokenization is under test.
    table.create_fts_index(
        "text",
        base_tokenizer="lindera/ipadic",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )
    hits = table.search("成田", query_type="fts").limit(10).to_list()
    assert [row["text"] for row in hits] == ["成田国際空港"]
def test_fts_query_to_json():
    """Test that FTS query to_json() produces valid JSON strings with exact format."""
    # NOTE: every expectation below is an exact string comparison on purpose:
    # it pins the serialized key order and the default values (boost=1.0,
    # fuzziness=0, max_expansions=50, operator="Or", prefix_length=0) of each
    # query type's JSON wire format.
    # Test MatchQuery - basic
    match_query = MatchQuery("hello world", "text")
    json_str = match_query.to_json()
    expected = (
        '{"match":{"column":"text","terms":"hello world","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}'
    )
    assert json_str == expected
    # Test MatchQuery with options
    match_query = MatchQuery("puppy", "text", fuzziness=2, boost=1.5, prefix_length=3)
    json_str = match_query.to_json()
    expected = (
        '{"match":{"column":"text","terms":"puppy","boost":1.5,"fuzziness":2,'
        '"max_expansions":50,"operator":"Or","prefix_length":3}}'
    )
    assert json_str == expected
    # Test PhraseQuery
    phrase_query = PhraseQuery("quick brown fox", "title")
    json_str = phrase_query.to_json()
    expected = '{"phrase":{"column":"title","terms":"quick brown fox","slop":0}}'
    assert json_str == expected
    # Test PhraseQuery with slop
    phrase_query = PhraseQuery("quick brown", "title", slop=2)
    json_str = phrase_query.to_json()
    expected = '{"phrase":{"column":"title","terms":"quick brown","slop":2}}'
    assert json_str == expected
    # Test BooleanQuery with MUST
    must_query = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("puppy", "text")),
            (Occur.MUST, MatchQuery("runs", "text")),
        ]
    )
    json_str = must_query.to_json()
    expected = (
        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}},{"match":{"column":"text","terms":"runs","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}],'
        '"must_not":[]}}'
    )
    assert json_str == expected
    # Test BooleanQuery with SHOULD
    should_query = BooleanQuery(
        [
            (Occur.SHOULD, MatchQuery("cat", "text")),
            (Occur.SHOULD, MatchQuery("dog", "text")),
        ]
    )
    json_str = should_query.to_json()
    expected = (
        '{"boolean":{"should":[{"match":{"column":"text","terms":"cat","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}},'
        '{"match":{"column":"text","terms":"dog","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must":[],'
        '"must_not":[]}}'
    )
    assert json_str == expected
    # Test BooleanQuery with MUST_NOT
    must_not_query = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("puppy", "text")),
            (Occur.MUST_NOT, MatchQuery("training", "text")),
        ]
    )
    json_str = must_not_query.to_json()
    expected = (
        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}}],"must_not":[{"match":{"column":"text",'
        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}}]}}'
    )
    assert json_str == expected
    # Test BoostQuery
    positive = MatchQuery("puppy", "text")
    negative = MatchQuery("training", "text")
    boost_query = BoostQuery(positive, negative, negative_boost=0.3)
    json_str = boost_query.to_json()
    expected = (
        '{"boost":{"positive":{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}},"negative":{"match":{"column":"text",'
        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}},"negative_boost":0.3}}'
    )
    assert json_str == expected
    # Test MultiMatchQuery
    multi_match = MultiMatchQuery("python", ["tags", "title"])
    json_str = multi_match.to_json()
    expected = (
        '{"multi_match":{"query":"python","columns":["tags","title"],'
        '"boost":[1.0,1.0]}}'
    )
    assert json_str == expected
    # Test complex nested BooleanQuery
    # (inner boolean queries are serialized recursively inside "should")
    inner1 = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("python", "tags")),
            (Occur.MUST, MatchQuery("tutorial", "title")),
        ]
    )
    inner2 = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("rust", "tags")),
            (Occur.MUST, MatchQuery("guide", "title")),
        ]
    )
    complex_query = BooleanQuery(
        [
            (Occur.SHOULD, inner1),
            (Occur.SHOULD, inner2),
        ]
    )
    json_str = complex_query.to_json()
    expected = (
        '{"boolean":{"should":[{"boolean":{"should":[],"must":[{"match":'
        '{"column":"tags","terms":"python","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}},{"match":'
        '{"column":"title","terms":"tutorial","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must_not":[]}}'
        ',{"boolean":{"should":[],"must":[{"match":{"column":"tags",'
        '"terms":"rust","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}},{"match":{"column":"title",'
        '"terms":"guide","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}}],"must_not":[]}}],"must":[],'
        '"must_not":[]}}'
    )
    assert json_str == expected
def test_fts_fast_search(table):
    """fast_search skips unindexed rows until the index is optimized."""
    table.create_fts_index("text")
    # This row is added after indexing, so it is initially unindexed.
    table.add(
        [
            {
                "text": "xyz",
                "vector": [0 for _ in range(128)],
                "id": 101,
                "text2": "xyz",
                "nested": {"text": "xyz"},
                "count": 10,
            }
        ]
    )

    # fast_search is unset by default...
    query = table.search("xyz", query_type="fts").limit(10).to_query_object()
    assert query.fast_search is None
    # ...and set when requested on the builder.
    query = (
        table.search("xyz", query_type="fts").fast_search().limit(10).to_query_object()
    )
    assert query.fast_search is True

    # It chains with other builder methods.
    builder = (
        table.search("xyz", query_type="fts").fast_search().select(["text"]).limit(5)
    )
    query = builder.to_query_object()
    assert query.fast_search is True
    assert query.limit == 5
    assert query.columns == ["text"]

    # It can also be enabled via the constructor keyword.
    query = LanceFtsQueryBuilder(table, "xyz", fast_search=True).to_query_object()
    assert query.fast_search is True

    # Executes cleanly and skips the not-yet-indexed row...
    hits = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
    assert len(hits) == 0
    # ...until optimize() folds new data into the index.
    table.optimize()
    hits = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
    assert len(hits) > 0
@pytest.mark.asyncio
async def test_fts_fast_search_async(async_table):
    """Async fast_search skips unindexed rows until optimize() runs."""
    await async_table.create_index("text", config=FTS())
    # This row is added after indexing, so it is initially unindexed.
    await async_table.add(
        [
            {
                "text": "xyz",
                "vector": [0 for _ in range(128)],
                "id": 101,
                "text2": "xyz",
                "nested": {"text": "xyz"},
                "count": 10,
            }
        ]
    )

    # Without fast_search the unindexed row is still found.
    results = await async_table.query().nearest_to_text("xyz").limit(5).to_list()
    assert len(results) > 0

    # With fast_search the not-yet-indexed row is skipped...
    fast_results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .limit(5)
        .to_list()
    )
    assert len(fast_results) == 0

    # ...until optimize() folds new data into the index.
    await async_table.optimize()
    fast_results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .limit(5)
        .to_list()
    )
    assert len(fast_results) > 0

    # fast_search chains with other builder methods.
    results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .select(["text"])
        .limit(5)
        .to_list()
    )
    assert len(results) > 0