# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
#
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
from unittest import mock

import lancedb as ldb
from lancedb.db import DBConnection
from lancedb.index import FTS
from lancedb.query import (
    BoostQuery,
    MatchQuery,
    MultiMatchQuery,
    PhraseQuery,
    BooleanQuery,
    Occur,
    LanceFtsQueryBuilder,
)
import numpy as np
import pyarrow as pa
import pandas as pd
import pytest
import pytest_asyncio
from utils import exception_output

pytest.importorskip("lancedb.fts")
tantivy = pytest.importorskip("tantivy")


@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    # Use local random state to avoid affecting other tests
    rng = np.random.RandomState(42)
    local_random = random.Random(42)

    db = ldb.connect(tmp_path)
    vectors = [rng.randn(128) for _ in range(100)]

    text_nouns = ("puppy", "car")
    text2_nouns = ("rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    text = [
        " ".join(
            [
                text_nouns[local_random.randrange(0, len(text_nouns))],
                verbs[local_random.randrange(0, 5)],
                adv[local_random.randrange(0, 5)],
                adj[local_random.randrange(0, 5)],
            ]
        )
        for _ in range(100)
    ]
    text2 = [
        " ".join(
            [
                text2_nouns[local_random.randrange(0, len(text2_nouns))],
                verbs[local_random.randrange(0, 5)],
                adv[local_random.randrange(0, 5)],
                adj[local_random.randrange(0, 5)],
            ]
        )
        for _ in range(100)
    ]
    count = [local_random.randint(1, 10000) for _ in range(100)]
    table = db.create_table(
        "test",
        data=pd.DataFrame(
            {
                "vector": vectors,
                "id": [i % 2 for i in range(100)],
                "text": text,
                "text2": text2,
                "nested": [{"text": t} for t in text],
                "count": count,
            }
        ),
    )
    return table


@pytest_asyncio.fixture
async def async_table(tmp_path) -> ldb.table.AsyncTable:
    # Use local random state to avoid affecting other tests
    rng = np.random.RandomState(42)
    local_random = random.Random(42)

    db = await ldb.connect_async(tmp_path)
    vectors = [rng.randn(128) for _ in range(100)]

    text_nouns = ("puppy", "car")
    text2_nouns = ("rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    text = [
        " ".join(
            [
                text_nouns[local_random.randrange(0, len(text_nouns))],
                verbs[local_random.randrange(0, 5)],
                adv[local_random.randrange(0, 5)],
                adj[local_random.randrange(0, 5)],
            ]
        )
        for _ in range(100)
    ]
    text2 = [
        " ".join(
            [
                text2_nouns[local_random.randrange(0, len(text2_nouns))],
                verbs[local_random.randrange(0, 5)],
                adv[local_random.randrange(0, 5)],
                adj[local_random.randrange(0, 5)],
            ]
        )
        for _ in range(100)
    ]
    count = [local_random.randint(1, 10000) for _ in range(100)]
    table = await db.create_table(
        "test",
        data=pd.DataFrame(
            {
                "vector": vectors,
                "id": [i % 2 for i in range(100)],
                "text": text,
                "text2": text2,
                "nested": [{"text": t} for t in text],
                "count": count,
            }
        ),
    )
    return table
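

# The tests below exercise the tantivy-backed helpers in ``lancedb.fts``
# (hence the ``pytest.importorskip("tantivy")`` above); the native Lance FTS
# path is covered by the ``use_tantivy=False`` parametrizations further down.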


def test_create_index(tmp_path):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert isinstance(index, tantivy.Index)
    assert os.path.exists(str(tmp_path / "index"))


def test_create_index_with_stemming(tmp_path, table):
    index = ldb.fts.create_index(
        str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
    )
    assert isinstance(index, tantivy.Index)
    assert os.path.exists(str(tmp_path / "index"))

    # Check stemming by running tokenizer on non empty table
    table.create_fts_index("text", tokenizer_name="en_stem", use_tantivy=True)


@pytest.mark.parametrize("use_tantivy", [True, False])
@pytest.mark.parametrize("with_position", [True, False])
def test_create_inverted_index(table, use_tantivy, with_position):
    if use_tantivy and not with_position:
        pytest.skip("we don't support building a tantivy index without position")
    table.create_fts_index(
        "text",
        use_tantivy=use_tantivy,
        with_position=with_position,
        name="custom_fts_index",
    )
    if not use_tantivy:
        indices = table.list_indices()
        fts_indices = [i for i in indices if i.index_type == "FTS"]
        assert any(i.name == "custom_fts_index" for i in fts_indices)


def test_populate_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert ldb.fts.populate_index(index, table, ["text"]) == len(table)


def test_search_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    ldb.fts.populate_index(index, table, ["text"])
    index.reload()
    results = ldb.fts.search_index(index, query="puppy", limit=5)
    assert len(results) == 2
    assert len(results[0]) == 5  # row_ids
    assert len(results[1]) == 5  # _score


@pytest.mark.parametrize("use_tantivy", [True, False])
def test_search_fts(table, use_tantivy):
    table.create_fts_index("text", use_tantivy=use_tantivy)
    results = table.search("puppy").select(["id", "text"]).limit(5).to_list()
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # Default limit of 10
    results = table.search("puppy").select(["id", "text"]).to_list()
    assert len(results) == 10

    if not use_tantivy:
        # Test with a query
        results = (
            table.search(MatchQuery("puppy", "text"))
            .select(["id", "text"])
            .limit(5)
            .to_list()
        )
        assert len(results) == 5

        # Test boost query
        results = (
            table.search(
                BoostQuery(
                    MatchQuery("puppy", "text"),
                    MatchQuery("runs", "text"),
                )
            )
            .select(["id", "text"])
            .limit(5)
            .to_list()
        )
        assert len(results) == 5

        # Test multi match query
        table.create_fts_index("text2", use_tantivy=use_tantivy)
        results = (
            table.search(MultiMatchQuery("puppy", ["text", "text2"]))
            .select(["id", "text"])
            .limit(5)
            .to_list()
        )
        assert len(results) == 5
        assert len(results[0]) == 3  # id, text, _score

        # Test boolean query
        results = (
            table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
            .select(["id", "text"])
            .limit(5)
            .to_list()
        )
        assert len(results) == 5
        assert len(results[0]) == 3  # id, text, _score
        for r in results:
            assert "puppy" in r["text"]
            assert "runs" in r["text"]


@pytest.mark.asyncio
async def test_fts_select_async(async_table):
    tbl = async_table
    await tbl.create_index("text", config=FTS())
    await tbl.create_index("text2", config=FTS())
    results = (
        await tbl.query()
        .nearest_to_text("puppy")
        .select(["id", "text"])
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # Test with FullTextQuery
    results = (
        await tbl.query()
        .nearest_to_text(MatchQuery("puppy", "text"))
        .select(["id", "text"])
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score
    # Test with BoostQuery
    results = (
        await tbl.query()
        .nearest_to_text(
            BoostQuery(
                MatchQuery("puppy", "text"),
                MatchQuery("runs", "text"),
            )
        )
        .select(["id", "text"])
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # Test with MultiMatchQuery
    results = (
        await tbl.query()
        .nearest_to_text(MultiMatchQuery("puppy", ["text", "text2"]))
        .select(["id", "text"])
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # Test with search() API
    results = (
        await (await tbl.search(MatchQuery("puppy", "text")))
        .select(["id", "text"])
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score


def test_search_fts_phrase_query(table):
    table.create_fts_index("text", use_tantivy=False, with_position=False)
    # Phrase queries require positions, so this should fail
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
    results = table.search("puppy").limit(100).to_list()

    # Test with quotation marks
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Test with .phrase_query()
    phrase_results = table.search("puppy runs").phrase_query().limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Test with PhraseQuery()
    phrase_results = (
        table.search(PhraseQuery("puppy runs", "text")).limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0


@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    await async_table.create_index("text", config=FTS(with_position=False))
    # Phrase queries require positions, so this should fail
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()

    await async_table.create_index("text", config=FTS(with_position=True))
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Test with PhraseQuery()
    phrase_results = (
        await async_table.query()
        .nearest_to_text(PhraseQuery("puppy runs", "text"))
        .limit(100)
        .to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0


def test_search_fts_specify_column(table):
    table.create_fts_index("text", use_tantivy=False)
    table.create_fts_index("text2", use_tantivy=False)

    results = table.search("puppy", fts_columns="text").limit(5).to_list()
    assert len(results) == 5

    results = table.search("rabbit", fts_columns="text2").limit(5).to_list()
    assert len(results) == 5

    # we can only specify one column for now
    with pytest.raises(Exception):
        table.search("puppy", fts_columns=["text", "text2"]).limit(5).to_list()

    # have to specify a column because we have two fts indices
    with pytest.raises(Exception):
        table.search("puppy").limit(5).to_list()


@pytest.mark.asyncio
async def test_search_fts_async(async_table):
    await async_table.create_index("text", config=FTS())
    results = await async_table.query().nearest_to_text("puppy").limit(5).to_list()
    assert len(results) == 5

    expected_count = await async_table.count_rows(
        "count > 5000 and contains(text, 'puppy')"
    )
    expected_count = min(expected_count, 10)

    limited_results_pre_filter = await (
        async_table.query()
        .nearest_to_text("puppy")
        .where("count > 5000")
        .limit(10)
        .to_list()
    )
    assert len(limited_results_pre_filter) == expected_count

    limited_results_post_filter = await (
        async_table.query()
        .nearest_to_text("puppy")
        .where("count > 5000")
        .limit(10)
        .postfilter()
        .to_list()
    )
    assert len(limited_results_post_filter) <= expected_count


@pytest.mark.asyncio
async def test_search_fts_specify_column_async(async_table):
    await async_table.create_index("text", config=FTS())
    await async_table.create_index("text2", config=FTS())

    results = (
        await async_table.query()
        .nearest_to_text("puppy", columns="text")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5

    results = (
        await async_table.query()
        .nearest_to_text("rabbit", columns="text2")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5

    # we can only specify one column for now
    with pytest.raises(Exception):
        await (
            async_table.query()
            .nearest_to_text("rabbit", columns=["text", "text2"])
            .limit(5)
            .to_list()
        )

    # have to specify a column because we have two fts indices
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text("puppy").limit(5).to_list()


def test_search_ordering_field_index_table(tmp_path, table):
    table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
    rows = (
        table.search("puppy", ordering_field_name="count")
        .limit(20)
        .select(["text", "count"])
        .to_list()
    )
    for r in rows:
        assert "puppy" in r["text"]
    assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows


def test_search_ordering_field_index(tmp_path, table):
    index = ldb.fts.create_index(
        str(tmp_path / "index"), ["text"], ordering_fields=["count"]
    )
    ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
    index.reload()
    results = ldb.fts.search_index(
        index, query="puppy", limit=5, ordering_field="count"
    )
    assert len(results) == 2
    assert len(results[0]) == 5  # row_ids
    assert len(results[1]) == 5  # _score
    rows = table.to_lance().take(results[0]).to_pylist()
    for r in rows:
        assert "puppy" in r["text"]
    assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows


@pytest.mark.parametrize("use_tantivy", [True, False])
def test_create_index_from_table(tmp_path, table, use_tantivy):
    table.create_fts_index("text", use_tantivy=use_tantivy)
    df = table.search("puppy").limit(5).select(["text"]).to_pandas()
    assert len(df) <= 5
    assert "text" in df.columns

    # Check whether it can be updated
    table.add(
        [
            {
                "vector": np.random.randn(128),
                "id": 101,
                "text": "gorilla",
                "text2": "gorilla",
                "nested": {"text": "gorilla"},
                "count": 10,
            }
        ]
    )

    with pytest.raises(Exception, match="already exists"):
        table.create_fts_index("text", use_tantivy=use_tantivy)

    table.create_fts_index("text", replace=True, use_tantivy=use_tantivy)
    assert len(table.search("gorilla").limit(1).to_pandas()) == 1


def test_create_index_multiple_columns(tmp_path, table):
    table.create_fts_index(["text", "text2"], use_tantivy=True)
    df = table.search("puppy").limit(5).to_pandas()
    assert len(df) == 5
    assert "text" in df.columns
    assert "text2" in df.columns


def test_empty_rs(tmp_path, table, mocker):
    table.create_fts_index(["text", "text2"], use_tantivy=True)
    mocker.patch("lancedb.fts.search_index", return_value=([], []))
    df = table.search("puppy").limit(5).to_pandas()
    assert len(df) == 0


def test_nested_schema(tmp_path, table):
    table.create_fts_index("nested.text", use_tantivy=True)
    rs = table.search("puppy").limit(5).to_list()
    assert len(rs) == 5


@pytest.mark.parametrize("use_tantivy", [True, False])
def test_search_index_with_filter(table, use_tantivy):
    table.create_fts_index("text", use_tantivy=use_tantivy)
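    # Simulate an environment without duckdb by forcing its import to fail,
    # then run the same filtered search with duckdb importable and check that
    # both paths return identical rows.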
    orig_import = __import__

    def import_mock(name, *args):
        if name == "duckdb":
            raise ImportError
        return orig_import(name, *args)

    # no duckdb
    with mock.patch("builtins.__import__", side_effect=import_mock):
        rs = table.search("puppy").where("id=1").limit(10)
        # test schema
        assert rs.to_arrow().drop("_score").schema.equals(table.schema)

        rs = rs.to_list()
        for r in rs:
            assert r["id"] == 1

    # yes duckdb
    rs2 = table.search("puppy").where("id=1").limit(10).to_list()
    for r in rs2:
        assert r["id"] == 1
    assert rs == rs2

    rs = table.search("puppy").where("id=1").with_row_id(True).limit(10).to_list()
    for r in rs:
        assert r["id"] == 1
        assert r["_rowid"] is not None


@pytest.mark.parametrize("use_tantivy", [True, False])
def test_null_input(table, use_tantivy):
    table.add(
        [
            {
                "vector": np.random.randn(128),
                "id": 101,
                "text": None,
                "text2": None,
                "nested": {"text": None},
                "count": 7,
            }
        ]
    )
    table.create_fts_index("text", use_tantivy=use_tantivy)


def test_syntax(table):
    # https://github.com/lancedb/lancedb/issues/769
    table.create_fts_index("text", use_tantivy=True)
    with pytest.raises(ValueError, match="Syntax Error"):
        table.search("they could have been dogs OR").limit(10).to_list()

    # these should work

    # terms queries
    table.search('"they could have been dogs" OR cats').limit(10).to_list()
    table.search("(they AND could) OR (have AND been AND dogs) OR cats").limit(
        10
    ).to_list()

    # phrase queries
    table.search("they could have been dogs OR cats").phrase_query().limit(
        10
    ).to_list()
    table.search('"they could have been dogs OR cats"').limit(10).to_list()
    table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
        10
    ).to_list()
    table.search(
        'the cats OR dogs were not really "pets" at all'
    ).phrase_query().limit(10).to_list()


def test_language(mem_db: DBConnection):
    sentences = [
        "Il n'y a que trois routes qui traversent la ville.",
        "Je veux prendre la route vers l'est.",
        "Je te retrouve au café au bout de la route.",
    ]
    data = [{"text": s} for s in sentences]
    table = mem_db.create_table("test", data=data)

    with pytest.raises(ValueError) as e:
        table.create_fts_index("text", use_tantivy=False, language="klingon")
    assert exception_output(e) == (
        "ValueError: LanceDB does not support the requested language: 'klingon'\n"
        "Supported languages: Arabic, Danish, Dutch, English, Finnish, French, "
        "German, Greek, Hungarian, Italian, Norwegian, Portuguese, Romanian, "
        "Russian, Spanish, Swedish, Tamil, Turkish"
    )

    table.create_fts_index(
        "text",
        use_tantivy=False,
        language="French",
        stem=True,
        ascii_folding=True,
        remove_stop_words=True,
    )

    # Can get "routes" and "route" from the same root
    results = table.search("route", query_type="fts").limit(5).to_list()
    assert len(results) == 3

    # Can find "café", without needing to provide accent
    results = table.search("cafe", query_type="fts").limit(5).to_list()
    assert len(results) == 1

    # Stop words -> no results
    results = table.search("la", query_type="fts").limit(5).to_list()
    assert len(results) == 0


def test_fts_on_list(mem_db: DBConnection):
    data = pa.table(
        {
            "text": [
                ["lance database", "the", "search"],
                ["lance database"],
                ["lance", "search"],
                ["database", "search"],
                ["unrelated", "doc"],
            ],
            "vector": [
                [1.0, 2.0, 3.0],
                [4.0, 5.0, 6.0],
                [7.0, 8.0, 9.0],
                [10.0, 11.0, 12.0],
                [13.0, 14.0, 15.0],
            ],
        }
    )
    table = mem_db.create_table("test", data=data)
    table.create_fts_index("text", use_tantivy=False, with_position=True)

    res = table.search("lance").limit(5).to_list()
    assert len(res) == 3

    res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
    assert len(res) == 2


def test_fts_ngram(mem_db: DBConnection):
    data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
    table = mem_db.create_table("test", data=data)
    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")

    results = table.search("lan", query_type="fts").limit(10).to_list()
    assert len(results) == 2
    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}

    results = (
        table.search("nce", query_type="fts").limit(10).to_list()
    )  # spellchecker:disable-line
    assert len(results) == 2
    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}

    # the default ngram_min_length is 3, so "la" should not match
    results = table.search("la", query_type="fts").limit(10).to_list()
    assert len(results) == 0

    # test setting ngram_min_length and prefix_only
    table.create_fts_index(
        "text",
        use_tantivy=False,
        base_tokenizer="ngram",
        replace=True,
        ngram_min_length=2,
        prefix_only=True,
    )

    results = table.search("lan", query_type="fts").limit(10).to_list()
    assert len(results) == 2
    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}

    results = (
        table.search("nce", query_type="fts").limit(10).to_list()
    )  # spellchecker:disable-line
    assert len(results) == 0

    results = table.search("la", query_type="fts").limit(10).to_list()
    assert len(results) == 2
    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}


def test_fts_query_to_json():
    """Test that FTS query to_json() produces valid JSON strings with exact format."""
    # Test MatchQuery - basic
    match_query = MatchQuery("hello world", "text")
    json_str = match_query.to_json()
    expected = (
        '{"match":{"column":"text","terms":"hello world","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}'
    )
    assert json_str == expected

    # Test MatchQuery with options
    match_query = MatchQuery("puppy", "text", fuzziness=2, boost=1.5, prefix_length=3)
    json_str = match_query.to_json()
    expected = (
        '{"match":{"column":"text","terms":"puppy","boost":1.5,"fuzziness":2,'
        '"max_expansions":50,"operator":"Or","prefix_length":3}}'
    )
    assert json_str == expected

    # Test PhraseQuery
    phrase_query = PhraseQuery("quick brown fox", "title")
    json_str = phrase_query.to_json()
    expected = '{"phrase":{"column":"title","terms":"quick brown fox","slop":0}}'
    assert json_str == expected

    # Test PhraseQuery with slop
    phrase_query = PhraseQuery("quick brown", "title", slop=2)
    json_str = phrase_query.to_json()
    expected = '{"phrase":{"column":"title","terms":"quick brown","slop":2}}'
    assert json_str == expected

    # Test BooleanQuery with MUST
    must_query = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("puppy", "text")),
            (Occur.MUST, MatchQuery("runs", "text")),
        ]
    )
    json_str = must_query.to_json()
    expected = (
        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}},{"match":{"column":"text","terms":"runs","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}],'
        '"must_not":[]}}'
    )
    assert json_str == expected

    # Test BooleanQuery with SHOULD
    should_query = BooleanQuery(
        [
            (Occur.SHOULD, MatchQuery("cat", "text")),
            (Occur.SHOULD, MatchQuery("dog", "text")),
        ]
    )
    json_str = should_query.to_json()
    expected = (
        '{"boolean":{"should":[{"match":{"column":"text","terms":"cat","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}},'
        '{"match":{"column":"text","terms":"dog","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must":[],'
        '"must_not":[]}}'
    )
    assert json_str == expected
    # Test BooleanQuery with MUST_NOT
    must_not_query = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("puppy", "text")),
            (Occur.MUST_NOT, MatchQuery("training", "text")),
        ]
    )
    json_str = must_not_query.to_json()
    expected = (
        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}}],"must_not":[{"match":{"column":"text",'
        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}}]}}'
    )
    assert json_str == expected

    # Test BoostQuery
    positive = MatchQuery("puppy", "text")
    negative = MatchQuery("training", "text")
    boost_query = BoostQuery(positive, negative, negative_boost=0.3)
    json_str = boost_query.to_json()
    expected = (
        '{"boost":{"positive":{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}},"negative":{"match":{"column":"text",'
        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}},"negative_boost":0.3}}'
    )
    assert json_str == expected

    # Test MultiMatchQuery
    multi_match = MultiMatchQuery("python", ["tags", "title"])
    json_str = multi_match.to_json()
    expected = (
        '{"multi_match":{"query":"python","columns":["tags","title"],'
        '"boost":[1.0,1.0]}}'
    )
    assert json_str == expected

    # Test complex nested BooleanQuery
    inner1 = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("python", "tags")),
            (Occur.MUST, MatchQuery("tutorial", "title")),
        ]
    )
    inner2 = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("rust", "tags")),
            (Occur.MUST, MatchQuery("guide", "title")),
        ]
    )
    complex_query = BooleanQuery(
        [
            (Occur.SHOULD, inner1),
            (Occur.SHOULD, inner2),
        ]
    )
    json_str = complex_query.to_json()
    expected = (
        '{"boolean":{"should":[{"boolean":{"should":[],"must":[{"match":'
        '{"column":"tags","terms":"python","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}},{"match":'
        '{"column":"title","terms":"tutorial","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must_not":[]}}'
        ',{"boolean":{"should":[],"must":[{"match":{"column":"tags",'
        '"terms":"rust","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}},{"match":{"column":"title",'
        '"terms":"guide","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}}],"must_not":[]}}],"must":[],'
        '"must_not":[]}}'
    )
    assert json_str == expected


def test_fts_fast_search(table):
    table.create_fts_index("text", use_tantivy=False)

    # Insert some unindexed data
    table.add(
        [
            {
                "text": "xyz",
                "vector": [0 for _ in range(128)],
                "id": 101,
                "text2": "xyz",
                "nested": {"text": "xyz"},
                "count": 10,
            }
        ]
    )

    # Without fast_search, the query object should not have fast_search set
    builder = table.search("xyz", query_type="fts").limit(10)
    query = builder.to_query_object()
    assert query.fast_search is None

    # With fast_search, the query object should have fast_search=True
    builder = table.search("xyz", query_type="fts").fast_search().limit(10)
    query = builder.to_query_object()
    assert query.fast_search is True

    # fast_search should be chainable with other methods
    builder = (
        table.search("xyz", query_type="fts").fast_search().select(["text"]).limit(5)
    )
    query = builder.to_query_object()
    assert query.fast_search is True
    assert query.limit == 5
    assert query.columns == ["text"]

    # fast_search should be enabled by keyword argument too
    query = LanceFtsQueryBuilder(table, "xyz", fast_search=True).to_query_object()
    assert query.fast_search is True

    # Verify it executes without error and skips unindexed data
    results = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
    assert len(results) == 0

    # Update index and verify it returns results
    table.optimize()
    results = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
    assert len(results) > 0


@pytest.mark.asyncio
async def test_fts_fast_search_async(async_table):
    await async_table.create_index("text", config=FTS())

    # Insert some unindexed data
    await async_table.add(
        [
            {
                "text": "xyz",
                "vector": [0 for _ in range(128)],
                "id": 101,
                "text2": "xyz",
                "nested": {"text": "xyz"},
                "count": 10,
            }
        ]
    )

    # Without fast_search, should return results
    results = await async_table.query().nearest_to_text("xyz").limit(5).to_list()
    assert len(results) > 0

    # With fast_search, should return no results since the new data is unindexed
    fast_results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .limit(5)
        .to_list()
    )
    assert len(fast_results) == 0

    # Update index and verify it returns results
    await async_table.optimize()
    fast_results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .limit(5)
        .to_list()
    )
    assert len(fast_results) > 0

    # fast_search should be chainable with other methods
    results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .select(["text"])
        .limit(5)
        .to_list()
    )
    assert len(results) > 0