# Files
# lancedb/python/python/tests/test_fts.py
#
# 1057 lines
# 32 KiB
# Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import shutil
from unittest import mock
from pathlib import Path
import zipfile
import lancedb as ldb
from lancedb.db import DBConnection
from lancedb.index import FTS
from lancedb.query import (
BoostQuery,
MatchQuery,
MultiMatchQuery,
PhraseQuery,
BooleanQuery,
Occur,
LanceFtsQueryBuilder,
)
import numpy as np
import pyarrow as pa
import pandas as pd
import pytest
import pytest_asyncio
from utils import exception_output
# Directory of language models bundled with the test suite (used by the
# jieba/lindera tokenizer fixtures below).
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    """Build a deterministic 100-row table with two text columns, a nested
    text field, a 128-d vector column, and a numeric ``count`` column."""
    # Seeded local generators so this fixture never perturbs global
    # random state used by other tests.
    rng = np.random.RandomState(42)
    local_random = random.Random(42)
    db = ldb.connect(tmp_path)

    text_nouns = ("puppy", "car")
    text2_nouns = ("rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    def sentence(nouns):
        # noun + verb + adverb + adjective, each drawn uniformly.
        # Random.choice(seq) consumes the same randomness as
        # seq[randrange(0, len(seq))], so the generated data is unchanged.
        return " ".join(
            local_random.choice(words) for words in (nouns, verbs, adv, adj)
        )

    vectors = [rng.randn(128) for _ in range(100)]
    text = [sentence(text_nouns) for _ in range(100)]
    text2 = [sentence(text2_nouns) for _ in range(100)]
    count = [local_random.randint(1, 10000) for _ in range(100)]
    frame = pd.DataFrame(
        {
            "vector": vectors,
            "id": [i % 2 for i in range(100)],
            "text": text,
            "text2": text2,
            "nested": [{"text": t} for t in text],
            "count": count,
        }
    )
    return db.create_table("test", data=frame)
@pytest.fixture
def language_model_home(monkeypatch, tmp_path):
    """Copy the bundled test language models into a temp directory and
    point ``LANCE_LANGUAGE_MODEL_HOME`` at the copy."""
    destination = tmp_path / "language-models"
    shutil.copytree(TEST_LANGUAGE_MODEL_HOME, destination)
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(destination))
    return destination
@pytest.fixture
def lindera_ipadic(language_model_home):
    """Unpack the bundled Lindera ipadic dictionary and write a config.yml
    pointing at it; remove the extracted files again on teardown."""
    model_path = language_model_home / "lindera" / "ipadic"
    extracted_model = model_path / "main"
    config_path = model_path / "config.yml"
    # Start from a clean slate in case a previous run left files behind.
    if extracted_model.exists():
        shutil.rmtree(extracted_model)
    with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
        zip_ref.extractall(model_path)
    # NOTE(review): the YAML nesting depends on the leading spaces inside
    # these string literals — verify they match the tokenizer's expected
    # config layout (mode/dictionary under segmenter, path under dictionary).
    config_path.write_text(
        "segmenter:\n"
        ' mode: "normal"\n'
        " dictionary:\n"
        f' path: "{extracted_model.resolve().as_posix()}"\n',
        encoding="utf-8",
    )
    try:
        yield
    finally:
        # Always clean up, even if the test using the fixture failed.
        if extracted_model.exists():
            shutil.rmtree(extracted_model)
@pytest_asyncio.fixture
async def async_table(tmp_path) -> ldb.table.AsyncTable:
    """Async twin of the ``table`` fixture: same deterministic 100-row data
    created through the async connection API."""
    # Seeded local generators so this fixture never perturbs global
    # random state used by other tests.
    rng = np.random.RandomState(42)
    local_random = random.Random(42)
    db = await ldb.connect_async(tmp_path)

    text_nouns = ("puppy", "car")
    text2_nouns = ("rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    def sentence(nouns):
        # noun + verb + adverb + adjective, each drawn uniformly.
        # Random.choice(seq) consumes the same randomness as
        # seq[randrange(0, len(seq))], so the generated data is unchanged.
        return " ".join(
            local_random.choice(words) for words in (nouns, verbs, adv, adj)
        )

    vectors = [rng.randn(128) for _ in range(100)]
    text = [sentence(text_nouns) for _ in range(100)]
    text2 = [sentence(text2_nouns) for _ in range(100)]
    count = [local_random.randint(1, 10000) for _ in range(100)]
    frame = pd.DataFrame(
        {
            "vector": vectors,
            "id": [i % 2 for i in range(100)],
            "text": text,
            "text2": text2,
            "nested": [{"text": t} for t in text],
            "count": count,
        }
    )
    return await db.create_table("test", data=frame)
@pytest.mark.parametrize(
    ("kwargs", "match"),
    [
        # Each removed tantivy-era keyword must be rejected with a
        # dedicated, descriptive error message.
        pytest.param(
            {"use_tantivy": True},
            "Tantivy-based FTS has been removed",
        ),
        pytest.param(
            {"ordering_field_names": ["count"]},
            "ordering_field_names was only supported",
        ),
        pytest.param(
            {"writer_heap_size": 128},
            "writer_heap_size was only supported",
        ),
    ],
)
def test_reject_removed_tantivy_parameters(table, kwargs, match):
    """Removed tantivy-only parameters raise ValueError with a clear message."""
    with pytest.raises(ValueError, match=match):
        table.create_fts_index("text", **kwargs)
def test_reject_legacy_tantivy_index(table):
    """A leftover legacy tantivy index directory must block both searching
    and creating a new FTS index."""
    index_dir, _, _ = table._get_fts_index_path()
    os.makedirs(index_dir, exist_ok=True)

    expected = "Legacy Tantivy FTS index detected"
    with pytest.raises(ValueError, match=expected):
        table.search("puppy").limit(5).to_list()
    with pytest.raises(ValueError, match=expected):
        table.create_fts_index("text")
@pytest.mark.parametrize("with_position", [True, False])
def test_create_inverted_index(table, with_position):
    """An FTS index created with a custom name must appear in list_indices,
    regardless of whether positions are recorded."""
    table.create_fts_index(
        "text",
        with_position=with_position,
        name="custom_fts_index",
    )
    fts_names = [
        idx.name for idx in table.list_indices() if idx.index_type == "FTS"
    ]
    assert "custom_fts_index" in fts_names
def test_search_fts(table):
    """Exercise plain, structured, boost, multi-match and boolean FTS queries."""
    table.create_fts_index("text")

    def run(query):
        # Shared query shape: project id+text, cap at 5 hits.
        return table.search(query).select(["id", "text"]).limit(5).to_list()

    results = run("puppy")
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # With no explicit limit the default of 10 applies.
    results = table.search("puppy").select(["id", "text"]).to_list()
    assert len(results) == 10

    # Structured match query.
    assert len(run(MatchQuery("puppy", "text"))) == 5

    # Boost query: prefer "puppy" hits, demote "runs" hits.
    boosted = BoostQuery(
        MatchQuery("puppy", "text"),
        MatchQuery("runs", "text"),
    )
    assert len(run(boosted)) == 5

    # Multi-match across both indexed text columns.
    table.create_fts_index("text2")
    results = run(MultiMatchQuery("puppy", ["text", "text2"]))
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

    # Boolean AND query: every hit must contain both terms.
    results = run(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score
    for row in results:
        assert "puppy" in row["text"]
        assert "runs" in row["text"]
@pytest.mark.asyncio
async def test_fts_select_async(async_table):
    """select() must work with every FTS query form on the async API."""
    tbl = async_table
    await tbl.create_index("text", config=FTS())
    await tbl.create_index("text2", config=FTS())

    async def run(query):
        # Shared query shape: project id+text, cap at 5 hits.
        return (
            await tbl.query()
            .nearest_to_text(query)
            .select(["id", "text"])
            .limit(5)
            .to_list()
        )

    query_forms = (
        "puppy",  # plain string
        MatchQuery("puppy", "text"),  # structured match
        BoostQuery(  # boost
            MatchQuery("puppy", "text"),
            MatchQuery("runs", "text"),
        ),
        MultiMatchQuery("puppy", ["text", "text2"]),  # multi-match
    )
    for query in query_forms:
        results = await run(query)
        assert len(results) == 5
        assert len(results[0]) == 3  # id, text, _score

    # The search() entry point must behave the same way.
    results = (
        await (await tbl.search(MatchQuery("puppy", "text")))
        .select(["id", "text"])
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score
def test_search_fts_phrase_query(table):
    """Phrase queries require an index built with positions recorded."""
    table.create_fts_index("text", with_position=False)
    # Without positions a phrase query must fail.  (The previous
    # try/`assert False`/except Exception pattern swallowed the
    # AssertionError — AssertionError is an Exception — so this branch
    # could never actually fail the test.)
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    table.create_fts_index("text", with_position=True, replace=True)
    results = table.search("puppy").limit(100).to_list()

    # Quoted strings are parsed as phrase queries.
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Explicit .phrase_query() on the builder.
    phrase_results = table.search("puppy runs").phrase_query().limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Structured PhraseQuery object.
    phrase_results = (
        table.search(PhraseQuery("puppy runs", "text")).limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    """Async: phrase queries require an index built with positions."""
    await async_table.create_index("text", config=FTS(with_position=False))
    # Without positions a phrase query must fail.  The previous
    # try/`assert False`/except Exception pattern swallowed the
    # AssertionError so it could never fail; it also passed the unquoted
    # string "puppy runs" (a plain match query, not a phrase query).
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()

    await async_table.create_index("text", config=FTS(with_position=True))
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()

    # Quoted strings are parsed as phrase queries.
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Structured PhraseQuery object.
    phrase_results = (
        await async_table.query()
        .nearest_to_text(PhraseQuery("puppy runs", "text"))
        .limit(100)
        .to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
def test_search_fts_specify_column(table):
    """With two FTS indices the caller must pick exactly one column."""
    table.create_fts_index("text")
    table.create_fts_index("text2")

    results = table.search("puppy", fts_columns="text").limit(5).to_list()
    assert len(results) == 5
    results = table.search("rabbit", fts_columns="text2").limit(5).to_list()
    assert len(results) == 5

    # Only a single column may be specified for now.  (The previous
    # try/`assert False`/except Exception pattern swallowed the
    # AssertionError, so this check could never actually fail.)
    with pytest.raises(Exception):
        table.search("puppy", fts_columns=["text", "text2"]).limit(5).to_list()

    # A column is required because two FTS indices exist.
    with pytest.raises(Exception):
        table.search("puppy").limit(5).to_list()
@pytest.mark.asyncio
async def test_search_fts_async(async_table):
    """Async FTS search with pre- and post-filtering on a numeric column."""
    await async_table.create_index("text", config=FTS())

    results = await async_table.query().nearest_to_text("puppy").limit(5).to_list()
    assert len(results) == 5

    # Ground truth: rows that satisfy both the filter and the text match,
    # capped by the query limit.
    matching = await async_table.count_rows(
        "count > 5000 and contains(text, 'puppy')"
    )
    expected_count = min(matching, 10)

    # Pre-filter (the default): the filter is applied before the search.
    prefiltered = await (
        async_table.query()
        .nearest_to_text("puppy")
        .where("count > 5000")
        .limit(10)
        .to_list()
    )
    assert len(prefiltered) == expected_count

    # Post-filter: filtering after the search may drop hits, so the count
    # can only be smaller or equal.
    postfiltered = await (
        async_table.query()
        .nearest_to_text("puppy")
        .where("count > 5000")
        .limit(10)
        .postfilter()
        .to_list()
    )
    assert len(postfiltered) <= expected_count
@pytest.mark.asyncio
async def test_search_fts_specify_column_async(async_table):
    """Async: with two FTS indices the caller must pick exactly one column."""
    await async_table.create_index("text", config=FTS())
    await async_table.create_index("text2", config=FTS())

    results = (
        await async_table.query()
        .nearest_to_text("puppy", columns="text")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5
    results = (
        await async_table.query()
        .nearest_to_text("rabbit", columns="text2")
        .limit(5)
        .to_list()
    )
    assert len(results) == 5

    # Only a single column may be specified for now.  The original test
    # repeated the valid single-column query here (so nothing raised) and
    # its try/`assert False`/except Exception pattern swallowed the
    # resulting AssertionError — the multi-column error path was never
    # exercised.  Pass two columns and require the failure explicitly.
    with pytest.raises(Exception):
        await (
            async_table.query()
            .nearest_to_text("rabbit", columns=["text", "text2"])
            .limit(5)
            .to_list()
        )

    # A column is required because two FTS indices exist.
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text("puppy").limit(5).to_list()
def test_create_index_from_table(tmp_path, table):
    """The FTS index searches, tolerates new rows, and supports replace=True."""
    table.create_fts_index("text")
    df = table.search("puppy").limit(5).select(["text"]).to_pandas()
    assert len(df) <= 5
    assert "text" in df.columns

    # Add a row that is not yet covered by the index.
    new_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": "gorilla",
        "text2": "gorilla",
        "nested": {"text": "gorilla"},
        "count": 10,
    }
    table.add([new_row])

    # Re-creating without replace must fail; replace=True rebuilds the
    # index so the new row becomes searchable.
    with pytest.raises(Exception, match="already exists"):
        table.create_fts_index("text")
    table.create_fts_index("text", replace=True)
    assert len(table.search("gorilla").limit(1).to_pandas()) == 1
def test_create_index_multiple_columns(tmp_path, table):
    """Passing a list of columns to create_fts_index is rejected."""
    expected_error = "Native FTS indexes can only be created"
    with pytest.raises(ValueError, match=expected_error):
        table.create_fts_index(["text", "text2"])
def test_nested_schema(tmp_path, table):
    """FTS indices can only be built on top-level (non-nested) fields."""
    expected_error = "top-level fields"
    with pytest.raises(ValueError, match=expected_error):
        table.create_fts_index("nested.text")
def test_search_index_with_filter(table):
    """Filtered FTS search must give identical results with and without duckdb."""
    table.create_fts_index("text")
    real_import = __import__

    def no_duckdb(name, *args):
        # Simulate an environment where duckdb is not installed.
        if name == "duckdb":
            raise ImportError
        return real_import(name, *args)

    # Without duckdb.
    with mock.patch("builtins.__import__", side_effect=no_duckdb):
        builder = table.search("puppy").where("id=1").limit(10)
        # The result schema equals the table schema once _score is dropped.
        assert builder.to_arrow().drop("_score").schema.equals(table.schema)
        rows_without_duckdb = builder.to_list()
        for row in rows_without_duckdb:
            assert row["id"] == 1

    # With duckdb available the results must be identical.
    rows_with_duckdb = table.search("puppy").where("id=1").limit(10).to_list()
    for row in rows_with_duckdb:
        assert row["id"] == 1
    assert rows_without_duckdb == rows_with_duckdb

    # Row ids can be requested alongside the filter.
    rows = table.search("puppy").where("id=1").with_row_id(True).limit(10).to_list()
    for row in rows:
        assert row["id"] == 1
        assert row["_rowid"] is not None
def test_null_input(table):
    """Index creation must tolerate rows whose text columns are NULL."""
    null_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": None,
        "text2": None,
        "nested": {"text": None},
        "count": 7,
    }
    table.add([null_row])
    # Should not raise despite the NULL text value.
    table.create_fts_index("text")
def test_syntax(table):
    """FTS query-syntax edge cases (regression for lancedb#769).

    None of these searches should raise; results are deliberately not
    asserted.
    """
    table.create_fts_index("text")
    # A trailing operator must not break parsing.
    table.search("they could have been dogs OR").limit(10).to_list()

    # Terms queries with quoting and grouping.
    table.search('"they could have been dogs" OR cats').limit(10).to_list()
    table.search("(they AND could) OR (have AND been AND dogs) OR cats").limit(
        10
    ).to_list()

    # Phrase queries need positions in the index.
    table.create_fts_index("text", with_position=True, replace=True)
    table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
    table.search('"they could have been dogs OR cats"').limit(10).to_list()
    table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
        10
    ).to_list()
    # Embedded double quotes inside an explicit phrase query.
    # (An accidental byte-for-byte duplicate of this statement was removed.)
    table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
        10
    ).to_list()
def test_language(mem_db: DBConnection):
    """French analyzer: stemming, ascii folding and stop-word removal."""
    sentences = [
        "Il n'y a que trois routes qui traversent la ville.",
        "Je veux prendre la route vers l'est.",
        "Je te retrouve au café au bout de la route.",
    ]
    table = mem_db.create_table("test", data=[{"text": s} for s in sentences])

    # Unsupported languages are rejected with the full supported list.
    with pytest.raises(ValueError) as e:
        table.create_fts_index("text", language="klingon")
    assert exception_output(e) == (
        "ValueError: LanceDB does not support the requested language: 'klingon'\n"
        "Supported languages: Arabic, Danish, Dutch, English, Finnish, French, "
        "German, Greek, Hungarian, Italian, Norwegian, Portuguese, Romanian, "
        "Russian, Spanish, Swedish, Tamil, Turkish"
    )

    table.create_fts_index(
        "text",
        language="French",
        stem=True,
        ascii_folding=True,
        remove_stop_words=True,
    )

    def hits(term):
        return table.search(term, query_type="fts").limit(5).to_list()

    # Stemming maps "routes" and "route" to the same root.
    assert len(hits("route")) == 3
    # Ascii folding lets "cafe" match "café".
    assert len(hits("cafe")) == 1
    # "la" is a stop word, so it matches nothing.
    assert len(hits("la")) == 0
def test_fts_on_list(mem_db: DBConnection):
    """FTS over list-of-string columns, including phrase matching."""
    data = pa.table(
        {
            "text": [
                ["lance database", "the", "search"],
                ["lance database"],
                ["lance", "search"],
                ["database", "search"],
                ["unrelated", "doc"],
            ],
            "vector": [
                [1.0, 2.0, 3.0],
                [4.0, 5.0, 6.0],
                [7.0, 8.0, 9.0],
                [10.0, 11.0, 12.0],
                [13.0, 14.0, 15.0],
            ],
        }
    )
    table = mem_db.create_table("test", data=data)
    # Positions are required for the phrase query below.
    table.create_fts_index("text", with_position=True)

    # Term match: three rows contain a "lance" token.
    assert len(table.search("lance").limit(5).to_list()) == 3
    # Phrase match: only two rows contain "lance database" as a phrase.
    phrase_hits = (
        table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
    )
    assert len(phrase_hits) == 2
def test_fts_ngram(mem_db: DBConnection):
    """Ngram tokenizer defaults plus the ngram_min_length/prefix_only knobs."""
    data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
    table = mem_db.create_table("test", data=data)

    def texts(term):
        # The set of document texts matched by an fts search for `term`.
        rows = table.search(term, query_type="fts").limit(10).to_list()
        return set(row["text"] for row in rows)

    table.create_fts_index("text", base_tokenizer="ngram")
    assert texts("lan") == {"lance database", "lance is cool"}
    assert texts("nce") == {"lance database", "lance is cool"}  # spellchecker:disable-line
    # The default min_ngram_length is 3, so a 2-char term matches nothing.
    assert texts("la") == set()

    # Rebuild with shorter ngrams but prefixes only.
    table.create_fts_index(
        "text",
        base_tokenizer="ngram",
        replace=True,
        ngram_min_length=2,
        prefix_only=True,
    )
    assert texts("lan") == {"lance database", "lance is cool"}
    # "nce" is not a prefix of any word, so it no longer matches.
    assert texts("nce") == set()  # spellchecker:disable-line
    # "la" is now long enough and is a prefix of "lance".
    assert texts("la") == {"lance database", "lance is cool"}
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
    """Jieba tokenization: the query term matches only the row containing it."""
    data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
    table = mem_db.create_table("test_jieba", data=data)
    # Disable the latin-oriented filters; only tokenization is under test.
    table.create_fts_index(
        "text",
        base_tokenizer="jieba/default",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )
    hits = table.search("我们", query_type="fts").limit(10).to_list()
    assert [row["text"] for row in hits] == ["我们都有光明的前途"]
def test_fts_jieba_missing_language_model_note(
    mem_db: DBConnection, monkeypatch, tmp_path
):
    """A missing model home must produce an actionable error message."""
    missing_root = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
    table = mem_db.create_table(
        "test_missing_jieba_model",
        data=pa.table({"text": ["我们都有光明的前途"]}),
    )
    with pytest.raises((ValueError, RuntimeError)) as e:
        table.create_fts_index(
            "text",
            base_tokenizer="jieba/default",
            stem=False,
            remove_stop_words=False,
            ascii_folding=False,
        )
    message = exception_output(e)
    # The error must name the bad path, the env var, and the tokenizer.
    for fragment in (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    ):
        assert fragment in message
@pytest.mark.asyncio
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
    """Async: a missing model home must produce an actionable error message."""
    missing_root = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
    db = await ldb.connect_async(tmp_path / "async-db")
    table = await db.create_table(
        "test_missing_jieba_model_async",
        data=pa.table({"text": ["我们都有光明的前途"]}),
    )
    with pytest.raises((ValueError, RuntimeError)) as e:
        await table.create_index(
            "text",
            config=FTS(
                base_tokenizer="jieba/default",
                stem=False,
                remove_stop_words=False,
                ascii_folding=False,
            ),
        )
    message = exception_output(e)
    # The error must name the bad path, the env var, and the tokenizer.
    for fragment in (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    ):
        assert fragment in message
def test_fts_lindera_tokenizer(
    mem_db: DBConnection, language_model_home, lindera_ipadic
):
    """Lindera/ipadic tokenization: 成田 matches only 成田国際空港."""
    data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
    table = mem_db.create_table("test_lindera", data=data)
    # Disable the latin-oriented filters; only tokenization is under test.
    table.create_fts_index(
        "text",
        base_tokenizer="lindera/ipadic",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )
    hits = table.search("成田", query_type="fts").limit(10).to_list()
    assert [row["text"] for row in hits] == ["成田国際空港"]
def test_fts_query_to_json():
    """Test that FTS query to_json() produces valid JSON strings with exact format."""
    # NOTE: every expectation below is an exact string comparison on purpose:
    # it pins the serialized key order and the default values (boost=1.0,
    # fuzziness=0, max_expansions=50, operator="Or", prefix_length=0) of each
    # query type's JSON wire format.
    # Test MatchQuery - basic
    match_query = MatchQuery("hello world", "text")
    json_str = match_query.to_json()
    expected = (
        '{"match":{"column":"text","terms":"hello world","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}'
    )
    assert json_str == expected
    # Test MatchQuery with options
    match_query = MatchQuery("puppy", "text", fuzziness=2, boost=1.5, prefix_length=3)
    json_str = match_query.to_json()
    expected = (
        '{"match":{"column":"text","terms":"puppy","boost":1.5,"fuzziness":2,'
        '"max_expansions":50,"operator":"Or","prefix_length":3}}'
    )
    assert json_str == expected
    # Test PhraseQuery
    phrase_query = PhraseQuery("quick brown fox", "title")
    json_str = phrase_query.to_json()
    expected = '{"phrase":{"column":"title","terms":"quick brown fox","slop":0}}'
    assert json_str == expected
    # Test PhraseQuery with slop
    phrase_query = PhraseQuery("quick brown", "title", slop=2)
    json_str = phrase_query.to_json()
    expected = '{"phrase":{"column":"title","terms":"quick brown","slop":2}}'
    assert json_str == expected
    # Test BooleanQuery with MUST
    must_query = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("puppy", "text")),
            (Occur.MUST, MatchQuery("runs", "text")),
        ]
    )
    json_str = must_query.to_json()
    expected = (
        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}},{"match":{"column":"text","terms":"runs","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}],'
        '"must_not":[]}}'
    )
    assert json_str == expected
    # Test BooleanQuery with SHOULD
    should_query = BooleanQuery(
        [
            (Occur.SHOULD, MatchQuery("cat", "text")),
            (Occur.SHOULD, MatchQuery("dog", "text")),
        ]
    )
    json_str = should_query.to_json()
    expected = (
        '{"boolean":{"should":[{"match":{"column":"text","terms":"cat","boost":1.0,'
        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}},'
        '{"match":{"column":"text","terms":"dog","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must":[],'
        '"must_not":[]}}'
    )
    assert json_str == expected
    # Test BooleanQuery with MUST_NOT
    must_not_query = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("puppy", "text")),
            (Occur.MUST_NOT, MatchQuery("training", "text")),
        ]
    )
    json_str = must_not_query.to_json()
    expected = (
        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}}],"must_not":[{"match":{"column":"text",'
        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}}]}}'
    )
    assert json_str == expected
    # Test BoostQuery
    positive = MatchQuery("puppy", "text")
    negative = MatchQuery("training", "text")
    boost_query = BoostQuery(positive, negative, negative_boost=0.3)
    json_str = boost_query.to_json()
    expected = (
        '{"boost":{"positive":{"match":{"column":"text","terms":"puppy",'
        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
        '"prefix_length":0}},"negative":{"match":{"column":"text",'
        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}},"negative_boost":0.3}}'
    )
    assert json_str == expected
    # Test MultiMatchQuery
    multi_match = MultiMatchQuery("python", ["tags", "title"])
    json_str = multi_match.to_json()
    expected = (
        '{"multi_match":{"query":"python","columns":["tags","title"],'
        '"boost":[1.0,1.0]}}'
    )
    assert json_str == expected
    # Test complex nested BooleanQuery
    # (inner boolean queries are serialized recursively inside "should")
    inner1 = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("python", "tags")),
            (Occur.MUST, MatchQuery("tutorial", "title")),
        ]
    )
    inner2 = BooleanQuery(
        [
            (Occur.MUST, MatchQuery("rust", "tags")),
            (Occur.MUST, MatchQuery("guide", "title")),
        ]
    )
    complex_query = BooleanQuery(
        [
            (Occur.SHOULD, inner1),
            (Occur.SHOULD, inner2),
        ]
    )
    json_str = complex_query.to_json()
    expected = (
        '{"boolean":{"should":[{"boolean":{"should":[],"must":[{"match":'
        '{"column":"tags","terms":"python","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}},{"match":'
        '{"column":"title","terms":"tutorial","boost":1.0,"fuzziness":0,'
        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must_not":[]}}'
        ',{"boolean":{"should":[],"must":[{"match":{"column":"tags",'
        '"terms":"rust","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}},{"match":{"column":"title",'
        '"terms":"guide","boost":1.0,"fuzziness":0,"max_expansions":50,'
        '"operator":"Or","prefix_length":0}}],"must_not":[]}}],"must":[],'
        '"must_not":[]}}'
    )
    assert json_str == expected
def test_fts_fast_search(table):
    """fast_search skips unindexed rows until the index is optimized."""
    table.create_fts_index("text")
    # This row is added after indexing, so it is initially unindexed.
    table.add(
        [
            {
                "text": "xyz",
                "vector": [0 for _ in range(128)],
                "id": 101,
                "text2": "xyz",
                "nested": {"text": "xyz"},
                "count": 10,
            }
        ]
    )

    # fast_search is unset by default...
    query = table.search("xyz", query_type="fts").limit(10).to_query_object()
    assert query.fast_search is None
    # ...and set when requested on the builder.
    query = (
        table.search("xyz", query_type="fts").fast_search().limit(10).to_query_object()
    )
    assert query.fast_search is True

    # It chains with other builder methods.
    builder = (
        table.search("xyz", query_type="fts").fast_search().select(["text"]).limit(5)
    )
    query = builder.to_query_object()
    assert query.fast_search is True
    assert query.limit == 5
    assert query.columns == ["text"]

    # It can also be enabled via the constructor keyword.
    query = LanceFtsQueryBuilder(table, "xyz", fast_search=True).to_query_object()
    assert query.fast_search is True

    # Executes cleanly and skips the not-yet-indexed row...
    hits = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
    assert len(hits) == 0
    # ...until optimize() folds new data into the index.
    table.optimize()
    hits = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
    assert len(hits) > 0
@pytest.mark.asyncio
async def test_fts_fast_search_async(async_table):
    """Async fast_search skips unindexed rows until optimize() runs."""
    await async_table.create_index("text", config=FTS())
    # This row is added after indexing, so it is initially unindexed.
    await async_table.add(
        [
            {
                "text": "xyz",
                "vector": [0 for _ in range(128)],
                "id": 101,
                "text2": "xyz",
                "nested": {"text": "xyz"},
                "count": 10,
            }
        ]
    )

    # Without fast_search the unindexed row is still found.
    results = await async_table.query().nearest_to_text("xyz").limit(5).to_list()
    assert len(results) > 0

    # With fast_search the not-yet-indexed row is skipped...
    fast_results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .limit(5)
        .to_list()
    )
    assert len(fast_results) == 0

    # ...until optimize() folds new data into the index.
    await async_table.optimize()
    fast_results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .limit(5)
        .to_list()
    )
    assert len(fast_results) > 0

    # fast_search chains with other builder methods.
    results = (
        await async_table.query()
        .nearest_to_text("xyz")
        .fast_search()
        .select(["text"])
        .limit(5)
        .to_list()
    )
    assert len(results) > 0