mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 14:49:57 +00:00
Closes #721 FTS will return results as a pyarrow table. PyArrow tables have a `filter` method, but it does not take SQL filter strings (only pyarrow compute expressions). Instead, we do one of two things to support `tbl.search("keywords").where("foo=5").limit(10).to_arrow()`: Default path: If duckdb is available, then use duckdb to execute the SQL filter string on the pyarrow table. Backup path: Otherwise, write the pyarrow table to a lance dataset and then do `to_table(filter=<filter>)`. Neither is ideal. The default path has two issues: 1. it requires installing an extra library (duckdb) 2. duckdb mangles some fields (like fixed size list => list). The backup path incurs a latency penalty (~20ms on SSD) to write the result set to disk. In the short term, once #676 is addressed, we can write the dataset to "memory://" instead of disk, which makes the post-filter evaluate much quicker (ETA next week). In the longer term, we'd like to be able to evaluate the filter string on the pyarrow Table directly, one possibility being that we use Substrait to generate pyarrow compute expressions from the SQL string. Or, if there's enough progress on pyarrow, it could support Substrait expressions directly (no ETA). --------- Co-authored-by: Will Jones <willjones127@gmail.com>
150 lines
4.4 KiB
Python
150 lines
4.4 KiB
Python
# Copyright 2023 LanceDB Developers
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import os
|
|
import random
|
|
from unittest import mock
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
import tantivy
|
|
|
|
import lancedb as ldb
|
|
import lancedb.fts
|
|
|
|
|
|
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    """Create a 100-row table named "test" in a database rooted at ``tmp_path``.

    Columns written:

    * ``vector``: a 128-element ``np.random.randn`` sample per row
    * ``id``: alternates 0, 1, 0, 1, ... (``i % 2``)
    * ``text`` / ``text2``: the same randomly assembled four-word sentence
    * ``nested``: a dict ``{"text": <sentence>}`` wrapping that sentence

    The random generators are not seeded, so the sentences differ between runs.
    """
    db = ldb.connect(tmp_path)
    vectors = [np.random.randn(128) for _ in range(100)]

    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    # One word is drawn from each bank, in noun / verb / adverb / adjective
    # order, and the four words are joined with single spaces.
    word_banks = (nouns, verbs, adv, adj)
    text = [
        " ".join(bank[random.randrange(0, 5)] for bank in word_banks)
        for _ in range(100)
    ]

    frame = pd.DataFrame(
        {
            "vector": vectors,
            "id": [i % 2 for i in range(100)],
            "text": text,
            "text2": text,
            "nested": [{"text": sentence} for sentence in text],
        }
    )
    return db.create_table("test", data=frame)
|
|
|
|
|
|
def test_create_index(tmp_path):
    """create_index returns a tantivy.Index and the index path exists afterwards."""
    index_path = str(tmp_path / "index")
    created = ldb.fts.create_index(index_path, ["text"])
    assert isinstance(created, tantivy.Index)
    assert os.path.exists(index_path)
|
|
|
|
|
|
def test_populate_index(tmp_path, table):
    """populate_index returns a value equal to the number of rows in the table."""
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    populated = ldb.fts.populate_index(index, table, ["text"])
    assert populated == len(table)
|
|
|
|
|
|
def test_search_index(tmp_path, table):
    """search_index returns a pair of sequences, each holding ``limit`` entries.

    Relies on the fixture containing at least ten rows that match "puppy".
    """
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    ldb.fts.populate_index(index, table, ["text"])
    # Reload so the freshly populated documents are visible to the search below.
    index.reload()

    results = ldb.fts.search_index(index, query="puppy", limit=10)
    assert len(results) == 2

    first, second = results
    assert len(first) == 10  # row_ids
    assert len(second) == 10  # _distance
|
|
|
|
|
|
def test_create_index_from_table(tmp_path, table):
    """An FTS index built through the table API is searchable and replaceable.

    Covers three things in sequence: searching right after index creation,
    the ValueError raised when the index is created a second time without
    ``replace``, and finding a newly added row after rebuilding with
    ``replace=True``.
    """
    table.create_fts_index("text")
    query = table.search("puppy").limit(10).select(["text"])
    df = query.to_pandas()
    assert len(df) == 10
    assert "text" in df.columns

    # Check whether it can be updated
    gorilla_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": "gorilla",
        "text2": "gorilla",
        "nested": {"text": "gorilla"},
    }
    table.add([gorilla_row])

    # Without replace=True a second creation must be rejected.
    with pytest.raises(ValueError, match="already exists"):
        table.create_fts_index("text")

    table.create_fts_index("text", replace=True)
    gorilla_hits = table.search("gorilla").limit(1).to_pandas()
    assert len(gorilla_hits) == 1
|
|
|
|
|
|
def test_create_index_multiple_columns(tmp_path, table):
    """An index over two text columns returns rows that carry both columns."""
    indexed_columns = ["text", "text2"]
    table.create_fts_index(indexed_columns)

    df = table.search("puppy").limit(10).to_pandas()
    assert len(df) == 10
    for column in ("text", "text2"):
        assert column in df.columns
|
|
|
|
|
|
def test_empty_rs(tmp_path, table, mocker):
    """A search whose index lookup yields nothing produces an empty DataFrame."""
    table.create_fts_index(["text", "text2"])

    # Replace the index lookup with a stub that returns two empty lists.
    no_hits = ([], [])
    mocker.patch("lancedb.fts.search_index", return_value=no_hits)

    result = table.search("puppy").limit(10).to_pandas()
    assert len(result) == 0
|
|
|
|
|
|
def test_nested_schema(tmp_path, table):
    """An FTS index can be created on the dotted path ``nested.text`` and searched."""
    table.create_fts_index("nested.text")
    matches = table.search("puppy").limit(10).to_list()
    assert len(matches) == 10
|
|
|
|
|
|
def test_search_index_with_filter(table):
    """A ``where`` filter on an FTS query gives the same rows with or without duckdb.

    The query is run twice: once while every ``import duckdb`` is forced to
    fail, and once with imports untouched. Both result sets must contain only
    rows with ``id == 1`` and must be equal to each other.
    """
    table.create_fts_index("text")
    orig_import = __import__

    def import_mock(name, *args, **kwargs):
        """Stand in for ``builtins.__import__`` and refuse to import duckdb."""
        if name == "duckdb":
            raise ImportError("duckdb import is disabled for this test")
        # __import__ takes globals/locals/fromlist/level positionally or by
        # keyword; forward both forms so unrelated imports keep working.
        return orig_import(name, *args, **kwargs)

    # no duckdb
    with mock.patch("builtins.__import__", side_effect=import_mock):
        rs = table.search("puppy").where("id=1").limit(10).to_list()
        for r in rs:
            assert r["id"] == 1

    # yes duckdb
    rs2 = table.search("puppy").where("id=1").limit(10).to_list()
    for r in rs2:
        assert r["id"] == 1

    assert rs == rs2
|