mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 14:49:57 +00:00
This PR adds support for passing through a set of ordering fields at index time (unsigned ints that tantivy can use as fast_fields) that you can sort your results on at query time. This is useful for cases where you want to get related hits, e.g. by keyword, but order those hits by some other score, such as popularity. For example, search for song descriptions that match on "sad AND jazz AND 1920" and then order those by number of times played. Example usage can be seen in the fts tests. --------- Co-authored-by: Nat Roth <natroth@Nats-MacBook-Pro.local> Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
238 lines
7.2 KiB
Python
238 lines
7.2 KiB
Python
# Copyright 2023 LanceDB Developers
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import os
|
|
import random
|
|
from unittest import mock
|
|
|
|
import lancedb as ldb
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
pytest.importorskip("lancedb.fts")
|
|
tantivy = pytest.importorskip("tantivy")
|
|
|
|
|
|
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    """Create a 100-row table named ``"test"`` filled with random data.

    Every row holds a 128-element random vector, an ``id`` alternating
    between 0 and 1, a four-word sentence ("noun verb adverb adjective")
    stored under ``text``, ``text2`` and ``nested.text``, and a random
    integer ``count`` between 1 and 10000.

    NOTE: neither ``random`` nor ``np.random`` is seeded here, so the
    generated rows differ from run to run.
    """
    num_rows = 100
    # One word is drawn from each pool, in this order, per sentence.
    word_pools = (
        ("puppy", "car", "rabbit", "girl", "monkey"),
        ("runs", "hits", "jumps", "drives", "barfs"),
        ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally."),
        ("adorable", "clueless", "dirty", "odd", "stupid"),
    )

    connection = ldb.connect(tmp_path)
    embeddings = [np.random.randn(128) for _ in range(num_rows)]
    sentences = [
        " ".join([random.choice(pool) for pool in word_pools])
        for _ in range(num_rows)
    ]
    play_counts = [random.randint(1, 10000) for _ in range(num_rows)]

    frame = pd.DataFrame(
        {
            "vector": embeddings,
            "id": [row_number % 2 for row_number in range(num_rows)],
            "text": sentences,
            "text2": sentences,
            "nested": [{"text": sentence} for sentence in sentences],
            "count": play_counts,
        }
    )
    return connection.create_table("test", data=frame)
|
|
|
|
|
|
def test_create_index(tmp_path):
    """``create_index`` returns a ``tantivy.Index`` and creates its path."""
    index_location = str(tmp_path / "index")
    created = ldb.fts.create_index(index_location, ["text"])
    assert isinstance(created, tantivy.Index)
    assert os.path.exists(index_location)
|
|
|
|
|
|
def test_populate_index(tmp_path, table):
    """``populate_index`` reports a value equal to the table's length."""
    index_location = str(tmp_path / "index")
    fts_index = ldb.fts.create_index(index_location, ["text"])
    indexed_rows = ldb.fts.populate_index(fts_index, table, ["text"])
    assert indexed_rows == len(table)
|
|
|
|
|
|
def test_search_index(tmp_path, table):
    """Searching a populated index for "puppy" yields two 10-item sequences."""
    index_location = str(tmp_path / "index")
    fts_index = ldb.fts.create_index(index_location, ["text"])
    ldb.fts.populate_index(fts_index, table, ["text"])
    # Reload the index after populating it, before issuing the query.
    fts_index.reload()

    hits = ldb.fts.search_index(fts_index, query="puppy", limit=10)

    # The result is a pair of parallel sequences, one entry per hit.
    assert len(hits) == 2
    first_part = hits[0]  # row_ids
    second_part = hits[1]  # _distance
    assert len(first_part) == 10
    assert len(second_part) == 10
|
|
|
|
|
|
def test_search_ordering_field_index_table(tmp_path, table):
    """Table-level FTS search honours ``ordering_field_name``.

    Builds an FTS index on ``text`` with ``count`` registered as an
    ordering field, searches for "puppy" ordered by ``count``, and checks
    that every returned row mentions "puppy" and that rows arrive in
    descending ``count`` order.
    """
    table.create_fts_index("text", ordering_field_names=["count"])
    rows = (
        table.search("puppy", ordering_field_name="count")
        .limit(20)
        .select(["text", "count"])
        .to_list()
    )
    # Without this guard an empty result set would satisfy both checks
    # below vacuously, hiding a search that silently returns nothing.
    assert rows, "expected at least one hit for 'puppy'"
    for r in rows:
        assert "puppy" in r["text"]
    # sorted() is stable, so equality holds only when the rows are already
    # in non-increasing ``count`` order.
    assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
|
|
|
|
|
def test_search_ordering_field_index(tmp_path, table):
    """Low-level index search honours ``ordering_field``.

    Creates and populates an index with ``count`` as an ordering field,
    searches for "puppy" ordered by ``count``, then fetches the hit rows
    from the underlying dataset and checks their content and order.
    """
    index_location = str(tmp_path / "index")
    fts_index = ldb.fts.create_index(
        index_location, ["text"], ordering_fields=["count"]
    )
    ldb.fts.populate_index(fts_index, table, ["text"], ordering_fields=["count"])
    fts_index.reload()

    hits = ldb.fts.search_index(
        fts_index, query="puppy", limit=10, ordering_field="count"
    )
    assert len(hits) == 2
    row_ids = hits[0]
    assert len(row_ids) == 10  # row_ids
    assert len(hits[1]) == 10  # _distance

    matched = table.to_lance().take(row_ids).to_pylist()
    for row in matched:
        assert "puppy" in row["text"]

    # Rows must arrive in non-increasing ``count`` order.
    counts = [row["count"] for row in matched]
    assert sorted(counts, reverse=True) == counts
|
|
|
|
|
|
def test_create_index_from_table(tmp_path, table):
    """An FTS index built from the table is searchable and replaceable.

    Also checks that re-creating an existing index without ``replace=True``
    raises ``ValueError`` and that a rebuilt index finds a newly added row.
    """
    table.create_fts_index("text")
    puppy_hits = table.search("puppy").limit(10).select(["text"]).to_pandas()
    assert len(puppy_hits) <= 10
    assert "text" in puppy_hits.columns

    # Check whether it can be updated
    gorilla_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": "gorilla",
        "text2": "gorilla",
        "nested": {"text": "gorilla"},
        "count": 10,
    }
    table.add([gorilla_row])

    # A second create without replace=True must be rejected.
    with pytest.raises(ValueError, match="already exists"):
        table.create_fts_index("text")

    table.create_fts_index("text", replace=True)
    gorilla_hits = table.search("gorilla").limit(1).to_pandas()
    assert len(gorilla_hits) == 1
|
|
|
|
|
|
def test_create_index_multiple_columns(tmp_path, table):
    """An index over ``text`` and ``text2`` returns 10 rows with both columns."""
    indexed_columns = ["text", "text2"]
    table.create_fts_index(indexed_columns)
    frame = table.search("puppy").limit(10).to_pandas()
    assert len(frame) == 10
    for column_name in ("text", "text2"):
        assert column_name in frame.columns
|
|
|
|
|
|
def test_empty_rs(tmp_path, table, mocker):
    """A search whose index lookup returns no hits yields an empty frame."""
    table.create_fts_index(["text", "text2"])
    # Replace the index lookup with one that reports no hits at all.
    no_hits = ([], [])
    mocker.patch("lancedb.fts.search_index", return_value=no_hits)
    query = table.search("puppy").limit(10)
    frame = query.to_pandas()
    assert len(frame) == 0
|
|
|
|
|
|
def test_nested_schema(tmp_path, table):
    """An FTS index on the nested ``nested.text`` field returns 10 hits."""
    table.create_fts_index("nested.text")
    query = table.search("puppy").limit(10)
    hits = query.to_list()
    assert len(hits) == 10
|
|
|
|
|
|
def test_search_index_with_filter(table):
    """FTS search combined with a ``where`` filter, with and without duckdb.

    The filtered search runs once while ``import duckdb`` is forced to fail
    and once with imports untouched; both runs must return only ``id == 1``
    rows and identical results. A final query checks that
    ``with_row_id(True)`` adds a non-null ``_rowid`` to each row.
    """
    table.create_fts_index("text")
    orig_import = __import__

    def import_mock(name, *args, **kwargs):
        # Stand-in for builtins.__import__ that hides duckdb. Keyword
        # arguments (globals, locals, fromlist, level) are forwarded so that
        # callers using the keyword form of __import__ still work while the
        # patch is active.
        if name == "duckdb":
            raise ImportError
        return orig_import(name, *args, **kwargs)

    # no duckdb
    with mock.patch("builtins.__import__", side_effect=import_mock):
        rs = table.search("puppy").where("id=1").limit(10)
        # test schema: apart from the extra "score" column, the result
        # schema must equal the table schema
        assert rs.to_arrow().drop("score").schema.equals(table.schema)

        rs = rs.to_list()
        for r in rs:
            assert r["id"] == 1

    # yes duckdb
    rs2 = table.search("puppy").where("id=1").limit(10).to_list()
    for r in rs2:
        assert r["id"] == 1

    # Both code paths must agree on the result.
    assert rs == rs2
    rs = table.search("puppy").where("id=1").with_row_id(True).limit(10).to_list()
    for r in rs:
        assert r["id"] == 1
        assert r["_rowid"] is not None
|
|
|
|
|
|
def test_null_input(table):
    """Creating an FTS index does not raise when a row has null text values."""
    null_text_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": None,
        "text2": None,
        "nested": {"text": None},
        "count": 7,
    }
    table.add([null_text_row])
    # The test passes as long as index creation completes without raising.
    table.create_fts_index("text")
|
|
|
|
|
|
def test_syntax(table):
    """Query-syntax handling for term and phrase queries.

    A bare sentence containing ``OR`` must raise ``ValueError`` matching
    "Syntax Error"; the quoted, parenthesised and ``phrase_query()`` forms
    of similar queries must run without raising.
    """
    # https://github.com/lancedb/lancedb/issues/769
    table.create_fts_index("text")
    with pytest.raises(ValueError, match="Syntax Error"):
        table.search("they could have been dogs OR cats").limit(10).to_list()

    # these should work

    # terms queries
    table.search('"they could have been dogs" OR cats').limit(10).to_list()
    table.search("(they AND could) OR (have AND been AND dogs) OR cats").limit(
        10
    ).to_list()

    # phrase queries
    table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
    table.search('"they could have been dogs OR cats"').limit(10).to_list()
    table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
        10
    ).to_list()
    # The original repeated this exact statement twice in a row; one call
    # exercises the same path.
    table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
        10
    ).to_list()
|