mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 15:12:53 +00:00
feat: add a basic async python client starting point (#1014)
This changes `lancedb` from a "pure python" setuptools project to a maturin project and adds a rust lancedb dependency. The async python client is extremely minimal (only `connect` and `Connection.table_names` are supported). The purpose of this PR is to get the infrastructure in place for building out the rest of the async client. Although this is not technically a breaking change (no APIs are changing) it is still a considerable change in the way the wheels are built because they now include the native shared library.
This commit is contained in:
184
python/python/tests/test_fts.py
Normal file
184
python/python/tests/test_fts.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# Copyright 2023 LanceDB Developers
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
import random
|
||||
from unittest import mock
|
||||
|
||||
import lancedb as ldb
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("lancedb.fts")
|
||||
tantivy = pytest.importorskip("tantivy")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table(tmp_path) -> ldb.table.LanceTable:
|
||||
db = ldb.connect(tmp_path)
|
||||
vectors = [np.random.randn(128) for _ in range(100)]
|
||||
|
||||
nouns = ("puppy", "car", "rabbit", "girl", "monkey")
|
||||
verbs = ("runs", "hits", "jumps", "drives", "barfs")
|
||||
adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
|
||||
adj = ("adorable", "clueless", "dirty", "odd", "stupid")
|
||||
text = [
|
||||
" ".join(
|
||||
[
|
||||
nouns[random.randrange(0, 5)],
|
||||
verbs[random.randrange(0, 5)],
|
||||
adv[random.randrange(0, 5)],
|
||||
adj[random.randrange(0, 5)],
|
||||
]
|
||||
)
|
||||
for _ in range(100)
|
||||
]
|
||||
table = db.create_table(
|
||||
"test",
|
||||
data=pd.DataFrame(
|
||||
{
|
||||
"vector": vectors,
|
||||
"id": [i % 2 for i in range(100)],
|
||||
"text": text,
|
||||
"text2": text,
|
||||
"nested": [{"text": t} for t in text],
|
||||
}
|
||||
),
|
||||
)
|
||||
return table
|
||||
|
||||
|
||||
def test_create_index(tmp_path):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert isinstance(index, tantivy.Index)
|
||||
assert os.path.exists(str(tmp_path / "index"))
|
||||
|
||||
|
||||
def test_populate_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
|
||||
|
||||
|
||||
def test_search_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
ldb.fts.populate_index(index, table, ["text"])
|
||||
index.reload()
|
||||
results = ldb.fts.search_index(index, query="puppy", limit=10)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 10 # row_ids
|
||||
assert len(results[1]) == 10 # _distance
|
||||
|
||||
|
||||
def test_create_index_from_table(tmp_path, table):
|
||||
table.create_fts_index("text")
|
||||
df = table.search("puppy").limit(10).select(["text"]).to_pandas()
|
||||
assert len(df) <= 10
|
||||
assert "text" in df.columns
|
||||
|
||||
# Check whether it can be updated
|
||||
table.add(
|
||||
[
|
||||
{
|
||||
"vector": np.random.randn(128),
|
||||
"id": 101,
|
||||
"text": "gorilla",
|
||||
"text2": "gorilla",
|
||||
"nested": {"text": "gorilla"},
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="already exists"):
|
||||
table.create_fts_index("text")
|
||||
|
||||
table.create_fts_index("text", replace=True)
|
||||
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
|
||||
|
||||
|
||||
def test_create_index_multiple_columns(tmp_path, table):
|
||||
table.create_fts_index(["text", "text2"])
|
||||
df = table.search("puppy").limit(10).to_pandas()
|
||||
assert len(df) == 10
|
||||
assert "text" in df.columns
|
||||
assert "text2" in df.columns
|
||||
|
||||
|
||||
def test_empty_rs(tmp_path, table, mocker):
|
||||
table.create_fts_index(["text", "text2"])
|
||||
mocker.patch("lancedb.fts.search_index", return_value=([], []))
|
||||
df = table.search("puppy").limit(10).to_pandas()
|
||||
assert len(df) == 0
|
||||
|
||||
|
||||
def test_nested_schema(tmp_path, table):
|
||||
table.create_fts_index("nested.text")
|
||||
rs = table.search("puppy").limit(10).to_list()
|
||||
assert len(rs) == 10
|
||||
|
||||
|
||||
def test_search_index_with_filter(table):
|
||||
table.create_fts_index("text")
|
||||
orig_import = __import__
|
||||
|
||||
def import_mock(name, *args):
|
||||
if name == "duckdb":
|
||||
raise ImportError
|
||||
return orig_import(name, *args)
|
||||
|
||||
# no duckdb
|
||||
with mock.patch("builtins.__import__", side_effect=import_mock):
|
||||
rs = table.search("puppy").where("id=1").limit(10).to_list()
|
||||
for r in rs:
|
||||
assert r["id"] == 1
|
||||
|
||||
# yes duckdb
|
||||
rs2 = table.search("puppy").where("id=1").limit(10).to_list()
|
||||
for r in rs2:
|
||||
assert r["id"] == 1
|
||||
|
||||
assert rs == rs2
|
||||
|
||||
|
||||
def test_null_input(table):
|
||||
table.add(
|
||||
[
|
||||
{
|
||||
"vector": np.random.randn(128),
|
||||
"id": 101,
|
||||
"text": None,
|
||||
"text2": None,
|
||||
"nested": {"text": None},
|
||||
}
|
||||
]
|
||||
)
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
def test_syntax(table):
|
||||
# https://github.com/lancedb/lancedb/issues/769
|
||||
table.create_fts_index("text")
|
||||
with pytest.raises(ValueError, match="Syntax Error"):
|
||||
table.search("they could have been dogs OR cats").limit(10).to_list()
|
||||
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
|
||||
# this should work
|
||||
table.search('"they could have been dogs OR cats"').limit(10).to_list()
|
||||
# this should work too
|
||||
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
|
||||
10
|
||||
).to_list()
|
||||
table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
|
||||
10
|
||||
).to_list()
|
||||
table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
|
||||
10
|
||||
).to_list()
|
||||
Reference in New Issue
Block a user