mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 13:29:57 +00:00
Add `to_list` to return query results as list of python dict (so we're not too pandas-centric). Closes #555 Add `to_pandas` API and add deprecation warning on `to_df`. Closes #545 Co-authored-by: Chang She <chang@lancedb.com>
92 lines
2.9 KiB
Python
92 lines
2.9 KiB
Python
# Copyright 2023 LanceDB Developers
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import os
|
|
import random
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
import tantivy
|
|
|
|
import lancedb as ldb
|
|
import lancedb.fts
|
|
|
|
|
|
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    """Create a 100-row LanceDB table named "test" under ``tmp_path``.

    Each row holds a random 128-dimensional vector plus a four-word
    sentence stored in both the ``text`` and ``text2`` columns.

    The noun of each sentence is assigned round-robin instead of being
    sampled, so every noun (including "puppy") appears in exactly
    ``num_rows / len(nouns)`` = 20 rows. The tests in this module search
    for "puppy" with ``limit=10`` and assert that exactly 10 rows come
    back; with purely random nouns there was a small but non-zero chance
    of generating fewer than 10 matching rows, making those tests flaky.
    """
    num_rows = 100
    db = ldb.connect(tmp_path)
    vectors = [np.random.randn(128) for _ in range(num_rows)]

    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")
    text = [
        " ".join(
            [
                # Deterministic noun: guarantees enough hits per noun.
                nouns[i % len(nouns)],
                # The remaining words are not asserted on, so they can
                # stay random.
                random.choice(verbs),
                random.choice(adv),
                random.choice(adj),
            ]
        )
        for i in range(num_rows)
    ]
    table = db.create_table(
        "test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
    )
    return table
|
|
|
|
|
|
def test_create_index(tmp_path):
    """Creating an index yields a tantivy.Index and a path on disk."""
    index_dir = str(tmp_path / "index")
    created = ldb.fts.create_index(index_dir, ["text"])
    assert isinstance(created, tantivy.Index)
    # The index location must exist once create_index has returned.
    assert os.path.exists(index_dir)
|
|
|
|
|
|
def test_populate_index(tmp_path, table):
    """populate_index must return a value equal to len(table)."""
    index_dir = str(tmp_path / "index")
    fts_index = ldb.fts.create_index(index_dir, ["text"])
    populated = ldb.fts.populate_index(fts_index, table, ["text"])
    assert populated == len(table)
|
|
|
|
|
|
def test_search_index(tmp_path, table):
    """Searching a populated index returns a pair of 10-element sequences."""
    fts_index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    ldb.fts.populate_index(fts_index, table, ["text"])
    # Reload the index after populating it and before searching.
    fts_index.reload()

    hits = ldb.fts.search_index(fts_index, query="puppy", limit=10)

    # The result is a two-element container: (row_ids, _distance).
    assert len(hits) == 2
    row_ids, distances = hits
    assert len(row_ids) == 10
    assert len(distances) == 10
|
|
|
|
|
|
def test_create_index_from_table(tmp_path, table):
    """A single-column FTS index built via the table supports text search."""
    table.create_fts_index("text")

    query = table.search("puppy").limit(10).select(["text"])
    frame = query.to_pandas()

    assert len(frame) == 10
    assert "text" in frame.columns
|
|
|
|
|
|
def test_create_index_multiple_columns(tmp_path, table):
    """An FTS index over two columns returns both columns in the results."""
    indexed_columns = ["text", "text2"]
    table.create_fts_index(indexed_columns)

    frame = table.search("puppy").limit(10).to_pandas()

    assert len(frame) == 10
    for column in ("text", "text2"):
        assert column in frame.columns
|
|
|
|
|
|
def test_empty_rs(tmp_path, table, mocker):
    """An empty result from search_index produces a zero-row DataFrame."""
    table.create_fts_index(["text", "text2"])

    # Patch the search so it reports no row ids and no scores.
    no_hits = ([], [])
    mocker.patch("lancedb.fts.search_index", return_value=no_hits)

    frame = table.search("puppy").limit(10).to_pandas()
    assert len(frame) == 0
|