mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-11 14:22:59 +00:00
Basic full text search capabilities (#62)
This is v1 of integrating full text search index into LanceDB.
# API
The query API is roughly the same as before, except if the input is text
instead of a vector we assume that its fts search.
## Example
If `table` is a LanceDB LanceTable, then:
Build index: `table.create_fts_index("text")`
Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()`
# Implementation
Here we use the tantivy-py package to build the index. We then use the
row id's as the full-text-search index's doc id then we just do a Take
operation to fetch the rows.
# Limitations
1. don't support incremental row appends yet. New data won't show up in
search
2. local filesystem only
3. requires building tantivy explicitly
---------
Co-authored-by: Chang She <chang@lancedb.com>
This commit is contained in:
@@ -14,6 +14,7 @@ from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
|
||||
from .common import VECTOR_COLUMN_NAME
|
||||
|
||||
@@ -131,7 +132,6 @@ class LanceQueryBuilder:
|
||||
vector and the returned vector.
|
||||
"""
|
||||
ds = self._table.to_lance()
|
||||
# TODO indexed search
|
||||
tbl = ds.to_table(
|
||||
columns=self._columns,
|
||||
filter=self._where,
|
||||
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
|
||||
},
|
||||
)
|
||||
return tbl.to_pandas()
|
||||
|
||||
|
||||
class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
def to_df(self) -> pd.DataFrame:
|
||||
try:
|
||||
import tantivy
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You need to install the `lancedb[fts]` extra to use this method."
|
||||
)
|
||||
|
||||
from .fts import search_index
|
||||
|
||||
# get the index path
|
||||
index_path = self._table._get_fts_index_path()
|
||||
# open the index
|
||||
index = tantivy.Index.open(index_path)
|
||||
# get the scores and doc ids
|
||||
row_ids, scores = search_index(index, self._query, self._limit)
|
||||
scores = pa.array(scores)
|
||||
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
|
||||
output_tbl = output_tbl.append_column("score", scores)
|
||||
return output_tbl.to_pandas()
|
||||
|
||||
Reference in New Issue
Block a user