Basic full text search capabilities (#62)

This is v1 of integrating full text search index into LanceDB.

# API
The query API is roughly the same as before, except if the input is text
instead of a vector we assume that its fts search.

## Example
If `table` is a LanceDB LanceTable, then:

Build index: `table.create_fts_index("text")`

Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()`

# Implementation
Here we use the tantivy-py package to build the index. We then use the
row id's as the full-text-search index's doc id then we just do a Take
operation to fetch the rows.

# Limitations

1. don't support incremental row appends yet. New data won't show up in
search
2. local filesystem only 
3. requires building tantivy explicitly

---------

Co-authored-by: Chang She <chang@lancedb.com>
This commit is contained in:
Chang She
2023-05-24 22:25:31 -06:00
committed by GitHub
parent f923cfe47f
commit f485378ea4
9 changed files with 319 additions and 6 deletions

View File

@@ -14,6 +14,7 @@ from __future__ import annotations
import numpy as np
import pandas as pd
import pyarrow as pa
from .common import VECTOR_COLUMN_NAME
@@ -131,7 +132,6 @@ class LanceQueryBuilder:
vector and the returned vector.
"""
ds = self._table.to_lance()
# TODO indexed search
tbl = ds.to_table(
columns=self._columns,
filter=self._where,
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
},
)
return tbl.to_pandas()
class LanceFtsQueryBuilder(LanceQueryBuilder):
def to_df(self) -> pd.DataFrame:
try:
import tantivy
except ImportError:
raise ImportError(
"You need to install the `lancedb[fts]` extra to use this method."
)
from .fts import search_index
# get the index path
index_path = self._table._get_fts_index_path()
# open the index
index = tantivy.Index.open(index_path)
# get the scores and doc ids
row_ids, scores = search_index(index, self._query, self._limit)
scores = pa.array(scores)
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
output_tbl = output_tbl.append_column("score", scores)
return output_tbl.to_pandas()