From f485378ea4ff4549f21364e145db8248c4875f81 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Wed, 24 May 2023 22:25:31 -0600
Subject: [PATCH] Basic full text search capabilities (#62)

This is v1 of integrating full text search index into LanceDB.

# API
The query API is roughly the same as before, except that if the input is
text instead of a vector we assume that it's a full text search (FTS).

## Example
If `table` is a LanceDB LanceTable, then:

Build index: `table.create_fts_index("text")`

Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()`

# Implementation
Here we use the tantivy-py package to build the index. We use the row
ids as the full-text-search index's doc ids, and then do a Take
operation to fetch the matching rows.

# Limitations

1. incremental row appends are not supported yet. New data won't show up
in search results
2. local filesystem only
3. requires building tantivy explicitly

---------

Co-authored-by: Chang She <chang@lancedb.com>
---
 .github/workflows/python.yml |   6 +-
 docs/mkdocs.yml              |   1 +
 docs/src/fts.md              |  50 ++++++++++++++
 docs/src/index.md            |   1 +
 python/lancedb/fts.py        | 122 +++++++++++++++++++++++++++++++++++
 python/lancedb/query.py      |  25 ++++++-
 python/lancedb/table.py      |  32 ++++++++-
 python/pyproject.toml        |   4 ++
 python/tests/test_fts.py     |  84 ++++++++++++++++++++++++
 9 files changed, 319 insertions(+), 6 deletions(-)
 create mode 100644 docs/src/fts.md
 create mode 100644 python/lancedb/fts.py
 create mode 100644 python/tests/test_fts.py

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index c66d81f2..88ee4977 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -30,7 +30,7 @@ jobs:
         python-version: 3.${{ matrix.python-minor-version }}
     - name: Install lancedb
       run: |
-        pip install -e .
+        pip install -e ".[fts]"
         pip install pytest
     - name: Run tests
       run: pytest -x -v --durations=30 tests
@@ -49,10 +49,10 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
-        python-version: "3.10"
+        python-version: "3.11"
     - name: Install lancedb
       run: |
-        pip install -e .
+        pip install -e ".[fts]"
         pip install pytest
     - name: Run tests
       run: pytest -x -v --durations=30 tests
\ No newline at end of file
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 127772a8..39763513 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -19,6 +19,7 @@ nav:
 - Basics: basic.md
 - Embeddings: embedding.md
 - Indexing: ann_indexes.md
+- Full-text search: fts.md
 - Integrations: integrations.md
 - Python API: python.md
 
diff --git a/docs/src/fts.md b/docs/src/fts.md
new file mode 100644
index 00000000..08afd947
--- /dev/null
+++ b/docs/src/fts.md
@@ -0,0 +1,50 @@
+# [EXPERIMENTAL] Full text search
+
+LanceDB now provides experimental support for full text search.
+This is currently Python only. We plan to push the integration down to Rust in the future
+to make this available for JS as well.
+
+## Installation
+
+To use full text search, you must install the fts optional dependencies:
+
+`pip install lancedb[fts]`
+
+
+## Quickstart
+
+Assume:
+1. `table` is a LanceDB Table
+2. `text` is the name of the Table column that we want to index
+
+To create the index:
+
+```python
+table.create_fts_index("text")
+```
+
+To search:
+
+```python
+df = table.search("puppy").limit(10).select(["text"]).to_df()
+```
+
+LanceDB automatically uses the FTS index when the search input is a `str`.
+
+## Multiple text columns
+
+If you have multiple columns to index, pass them all as a list to `create_fts_index`:
+
+```python
+table.create_fts_index(["text1", "text2"])
+```
+
+Note that the search API call does not change - you can search over all indexed columns at once.
+
+## Current limitations
+
+1. Incremental writes are not supported yet.
+If you add data after creating the FTS index, it won't be reflected
+in search results until you do a full reindex.
+
+2. Only local filesystem paths are currently supported for the FTS index.
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index 2649be27..24b86488 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -45,5 +45,6 @@ We will be adding completed demo apps built using LanceDB.
 * [`Basic Operations`](basic.md) - basic functionality of LanceDB.
 * [`Embedding Functions`](embedding.md) - functions for working with embeddings.
 * [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
+* [`Full text search`](fts.md) - [EXPERIMENTAL] full-text search API
 * [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
 * [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.
diff --git a/python/lancedb/fts.py b/python/lancedb/fts.py
new file mode 100644
index 00000000..897e192e
--- /dev/null
+++ b/python/lancedb/fts.py
@@ -0,0 +1,122 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""Full text search index using tantivy-py"""
+import os
+from typing import List, Tuple
+
+import pyarrow as pa
+import tantivy
+
+from .table import LanceTable
+
+
+def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
+    """
+    Build an empty tantivy index stored at the given directory
+
+    Parameters
+    ----------
+    index_path : str
+        Directory holding the index files; created when missing
+    text_fields : List[str]
+        Names of the text fields to declare in the index schema
+
+    Returns
+    -------
+    index : tantivy.Index
+        The index object, with no documents added by this function
+    """
+    builder = tantivy.SchemaBuilder()
+    # reserved integer field that will be populated with the row id
+    builder.add_integer_field("doc_id", stored=True)
+    # one stored text field per requested column
+    for field_name in text_fields:
+        builder.add_text_field(field_name, stored=True)
+    index_schema = builder.build()
+    # make sure the target directory exists before handing the path to
+    # tantivy (a no-op when it is already there)
+    os.makedirs(index_path, exist_ok=True)
+    return tantivy.Index(index_schema, path=index_path)
+
+
+def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
+    """Populate an index with data from a LanceTable
+
+    Parameters
+    ----------
+    index : tantivy.Index
+        The index object
+    table : LanceTable
+        The table to index
+    fields : List[str]
+        List of fields to index; each must be a string or large string column
+
+    Returns
+    -------
+    num_rows : int
+        Number of rows indexed; each doc_id is the row's position in scan order
+    """
+    for name in fields:
+        f = table.schema.field(name)  # raises KeyError if not found
+        if not (pa.types.is_string(f.type) or pa.types.is_large_string(f.type)):
+            raise TypeError(f"Field {name} is not a string type")
+    writer = index.writer()
+    row_id = 0
+    for b in table.to_lance().to_batches(columns=fields):
+        cols = [(name, b[name].to_pylist()) for name in fields]  # once per batch
+        for i in range(b.num_rows):
+            doc = tantivy.Document()
+            doc.add_integer("doc_id", row_id)
+            for name, values in cols:
+                if values[i] is not None:  # null cells have no text to index
+                    doc.add_text(name, values[i])
+            writer.add_document(doc)
+            row_id += 1
+    writer.commit()
+    return row_id
+
+
+def search_index(
+    index: tantivy.Index, query: str, limit: int = 10
+) -> Tuple[Tuple[int, ...], Tuple[float, ...]]:
+    """
+    Search an index for a query
+
+    Parameters
+    ----------
+    index : tantivy.Index
+        The index object
+    query : str
+        The query string
+    limit : int
+        The maximum number of results to return
+
+    Returns
+    -------
+    ids_and_scores : tuple[tuple[int, ...], tuple[float, ...]]
+        A tuple of two equal-length tuples, the first containing the
+        document ids and the second containing the scores. Both are
+        empty when nothing matches, so callers can always unpack two values.
+    """
+    searcher = index.searcher()
+    parsed_query = index.parse_query(query)
+    # get top results
+    hits = searcher.search(parsed_query, limit).hits
+    if not hits:
+        # zip(*[]) would collapse to (), which cannot be unpacked into a pair
+        return (), ()
+    doc_ids, scores = zip(
+        *((searcher.doc(address)["doc_id"][0], score) for score, address in hits)
+    )
+    return doc_ids, scores
diff --git a/python/lancedb/query.py b/python/lancedb/query.py
index a0411c06..c3ca8ca8 100644
--- a/python/lancedb/query.py
+++ b/python/lancedb/query.py
@@ -14,6 +14,7 @@ from __future__ import annotations
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 
 from .common import VECTOR_COLUMN_NAME
 
@@ -131,7 +132,6 @@ class LanceQueryBuilder:
         vector and the returned vector.
         """
         ds = self._table.to_lance()
-        # TODO indexed search
         tbl = ds.to_table(
             columns=self._columns,
             filter=self._where,
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
             },
         )
         return tbl.to_pandas()
+
+
+class LanceFtsQueryBuilder(LanceQueryBuilder):
+    """Query builder for text queries; the `where` filter is not applied here."""
+
+    def to_df(self) -> pd.DataFrame:
+        """Return the rows matched by the tantivy index plus a "score" column."""
+        try:
+            import tantivy
+        except ImportError:
+            raise ImportError(
+                "You need to install the `lancedb[fts]` extra to use this method."
+            )
+        from .fts import search_index
+
+        index = tantivy.Index.open(self._table._get_fts_index_path())
+        # search_index may hand back an empty tuple when nothing matches
+        hits = search_index(index, self._query, self._limit)
+        row_ids, scores = hits if hits else ((), ())
+        # explicit type so "score" stays float64 even when there are no hits
+        scores = pa.array(scores, type=pa.float64())
+        output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
+        return output_tbl.append_column("score", scores).to_pandas()
diff --git a/python/lancedb/table.py b/python/lancedb/table.py
index d743f733..14a7b91f 100644
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -14,7 +14,9 @@
 from __future__ import annotations
 
 import os
+import shutil
 from functools import cached_property
+from typing import List, Union
 
 import lance
 import numpy as np
@@ -24,7 +26,8 @@ from lance import LanceDataset
 from lance.vector import vec_to_table
 
 from .common import DATA, VEC, VECTOR_COLUMN_NAME
-from .query import LanceQueryBuilder
+from .query import LanceFtsQueryBuilder, LanceQueryBuilder
+from .util import get_uri_scheme
 
 
 def _sanitize_data(data, schema):
@@ -130,6 +133,27 @@ class LanceTable:
         )
         self._reset_dataset()
 
+    def create_fts_index(self, field_names: Union[str, List[str]]):
+        """Create a full-text search index on the table, replacing any existing one.
+
+        Warning - this API is highly experimental and is highly likely to change
+        in the future.
+
+        Parameters
+        ----------
+        field_names: str or list of str
+            The name(s) of the field to index.
+        """
+        from .fts import create_index, populate_index
+        if isinstance(field_names, str):
+            field_names = [field_names]
+        index_path = self._get_fts_index_path()
+        shutil.rmtree(index_path, ignore_errors=True)  # avoid duplicate documents
+        populate_index(create_index(index_path, field_names), self, field_names)
+
+    def _get_fts_index_path(self) -> str:  # tantivy index directory for this table
+        return os.path.join(self._dataset_uri, "_indices", "tantivy")
+
     @cached_property
     def _dataset(self) -> LanceDataset:
         return lance.dataset(self._dataset_uri, version=self._version)
@@ -158,7 +182,7 @@ class LanceTable:
         self._reset_dataset()
         return len(self)
 
-    def search(self, query: VEC) -> LanceQueryBuilder:
+    def search(self, query: Union[VEC, str]) -> LanceQueryBuilder:
         """Create a search query to find the nearest neighbors
         of the given query vector.
 
@@ -174,6 +198,10 @@ class LanceTable:
         and also the "score" column which is the distance between the query
         vector and the returned vector.
         """
+        if isinstance(query, str):
+            # fts
+            return LanceFtsQueryBuilder(self, query)
+
         if isinstance(query, list):
             query = np.array(query)
         if isinstance(query, np.ndarray):
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 943a3431..17290d16 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -45,6 +45,10 @@ dev = [
 docs = [
     "mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"
 ]
+fts = [
+    # tantivy 0.19.2, pinned by commit ("@<rev>"; a "#<rev>" fragment is ignored)
+    "tantivy@git+https://github.com/quickwit-oss/tantivy-py@164adc87e1a033117001cf70e38c82a53014d985"
+]
 
 [build-system]
 requires = [
diff --git a/python/tests/test_fts.py b/python/tests/test_fts.py
new file mode 100644
index 00000000..237aff1a
--- /dev/null
+++ b/python/tests/test_fts.py
@@ -0,0 +1,84 @@
+# Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import os
+import random
+
+import numpy as np
+import pandas as pd
+import pytest
+import tantivy
+
+import lancedb as ldb
+import lancedb.fts
+
+
+@pytest.fixture
+def table(tmp_path) -> ldb.table.LanceTable:
+    """Table of 100 rows: a random vector plus two identical text columns."""
+    db = ldb.connect(tmp_path)
+    vectors = [np.random.randn(128) for _ in range(100)]
+
+    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
+    verbs = ("runs", "hits", "jumps", "drives", "barfs")
+    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
+    adj = ("adorable", "clueless", "dirty", "odd", "stupid")
+
+    # Seeded generator: the sentences are the same on every run.
+    rng = random.Random(0)
+    # Nouns are cycled rather than sampled, so each one appears in exactly 20
+    # rows. The tests assert on 10 "puppy" hits, which unseeded sampling only
+    # made very likely instead of certain.
+    text = [
+        " ".join([nouns[i % 5], rng.choice(verbs), rng.choice(adv), rng.choice(adj)])
+        for i in range(100)
+    ]
+    table = db.create_table(
+        "test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
+    )
+    return table
+
+
+def test_create_index(tmp_path):
+    index_dir = str(tmp_path / "index")  # create_index is expected to make this
+    assert isinstance(ldb.fts.create_index(index_dir, ["text"]), tantivy.Index)
+    assert os.path.exists(index_dir)
+
+
+def test_populate_index(tmp_path, table):
+    fts_index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
+    assert len(table) == ldb.fts.populate_index(fts_index, table, ["text"])
+
+
+def test_search_index(tmp_path, table):
+    fts_index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
+    ldb.fts.populate_index(fts_index, table, ["text"])
+    # reload before searching (presumably so the commit above becomes visible)
+    fts_index.reload()
+    row_ids, scores = ldb.fts.search_index(fts_index, query="puppy", limit=10)
+    assert len(row_ids) == 10
+    assert len(scores) == 10
+
+
+def test_create_index_from_table(tmp_path, table):
+    table.create_fts_index("text")
+    query = table.search("puppy").limit(10).select(["text"])
+    result = query.to_df()
+    assert len(result) == 10 and "text" in result.columns
+
+
+def test_create_index_multiple_columns(tmp_path, table):
+    indexed = ["text", "text2"]
+    table.create_fts_index(indexed)
+    result = table.search("puppy").limit(10).to_df()
+    assert len(result) == 10
+    assert all(name in result.columns for name in indexed)