From f485378ea4ff4549f21364e145db8248c4875f81 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Wed, 24 May 2023 22:25:31 -0600 Subject: [PATCH] Basic full text search capabilities (#62) This is v1 of integrating a full text search index into LanceDB. # API The query API is roughly the same as before, except if the input is text instead of a vector we assume that it's an FTS search. ## Example If `table` is a LanceDB LanceTable, then: Build index: `table.create_fts_index("text")` Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()` # Implementation Here we use the tantivy-py package to build the index. We then use the row ids as the full-text-search index's doc ids, and then do a Take operation to fetch the rows. # Limitations 1. don't support incremental row appends yet. New data won't show up in search 2. local filesystem only 3. requires building tantivy explicitly --------- Co-authored-by: Chang She --- .github/workflows/python.yml | 6 +- docs/mkdocs.yml | 1 + docs/src/fts.md | 50 ++++++++++++++ docs/src/index.md | 1 + python/lancedb/fts.py | 122 +++++++++++++++++++++++++++++++++++ python/lancedb/query.py | 25 ++++++- python/lancedb/table.py | 32 ++++++++- python/pyproject.toml | 4 ++ python/tests/test_fts.py | 84 ++++++++++++++++++++++++ 9 files changed, 319 insertions(+), 6 deletions(-) create mode 100644 docs/src/fts.md create mode 100644 python/lancedb/fts.py create mode 100644 python/tests/test_fts.py diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c66d81f2..88ee4977 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -30,7 +30,7 @@ jobs: python-version: 3.${{ matrix.python-minor-version }} - name: Install lancedb run: | - pip install -e . 
+ pip install -e ".[fts]" pip install pytest - name: Run tests run: pytest -x -v --durations=30 tests @@ -49,10 +49,10 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11" - name: Install lancedb run: | - pip install -e . + pip install -e ".[fts]" pip install pytest - name: Run tests run: pytest -x -v --durations=30 tests \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 127772a8..39763513 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -19,6 +19,7 @@ nav: - Basics: basic.md - Embeddings: embedding.md - Indexing: ann_indexes.md +- Full-text search: fts.md - Integrations: integrations.md - Python API: python.md diff --git a/docs/src/fts.md b/docs/src/fts.md new file mode 100644 index 00000000..08afd947 --- /dev/null +++ b/docs/src/fts.md @@ -0,0 +1,50 @@ +# [EXPERIMENTAL] Full text search + +LanceDB now provides experimental support for full text search. +This is currently Python only. We plan to push the integration down to Rust in the future +to make this available for JS as well. + +## Installation + +To use full text search, you must install the fts optional dependencies: + +`pip install lancedb[fts]` + + +## Quickstart + +Assume: +1. `table` is a LanceDB Table +2. `text` is the name of the Table column that we want to index + +To create the index: + +```python +table.create_fts_index("text") +``` + +To search: + +```python +df = table.search("puppy").limit(10).select(["text"]).to_df() +``` + +LanceDB automatically looks for an FTS index if the input is a `str`. + +## Multiple text columns + +If you have multiple columns to index, pass them all as a list to `create_fts_index`: + +```python +table.create_fts_index(["text1", "text2"]) +``` + +Note that the search API call does not change - you can search over all indexed columns at once. + +## Current limitations + +1. We do not yet support incremental writes. 
+If you add data after fts index creation, it won't be reflected +in search results until you do a full reindex. + +2. We currently only support local filesystem paths for the fts index. \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 2649be27..24b86488 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -45,5 +45,6 @@ We will be adding completed demo apps built using LanceDB. * [`Basic Operations`](basic.md) - basic functionality of LanceDB. * [`Embedding Functions`](embedding.md) - functions for working with embeddings. * [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries. +* [`Full text search`](fts.md) - [EXPERIMENTAL] full-text search API * [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem. * [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK. diff --git a/python/lancedb/fts.py b/python/lancedb/fts.py new file mode 100644 index 00000000..897e192e --- /dev/null +++ b/python/lancedb/fts.py @@ -0,0 +1,122 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Full text search index using tantivy-py""" +import os +from typing import List, Tuple + +import pyarrow as pa +import tantivy + +from .table import LanceTable + + +def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index: + """ + Create a new Index (not populated) + + Parameters + ---------- + index_path : str + Path to the index directory + text_fields : List[str] + List of text fields to index + + Returns + ------- + index : tantivy.Index + The index object (not yet populated) + """ + # Declaring our schema. + schema_builder = tantivy.SchemaBuilder() + # special field that we'll populate with row_id + schema_builder.add_integer_field("doc_id", stored=True) + # data fields + for name in text_fields: + schema_builder.add_text_field(name, stored=True) + schema = schema_builder.build() + os.makedirs(index_path, exist_ok=True) + index = tantivy.Index(schema, path=index_path) + return index + + +def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int: + """ + Populate an index with data from a LanceTable + + Parameters + ---------- + index : tantivy.Index + The index object + table : LanceTable + The table to index + fields : List[str] + List of fields to index + """ + # first check the fields exist and are string or large string type + for name in fields: + f = table.schema.field(name) # raises KeyError if not found + if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type): + raise TypeError(f"Field {name} is not a string type") + + # create a tantivy writer + writer = index.writer() + # write data into index + dataset = table.to_lance() + row_id = 0 + for b in dataset.to_batches(columns=fields): + for i in range(b.num_rows): + doc = tantivy.Document() + doc.add_integer("doc_id", row_id) + for name in fields: + doc.add_text(name, b[name][i].as_py()) + writer.add_document(doc) + row_id += 1 + # commit changes + writer.commit() + return row_id + + +def search_index( + index: tantivy.Index, query: 
str, limit: int = 10 +) -> Tuple[Tuple[int], Tuple[float]]: + """ + Search an index for a query + + Parameters + ---------- + index : tantivy.Index + The index object + query : str + The query string + limit : int + The maximum number of results to return + + Returns + ------- + ids_and_score: list[tuple[int], tuple[float]] + A tuple of two tuples, the first containing the document ids + and the second containing the scores + """ + searcher = index.searcher() + query = index.parse_query(query) + # get top results + results = searcher.search(query, limit) + return tuple( + zip( + *[ + (searcher.doc(doc_address)["doc_id"][0], score) + for score, doc_address in results.hits + ] + ) + ) diff --git a/python/lancedb/query.py b/python/lancedb/query.py index a0411c06..c3ca8ca8 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -14,6 +14,7 @@ from __future__ import annotations import numpy as np import pandas as pd +import pyarrow as pa from .common import VECTOR_COLUMN_NAME @@ -131,7 +132,6 @@ class LanceQueryBuilder: vector and the returned vector. """ ds = self._table.to_lance() - # TODO indexed search tbl = ds.to_table( columns=self._columns, filter=self._where, @@ -145,3 +145,26 @@ class LanceQueryBuilder: }, ) return tbl.to_pandas() + + +class LanceFtsQueryBuilder(LanceQueryBuilder): + def to_df(self) -> pd.DataFrame: + try: + import tantivy + except ImportError: + raise ImportError( + "You need to install the `lancedb[fts]` extra to use this method." 
+ ) + + from .fts import search_index + + # get the index path + index_path = self._table._get_fts_index_path() + # open the index + index = tantivy.Index.open(index_path) + # get the scores and doc ids + row_ids, scores = search_index(index, self._query, self._limit) + scores = pa.array(scores) + output_tbl = self._table.to_lance().take(row_ids, columns=self._columns) + output_tbl = output_tbl.append_column("score", scores) + return output_tbl.to_pandas() diff --git a/python/lancedb/table.py b/python/lancedb/table.py index d743f733..14a7b91f 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -14,7 +14,9 @@ from __future__ import annotations import os +import shutil from functools import cached_property +from typing import List, Union import lance import numpy as np @@ -24,7 +26,8 @@ from lance import LanceDataset from lance.vector import vec_to_table from .common import DATA, VEC, VECTOR_COLUMN_NAME -from .query import LanceQueryBuilder +from .query import LanceFtsQueryBuilder, LanceQueryBuilder +from .util import get_uri_scheme def _sanitize_data(data, schema): @@ -130,6 +133,27 @@ class LanceTable: ) self._reset_dataset() + def create_fts_index(self, field_names: Union[str, List[str]]): + """Create a full-text search index on the table. + + Warning - this API is highly experimental and is highly likely to change + in the future. + + Parameters + ---------- + field_names: str or list of str + The name(s) of the field to index. 
+ """ + from .fts import create_index, populate_index + + if isinstance(field_names, str): + field_names = [field_names] + index = create_index(self._get_fts_index_path(), field_names) + populate_index(index, self, field_names) + + def _get_fts_index_path(self): + return os.path.join(self._dataset_uri, "_indices", "tantivy") + @cached_property def _dataset(self) -> LanceDataset: return lance.dataset(self._dataset_uri, version=self._version) @@ -158,7 +182,7 @@ class LanceTable: self._reset_dataset() return len(self) - def search(self, query: VEC) -> LanceQueryBuilder: + def search(self, query: Union[VEC, str]) -> LanceQueryBuilder: """Create a search query to find the nearest neighbors of the given query vector. @@ -174,6 +198,10 @@ class LanceTable: and also the "score" column which is the distance between the query vector and the returned vector. """ + if isinstance(query, str): + # fts + return LanceFtsQueryBuilder(self, query) + if isinstance(query, list): query = np.array(query) if isinstance(query, np.ndarray): diff --git a/python/pyproject.toml b/python/pyproject.toml index 943a3431..17290d16 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -45,6 +45,10 @@ dev = [ docs = [ "mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]" ] +fts = [ + # tantivy 0.19.2 + "tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985" +] [build-system] requires = [ diff --git a/python/tests/test_fts.py b/python/tests/test_fts.py new file mode 100644 index 00000000..237aff1a --- /dev/null +++ b/python/tests/test_fts.py @@ -0,0 +1,84 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import random + +import numpy as np +import pandas as pd +import pytest +import tantivy + +import lancedb as ldb +import lancedb.fts + + +@pytest.fixture +def table(tmp_path) -> ldb.table.LanceTable: + db = ldb.connect(tmp_path) + vectors = [np.random.randn(128) for _ in range(100)] + + nouns = ("puppy", "car", "rabbit", "girl", "monkey") + verbs = ("runs", "hits", "jumps", "drives", "barfs") + adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.") + adj = ("adorable", "clueless", "dirty", "odd", "stupid") + text = [ + " ".join( + [ + nouns[random.randrange(0, 5)], + verbs[random.randrange(0, 5)], + adv[random.randrange(0, 5)], + adj[random.randrange(0, 5)], + ] + ) + for _ in range(100) + ] + table = db.create_table( + "test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text}) + ) + return table + + +def test_create_index(tmp_path): + index = ldb.fts.create_index(str(tmp_path / "index"), ["text"]) + assert isinstance(index, tantivy.Index) + assert os.path.exists(str(tmp_path / "index")) + + +def test_populate_index(tmp_path, table): + index = ldb.fts.create_index(str(tmp_path / "index"), ["text"]) + assert ldb.fts.populate_index(index, table, ["text"]) == len(table) + + +def test_search_index(tmp_path, table): + index = ldb.fts.create_index(str(tmp_path / "index"), ["text"]) + ldb.fts.populate_index(index, table, ["text"]) + index.reload() + results = ldb.fts.search_index(index, query="puppy", limit=10) + assert len(results) == 2 + assert len(results[0]) == 10 # row_ids + assert len(results[1]) == 10 # scores 
+ + +def test_create_index_from_table(tmp_path, table): + table.create_fts_index("text") + df = table.search("puppy").limit(10).select(["text"]).to_df() + assert len(df) == 10 + assert "text" in df.columns + + +def test_create_index_multiple_columns(tmp_path, table): + table.create_fts_index(["text", "text2"]) + df = table.search("puppy").limit(10).to_df() + assert len(df) == 10 + assert "text" in df.columns + assert "text2" in df.columns