diff --git a/docs/src/fts.md b/docs/src/fts.md index 78c20f6b..721a5cf1 100644 --- a/docs/src/fts.md +++ b/docs/src/fts.md @@ -75,6 +75,18 @@ applied on top of the full text search results. This can be invoked via the fami table.search("puppy").limit(10).where("meta='foo'").to_list() ``` +## Configurations + +By default, LanceDB configures a 1GB heap size limit for creating the index. You can +reduce this if running on a smaller node, or increase this for faster performance while +indexing a larger corpus. + +```python +# configure a 512MB heap size +heap = 1024 * 1024 * 512 +table.create_fts_index(["text1", "text2"], writer_heap_size=heap, replace=True) +``` + ## Current limitations 1. Currently we do not yet support incremental writes. diff --git a/python/lancedb/fts.py b/python/lancedb/fts.py index eb5c37e3..f9667fcc 100644 --- a/python/lancedb/fts.py +++ b/python/lancedb/fts.py @@ -13,7 +13,7 @@ """Full text search index using tantivy-py""" import os -from typing import List, Tuple +from typing import List, Optional, Tuple import pyarrow as pa @@ -56,7 +56,12 @@ def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index: return index -def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int: +def populate_index( + index: tantivy.Index, + table: LanceTable, + fields: List[str], + writer_heap_size: int = 1024 * 1024 * 1024, +) -> int: """ Populate an index with data from a LanceTable @@ -68,6 +73,8 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) - The table to index fields : List[str] List of fields to index + writer_heap_size : int + The writer heap size in bytes, defaults to 1GB Returns ------- @@ -87,7 +94,7 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) - raise TypeError(f"Field {name} is not a string type") # create a tantivy writer - writer = index.writer() + writer = index.writer(heap_size=writer_heap_size) # write data into index dataset = 
table.to_lance() row_id = 0 diff --git a/python/lancedb/table.py b/python/lancedb/table.py index ea4d62bb..0ffbea74 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -709,7 +709,11 @@ class LanceTable(Table): self._dataset.create_scalar_index(column, index_type="BTREE", replace=replace) def create_fts_index( - self, field_names: Union[str, List[str]], *, replace: bool = False + self, + field_names: Union[str, List[str]], + *, + replace: bool = False, + writer_heap_size: Optional[int] = 1024 * 1024 * 1024, ): """Create a full-text search index on the table. @@ -724,6 +728,7 @@ class LanceTable(Table): If True, replace the existing index if it exists. Note that this is not yet an atomic operation; the index will be temporarily unavailable while the new index is being created. + writer_heap_size: int, default 1GB (index writer heap limit, in bytes) """ from .fts import create_index, populate_index @@ -740,7 +745,7 @@ class LanceTable(Table): fs.delete_dir(path) index = create_index(self._get_fts_index_path(), field_names) - populate_index(index, self, field_names) + populate_index(index, self, field_names, writer_heap_size=writer_heap_size) register_event("create_fts_index") def _get_fts_index_path(self):