feat(python): Set heap size to get faster fts indexing performance (#762)

By default, tantivy-py uses a 128MB writer heap size. This change raises the
default to 1GB and allows the user to customize it.

Locally, this makes `test_fts.py` run 10x faster.
This commit is contained in:
Chang She
2024-01-07 15:15:13 -08:00
committed by Weston Pace
parent 328aa2247b
commit 3100f0d861
3 changed files with 29 additions and 5 deletions

View File

@@ -13,7 +13,7 @@
"""Full text search index using tantivy-py"""
import os
from typing import List, Tuple
from typing import List, Optional, Tuple
import pyarrow as pa
@@ -56,7 +56,12 @@ def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
return index
def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
def populate_index(
index: tantivy.Index,
table: LanceTable,
fields: List[str],
writer_heap_size: int = 1024 * 1024 * 1024,
) -> int:
"""
Populate an index with data from a LanceTable
@@ -68,6 +73,8 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
The table to index
fields : List[str]
List of fields to index
writer_heap_size : int
The writer heap size in bytes, defaults to 1GB
Returns
-------
@@ -87,7 +94,7 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
raise TypeError(f"Field {name} is not a string type")
# create a tantivy writer
writer = index.writer()
writer = index.writer(heap_size=writer_heap_size)
# write data into index
dataset = table.to_lance()
row_id = 0

View File

@@ -707,7 +707,11 @@ class LanceTable(Table):
self._dataset.create_scalar_index(column, index_type="BTREE", replace=replace)
def create_fts_index(
self, field_names: Union[str, List[str]], *, replace: bool = False
self,
field_names: Union[str, List[str]],
*,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
):
"""Create a full-text search index on the table.
@@ -722,6 +726,7 @@ class LanceTable(Table):
If True, replace the existing index if it exists. Note that this is
not yet an atomic operation; the index will be temporarily
unavailable while the new index is being created.
writer_heap_size: int, default 1GB
"""
from .fts import create_index, populate_index
@@ -738,7 +743,7 @@ class LanceTable(Table):
fs.delete_dir(path)
index = create_index(self._get_fts_index_path(), field_names)
populate_index(index, self, field_names)
populate_index(index, self, field_names, writer_heap_size=writer_heap_size)
def _get_fts_index_path(self):
return join_uri(self._dataset_uri, "_indices", "tantivy")