feat(python): Set heap size to get faster fts indexing performance (#762)

By default, tantivy-py uses a 128MB heap size. This change raises the default to 1GB
and lets the user customize it through the new `writer_heap_size` argument.

Locally, this makes `test_fts.py` run 10x faster.
This commit is contained in:
Chang She
2024-01-07 15:15:13 -08:00
committed by GitHub
parent d41d849e0e
commit b0a88a7286
3 changed files with 29 additions and 5 deletions

View File

@@ -709,7 +709,11 @@ class LanceTable(Table):
self._dataset.create_scalar_index(column, index_type="BTREE", replace=replace)
def create_fts_index(
self, field_names: Union[str, List[str]], *, replace: bool = False
self,
field_names: Union[str, List[str]],
*,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
):
"""Create a full-text search index on the table.
@@ -724,6 +728,7 @@ class LanceTable(Table):
If True, replace the existing index if it exists. Note that this is
not yet an atomic operation; the index will be temporarily
unavailable while the new index is being created.
writer_heap_size: int, default 1GB
"""
from .fts import create_index, populate_index
@@ -740,7 +745,7 @@ class LanceTable(Table):
fs.delete_dir(path)
index = create_index(self._get_fts_index_path(), field_names)
populate_index(index, self, field_names)
populate_index(index, self, field_names, writer_heap_size=writer_heap_size)
register_event("create_fts_index")
def _get_fts_index_path(self):