mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-05 11:22:58 +00:00
feat(python): Set heap size to get faster fts indexing performance (#762)
By default tantivy-py uses 128MB heapsize. We change the default to 1GB and we allow the user to customize this locally this makes `test_fts.py` run 10x faster
This commit is contained in:
@@ -13,7 +13,7 @@
|
||||
|
||||
"""Full text search index using tantivy-py"""
|
||||
import os
|
||||
from typing import List, Tuple
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -56,7 +56,12 @@ def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
|
||||
return index
|
||||
|
||||
|
||||
def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
|
||||
def populate_index(
|
||||
index: tantivy.Index,
|
||||
table: LanceTable,
|
||||
fields: List[str],
|
||||
writer_heap_size: int = 1024 * 1024 * 1024,
|
||||
) -> int:
|
||||
"""
|
||||
Populate an index with data from a LanceTable
|
||||
|
||||
@@ -68,6 +73,8 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
|
||||
The table to index
|
||||
fields : List[str]
|
||||
List of fields to index
|
||||
writer_heap_size : int
|
||||
The writer heap size in bytes, defaults to 1GB
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -87,7 +94,7 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
|
||||
raise TypeError(f"Field {name} is not a string type")
|
||||
|
||||
# create a tantivy writer
|
||||
writer = index.writer()
|
||||
writer = index.writer(heap_size=writer_heap_size)
|
||||
# write data into index
|
||||
dataset = table.to_lance()
|
||||
row_id = 0
|
||||
|
||||
@@ -709,7 +709,11 @@ class LanceTable(Table):
|
||||
self._dataset.create_scalar_index(column, index_type="BTREE", replace=replace)
|
||||
|
||||
def create_fts_index(
|
||||
self, field_names: Union[str, List[str]], *, replace: bool = False
|
||||
self,
|
||||
field_names: Union[str, List[str]],
|
||||
*,
|
||||
replace: bool = False,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
):
|
||||
"""Create a full-text search index on the table.
|
||||
|
||||
@@ -724,6 +728,7 @@ class LanceTable(Table):
|
||||
If True, replace the existing index if it exists. Note that this is
|
||||
not yet an atomic operation; the index will be temporarily
|
||||
unavailable while the new index is being created.
|
||||
writer_heap_size: int, default 1GB
|
||||
"""
|
||||
from .fts import create_index, populate_index
|
||||
|
||||
@@ -740,7 +745,7 @@ class LanceTable(Table):
|
||||
fs.delete_dir(path)
|
||||
|
||||
index = create_index(self._get_fts_index_path(), field_names)
|
||||
populate_index(index, self, field_names)
|
||||
populate_index(index, self, field_names, writer_heap_size=writer_heap_size)
|
||||
register_event("create_fts_index")
|
||||
|
||||
def _get_fts_index_path(self):
|
||||
|
||||
Reference in New Issue
Block a user