feat: support to build FTS without positions (#1621)

This commit is contained in:
BubbleCal
2024-09-10 22:51:32 +08:00
committed by GitHub
parent a405847f9b
commit 2bde5401eb
11 changed files with 150 additions and 25 deletions

View File

@@ -78,8 +78,8 @@ class FTS:
For example, it works with `title`, `description`, `content`, etc.
"""
def __init__(self):
self._inner = LanceDbIndex.fts()
def __init__(self, with_position: bool = True):
self._inner = LanceDbIndex.fts(with_position=with_position)
class IvfPq:

View File

@@ -126,6 +126,7 @@ class RemoteTable(Table):
column: str,
*,
replace: bool = False,
with_position: bool = True,
):
data = {
"column": column,

View File

@@ -468,6 +468,7 @@ class Table(ABC):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
@@ -500,6 +501,12 @@ class Table(ABC):
use_tantivy: bool, default True
If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index.
with_position: bool, default True
Only available with use_tantivy=False
If False, do not store the positions of the terms in the text.
This can reduce the size of the index and improve indexing speed.
But it will not be possible to use phrase queries.
"""
raise NotImplementedError
@@ -1305,6 +1312,7 @@ class LanceTable(Table):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
@@ -1318,7 +1326,10 @@ class LanceTable(Table):
if exist:
fs.delete_dir(path)
self._dataset_mut.create_scalar_index(
field_names, index_type="INVERTED", replace=replace
field_names,
index_type="INVERTED",
replace=replace,
with_position=with_position,
)
return

View File

@@ -140,8 +140,11 @@ def test_create_index_with_stemming(tmp_path, table):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_create_inverted_index(table, use_tantivy):
table.create_fts_index("text", use_tantivy=use_tantivy)
@pytest.mark.parametrize("with_position", [True, False])
def test_create_inverted_index(table, use_tantivy, with_position):
    """Create an FTS index on ``text`` for each backend/position combination.

    The tantivy backend combined with ``with_position=False`` is skipped
    rather than exercised.
    """
    tantivy_without_positions = use_tantivy and not with_position
    if tantivy_without_positions:
        pytest.skip("we don't support to build tantivy index without position")

    table.create_fts_index(
        "text",
        use_tantivy=use_tantivy,
        with_position=with_position,
    )
def test_populate_index(tmp_path, table):
@@ -166,6 +169,40 @@ def test_search_fts(table, use_tantivy):
assert len(results) == 5
def test_search_fts_phrase_query(table):
    """Phrase queries must fail without positions and work with them.

    First builds the index on ``text`` with ``with_position=False`` and checks
    that a quoted phrase query raises. Then rebuilds the index with the
    default settings and checks that the phrase query returns a non-empty
    result set that is strictly smaller than the single-term query.
    """
    table.create_fts_index("text", use_tantivy=False, with_position=False)

    # The previous ``try: ...; assert False; except Exception: pass`` could
    # never fail: AssertionError is an Exception, so the handler swallowed it.
    # ``pytest.raises`` fails the test when nothing is raised.
    # NOTE(review): the concrete exception type is not visible from this test,
    # so the broad ``Exception`` is kept -- narrow it once the type is known.
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    # Rebuild in place; ``with_position`` is left at its default here.
    table.create_fts_index("text", use_tantivy=False, replace=True)
    results = table.search("puppy").limit(100).to_list()
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    """Async variant: phrase queries need an FTS index built with positions.

    Builds the index on ``text`` with ``FTS(with_position=False)`` and checks
    that a quoted phrase query raises. Then recreates the index with ``FTS()``
    and checks that the phrase query returns a non-empty result set that is
    strictly smaller than the single-term query.
    """
    async_table = await async_table
    await async_table.create_index("text", config=FTS(with_position=False))

    # Two fixes compared with the previous version of this check:
    # 1. The query text is wrapped in double quotes, matching the positive
    #    check below; without them it is not written as a phrase query.
    # 2. ``assert False`` inside ``try``/``except Exception: pass`` was
    #    swallowed by the handler (AssertionError is an Exception), so the
    #    check could never fail. ``pytest.raises`` fails when nothing raises.
    # NOTE(review): the concrete exception type is not visible from this test,
    # so the broad ``Exception`` is kept -- narrow it once the type is known.
    with pytest.raises(Exception):
        await (
            async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
        )

    await async_table.create_index("text", config=FTS())
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
def test_search_fts_specify_column(table):
table.create_fts_index("text", use_tantivy=False)
table.create_fts_index("text2", use_tantivy=False)