mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 22:59:57 +00:00
feat: support building FTS indexes without positions (#1621)
This commit is contained in:
@@ -78,8 +78,8 @@ class FTS:
|
||||
For example, it works with `title`, `description`, `content`, etc.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._inner = LanceDbIndex.fts()
|
||||
def __init__(self, with_position: bool = True):
|
||||
self._inner = LanceDbIndex.fts(with_position=with_position)
|
||||
|
||||
|
||||
class IvfPq:
|
||||
|
||||
@@ -126,6 +126,7 @@ class RemoteTable(Table):
|
||||
column: str,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
):
|
||||
data = {
|
||||
"column": column,
|
||||
|
||||
@@ -468,6 +468,7 @@ class Table(ABC):
|
||||
ordering_field_names: Union[str, List[str]] = None,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
tokenizer_name: str = "default",
|
||||
use_tantivy: bool = True,
|
||||
@@ -500,6 +501,12 @@ class Table(ABC):
|
||||
use_tantivy: bool, default True
|
||||
If True, use the legacy full-text search implementation based on tantivy.
|
||||
If False, use the new full-text search implementation based on lance-index.
|
||||
with_position: bool, default True
|
||||
Only available with use_tantivy=False
|
||||
If False, do not store the positions of the terms in the text.
|
||||
This can reduce the size of the index and improve indexing speed.
|
||||
But it will not be possible to use phrase queries.
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -1305,6 +1312,7 @@ class LanceTable(Table):
|
||||
ordering_field_names: Union[str, List[str]] = None,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
tokenizer_name: str = "default",
|
||||
use_tantivy: bool = True,
|
||||
@@ -1318,7 +1326,10 @@ class LanceTable(Table):
|
||||
if exist:
|
||||
fs.delete_dir(path)
|
||||
self._dataset_mut.create_scalar_index(
|
||||
field_names, index_type="INVERTED", replace=replace
|
||||
field_names,
|
||||
index_type="INVERTED",
|
||||
replace=replace,
|
||||
with_position=with_position,
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@@ -140,8 +140,11 @@ def test_create_index_with_stemming(tmp_path, table):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_create_inverted_index(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
@pytest.mark.parametrize("with_position", [True, False])
|
||||
def test_create_inverted_index(table, use_tantivy, with_position):
|
||||
if use_tantivy and not with_position:
|
||||
pytest.skip("we don't support to build tantivy index without position")
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy, with_position=with_position)
|
||||
|
||||
|
||||
def test_populate_index(tmp_path, table):
|
||||
@@ -166,6 +169,40 @@ def test_search_fts(table, use_tantivy):
|
||||
assert len(results) == 5
|
||||
|
||||
|
||||
def test_search_fts_phrase_query(table):
    """Check that phrase queries fail without positions and work with them.

    First builds the native (non-tantivy) FTS index with
    ``with_position=False`` and expects a quoted phrase query to raise.
    Then rebuilds the index with default settings and expects the phrase
    query to return a non-empty strict subset of the single-term results.
    """
    table.create_fts_index("text", use_tantivy=False, with_position=False)
    # pytest.raises fails the test when nothing is raised. The previous
    # ``try: ...; assert False; except Exception: pass`` caught its own
    # AssertionError, so the negative check could never fail.
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    # Rebuild with positions (the default) so phrase queries are supported.
    table.create_fts_index("text", use_tantivy=False, replace=True)
    results = table.search("puppy").limit(100).to_list()
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    """Async variant: phrase queries fail without positions, work with them.

    Creates an FTS index with ``with_position=False`` and expects a quoted
    phrase query to raise. Then recreates the index with the default config
    and expects the phrase query to return a non-empty strict subset of the
    single-term results.
    """
    async_table = await async_table
    await async_table.create_index("text", config=FTS(with_position=False))
    # The query is quoted here, matching the positive check below; the
    # original negative check used the unquoted text.
    # NOTE(review): assumes the surrounding double quotes are what make this
    # a phrase query -- confirm against the query parser.
    # pytest.raises replaces ``try: ...; assert False; except Exception: pass``,
    # which caught its own AssertionError and therefore could never fail.
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()

    # Recreate the index with the default config (positions stored).
    await async_table.create_index("text", config=FTS())
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
|
||||
|
||||
|
||||
def test_search_fts_specify_column(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text2", use_tantivy=False)
|
||||
|
||||
Reference in New Issue
Block a user