mirror of
https://github.com/lancedb/lancedb.git
synced 2026-07-03 02:50:41 +00:00
feat: support FM-Index scalar index for substring search (#3532)
Adds an FM-Index across the Rust core and the Python and TypeScript
bindings. The FM-Index is a scalar index over string and binary columns
that accelerates substring search (`contains(col, 'needle')`); it is
distinct from the tokenized `FTS` index.
## Rust
- `Index::Fm(FmIndexBuilder)` and `IndexType::Fm`.
- `make_index_params` maps `Index::Fm` to Lance's
`ScalarIndexParams::for_builtin(BuiltinIndexType::Fm)`.
- `supported_fm_data_type` validates
`Utf8`/`LargeUtf8`/`Binary`/`LargeBinary` columns.
- `list_indices` round-trips the type (`"Fm"` → `IndexType::Fm`); the
remote wire type is `"FM"`.
## Python
Adds `lancedb.index.Fm`, accepted by `create_index`:
```python
from lancedb.index import Fm
await tbl.create_index("text", config=Fm())
```
## TypeScript
Adds the `Index.fm()` factory:
```ts
await tbl.createIndex("text", { config: Index.fm() });
```
This commit is contained in:
@@ -10,6 +10,7 @@ from .index import (
|
||||
IvfSq,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
@@ -186,6 +187,7 @@ class Table:
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
],
|
||||
replace: Optional[bool],
|
||||
|
||||
@@ -93,6 +93,20 @@ class LabelList:
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Fm:
|
||||
"""Describe an FM-Index configuration.
|
||||
|
||||
`Fm` is a scalar index on string or binary columns that accelerates
|
||||
substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized
|
||||
`FTS` index, it matches arbitrary substrings of the raw bytes.
|
||||
|
||||
For example, it works with `url`, `path`, `content`, etc.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class FTS:
|
||||
"""Describe a FTS index configuration.
|
||||
@@ -828,4 +842,5 @@ __all__ = [
|
||||
"FTS",
|
||||
"Bitmap",
|
||||
"LabelList",
|
||||
"Fm",
|
||||
]
|
||||
|
||||
@@ -55,6 +55,7 @@ from .index import (
|
||||
Bitmap,
|
||||
IvfRq,
|
||||
LabelList,
|
||||
Fm,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
@@ -213,6 +214,7 @@ IndexConfigType = Union[
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
]
|
||||
|
||||
@@ -938,7 +940,7 @@ class Table(ABC):
|
||||
config : IndexConfigType, optional
|
||||
The index configuration object. If provided, uses the new unified API.
|
||||
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
|
||||
BTree, Bitmap, LabelList, FTS.
|
||||
BTree, Bitmap, LabelList, Fm, FTS.
|
||||
replace : bool, default True
|
||||
Whether to replace an existing index on this column.
|
||||
wait_timeout : timedelta, optional
|
||||
@@ -2487,7 +2489,7 @@ class LanceTable(Table):
|
||||
config : IndexConfigType, optional
|
||||
The index configuration object. If provided, uses the new unified API.
|
||||
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
|
||||
BTree, Bitmap, LabelList, FTS.
|
||||
BTree, Bitmap, LabelList, Fm, FTS.
|
||||
replace : bool, default True
|
||||
Whether to replace an existing index on this column.
|
||||
wait_timeout : timedelta, optional
|
||||
@@ -4487,6 +4489,7 @@ class AsyncTable:
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
]
|
||||
] = None,
|
||||
@@ -4539,12 +4542,14 @@ class AsyncTable:
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
),
|
||||
):
|
||||
raise TypeError(
|
||||
"config must be an instance of IvfSq, IvfPq, IvfRq, HnswPq, HnswSq,"
|
||||
" BTree, Bitmap, LabelList, or FTS, but got " + str(type(config))
|
||||
" BTree, Bitmap, LabelList, Fm, or FTS, but got "
|
||||
+ str(type(config))
|
||||
)
|
||||
try:
|
||||
await self._inner.create_index(
|
||||
|
||||
@@ -20,6 +20,7 @@ from lancedb.index import (
|
||||
IvfRq,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
@@ -203,6 +204,16 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
||||
assert indices[0].columns == ["fsb"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_fm_index(some_table: AsyncTable):
|
||||
# FM-Index accelerates substring search on string/binary columns.
|
||||
await some_table.create_index("data", config=Fm())
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "Fm"
|
||||
assert indices[0].columns == ["data"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
await some_table.create_index("id", config=Bitmap())
|
||||
|
||||
Reference in New Issue
Block a user