feat: support FM-Index scalar index for substring search (#3532)

Adds an FM-Index, a scalar index over string and binary columns that
accelerates substring search (`contains(col, 'needle')`), to the Rust
core and to the Python and TypeScript bindings. It is distinct from the
tokenized `FTS` index.

## Rust

- `Index::Fm(FmIndexBuilder)` and `IndexType::Fm`.
- `make_index_params` maps `Index::Fm` to Lance's
`ScalarIndexParams::for_builtin(BuiltinIndexType::Fm)`.
- `supported_fm_data_type` validates
`Utf8`/`LargeUtf8`/`Binary`/`LargeBinary` columns.
- `list_indices` round-trips the type (`"Fm"` → `IndexType::Fm`); the
remote wire type is `"FM"`.

## Python

Adds `lancedb.index.Fm`, accepted by `create_index`:

```python
from lancedb.index import Fm

await tbl.create_index("text", config=Fm())
```

## TypeScript

Adds the `Index.fm()` factory:

```ts
await tbl.createIndex("text", { config: Index.fm() });
```
This commit is contained in:
Jack Ye
2026-06-10 12:28:20 -07:00
committed by GitHub
parent 8308cca05e
commit 8373318e89
14 changed files with 179 additions and 9 deletions

View File

@@ -10,6 +10,7 @@ from .index import (
IvfSq,
Bitmap,
LabelList,
Fm,
HnswPq,
HnswSq,
HnswFlat,
@@ -186,6 +187,7 @@ class Table:
BTree,
Bitmap,
LabelList,
Fm,
FTS,
],
replace: Optional[bool],

View File

@@ -93,6 +93,20 @@ class LabelList:
pass
@dataclass
class Fm:
    """Configuration for an FM-Index, a scalar index for substring search.

    Build this index on a string or binary column to speed up substring
    predicates such as `contains(col, 'needle')`. Unlike the tokenized
    `FTS` index, it matches arbitrary substrings of the raw bytes.

    Columns such as `url`, `path`, or `content` are typical candidates.

    The configuration takes no parameters.
    """
@dataclass
class FTS:
"""Describe a FTS index configuration.
@@ -828,4 +842,5 @@ __all__ = [
"FTS",
"Bitmap",
"LabelList",
"Fm",
]

View File

@@ -55,6 +55,7 @@ from .index import (
Bitmap,
IvfRq,
LabelList,
Fm,
HnswPq,
HnswSq,
HnswFlat,
@@ -213,6 +214,7 @@ IndexConfigType = Union[
BTree,
Bitmap,
LabelList,
Fm,
FTS,
]
@@ -938,7 +940,7 @@ class Table(ABC):
config : IndexConfigType, optional
The index configuration object. If provided, uses the new unified API.
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
BTree, Bitmap, LabelList, FTS.
BTree, Bitmap, LabelList, Fm, FTS.
replace : bool, default True
Whether to replace an existing index on this column.
wait_timeout : timedelta, optional
@@ -2487,7 +2489,7 @@ class LanceTable(Table):
config : IndexConfigType, optional
The index configuration object. If provided, uses the new unified API.
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
BTree, Bitmap, LabelList, FTS.
BTree, Bitmap, LabelList, Fm, FTS.
replace : bool, default True
Whether to replace an existing index on this column.
wait_timeout : timedelta, optional
@@ -4487,6 +4489,7 @@ class AsyncTable:
BTree,
Bitmap,
LabelList,
Fm,
FTS,
]
] = None,
@@ -4539,12 +4542,14 @@ class AsyncTable:
BTree,
Bitmap,
LabelList,
Fm,
FTS,
),
):
raise TypeError(
"config must be an instance of IvfSq, IvfPq, IvfRq, HnswPq, HnswSq,"
" BTree, Bitmap, LabelList, or FTS, but got " + str(type(config))
" BTree, Bitmap, LabelList, Fm, or FTS, but got "
+ str(type(config))
)
try:
await self._inner.create_index(

View File

@@ -20,6 +20,7 @@ from lancedb.index import (
IvfRq,
Bitmap,
LabelList,
Fm,
HnswPq,
HnswSq,
HnswFlat,
@@ -203,6 +204,16 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
assert indices[0].columns == ["fsb"]
@pytest.mark.asyncio
async def test_create_fm_index(some_table: AsyncTable):
    """Creating an index with an `Fm` config yields one `Fm` index on `data`."""
    # FM-Index accelerates substring search on string/binary columns.
    await some_table.create_index("data", config=Fm())

    listed = await some_table.list_indices()
    assert len(listed) == 1

    # Both the reported index type and the indexed column must round-trip.
    assert listed[0].index_type == "Fm"
    assert listed[0].columns == ["data"]
@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):
await some_table.create_index("id", config=Bitmap())