feat: support FM-Index scalar index for substring search (#3532)

Adds an FM-Index, a scalar index over string and binary columns that
accelerates substring search (`contains(col, 'needle')`), across the Rust
core and the Python and TypeScript bindings. It is distinct from the
tokenized `FTS` index.

## Rust

- `Index::Fm(FmIndexBuilder)` and `IndexType::Fm`.
- `make_index_params` maps `Index::Fm` to Lance's
`ScalarIndexParams::for_builtin(BuiltinIndexType::Fm)`.
- `supported_fm_data_type` validates
`Utf8`/`LargeUtf8`/`Binary`/`LargeBinary` columns.
- `list_indices` round-trips the type (`"Fm"` → `IndexType::Fm`); the
remote wire type is `"FM"`.

## Python

Adds `lancedb.index.Fm`, accepted by `create_index`:

```python
from lancedb.index import Fm

await tbl.create_index("text", config=Fm())
```

## TypeScript

Adds the `Index.fm()` factory:

```ts
await tbl.createIndex("text", { config: Index.fm() });
```
This commit is contained in:
Jack Ye
2026-06-10 12:28:20 -07:00
committed by GitHub
parent 8308cca05e
commit 8373318e89
14 changed files with 179 additions and 9 deletions

View File

@@ -10,6 +10,7 @@ from .index import (
IvfSq,
Bitmap,
LabelList,
Fm,
HnswPq,
HnswSq,
HnswFlat,
@@ -186,6 +187,7 @@ class Table:
BTree,
Bitmap,
LabelList,
Fm,
FTS,
],
replace: Optional[bool],

View File

@@ -93,6 +93,20 @@ class LabelList:
pass
@dataclass
class Fm:
    """Describe an FM-Index configuration.

    `Fm` is a scalar index on string or binary columns that accelerates
    substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized
    `FTS` index, it matches arbitrary substrings of the raw bytes.

    For example, it works with `url`, `path`, `content`, etc.

    This configuration declares no fields, so it is constructed without
    arguments and passed as the `config` of `create_index`:

        await tbl.create_index("text", config=Fm())
    """

    pass
@dataclass
class FTS:
"""Describe a FTS index configuration.
@@ -828,4 +842,5 @@ __all__ = [
"FTS",
"Bitmap",
"LabelList",
"Fm",
]

View File

@@ -55,6 +55,7 @@ from .index import (
Bitmap,
IvfRq,
LabelList,
Fm,
HnswPq,
HnswSq,
HnswFlat,
@@ -213,6 +214,7 @@ IndexConfigType = Union[
BTree,
Bitmap,
LabelList,
Fm,
FTS,
]
@@ -938,7 +940,7 @@ class Table(ABC):
config : IndexConfigType, optional
The index configuration object. If provided, uses the new unified API.
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
BTree, Bitmap, LabelList, FTS.
BTree, Bitmap, LabelList, Fm, FTS.
replace : bool, default True
Whether to replace an existing index on this column.
wait_timeout : timedelta, optional
@@ -2487,7 +2489,7 @@ class LanceTable(Table):
config : IndexConfigType, optional
The index configuration object. If provided, uses the new unified API.
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
BTree, Bitmap, LabelList, FTS.
BTree, Bitmap, LabelList, Fm, FTS.
replace : bool, default True
Whether to replace an existing index on this column.
wait_timeout : timedelta, optional
@@ -4487,6 +4489,7 @@ class AsyncTable:
BTree,
Bitmap,
LabelList,
Fm,
FTS,
]
] = None,
@@ -4539,12 +4542,14 @@ class AsyncTable:
BTree,
Bitmap,
LabelList,
Fm,
FTS,
),
):
raise TypeError(
"config must be an instance of IvfSq, IvfPq, IvfRq, HnswPq, HnswSq,"
" BTree, Bitmap, LabelList, or FTS, but got " + str(type(config))
" BTree, Bitmap, LabelList, Fm, or FTS, but got "
+ str(type(config))
)
try:
await self._inner.create_index(

View File

@@ -20,6 +20,7 @@ from lancedb.index import (
IvfRq,
Bitmap,
LabelList,
Fm,
HnswPq,
HnswSq,
HnswFlat,
@@ -203,6 +204,16 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
assert indices[0].columns == ["fsb"]
@pytest.mark.asyncio
async def test_create_fm_index(some_table: AsyncTable):
    """Creating an `Fm` index on `data` yields exactly one listed index.

    The FM-Index accelerates substring search on string/binary columns; this
    test only checks that the index is registered and reported back with the
    expected type name and column list.
    """
    await some_table.create_index("data", config=Fm())

    listed = await some_table.list_indices()
    assert len(listed) == 1

    fm_index = listed[0]
    assert fm_index.index_type == "Fm"
    assert fm_index.columns == ["data"]
@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):
await some_table.create_index("id", config=Bitmap())

View File

@@ -7,7 +7,7 @@ use lancedb::index::vector::{
};
use lancedb::index::{
Index as LanceDbIndex,
scalar::{BTreeIndexBuilder, FtsIndexBuilder},
scalar::{BTreeIndexBuilder, FmIndexBuilder, FtsIndexBuilder},
};
use pyo3::IntoPyObject;
use pyo3::types::PyStringMethods;
@@ -38,6 +38,7 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
"BTree" => Ok(LanceDbIndex::BTree(BTreeIndexBuilder::default())),
"Bitmap" => Ok(LanceDbIndex::Bitmap(Default::default())),
"LabelList" => Ok(LanceDbIndex::LabelList(Default::default())),
"Fm" => Ok(LanceDbIndex::Fm(FmIndexBuilder::default())),
"FTS" => {
let params = source.extract::<FtsParams>()?;
let inner_opts = FtsIndexBuilder::default()
@@ -183,7 +184,7 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
Ok(LanceDbIndex::IvfHnswFlat(hnsw_flat_builder))
}
not_supported => Err(PyValueError::new_err(format!(
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, Fm, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
not_supported
))),
}