diff --git a/docs/src/js/classes/Index.md b/docs/src/js/classes/Index.md index 372a1ac42..7b5033160 100644 --- a/docs/src/js/classes/Index.md +++ b/docs/src/js/classes/Index.md @@ -57,6 +57,24 @@ block size may be added in the future. *** +### fm() + +```ts +static fm(): Index +``` + +Create an FM-Index. + +An FM-Index is a scalar index on string or binary columns that accelerates +substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized +full-text-search index, it matches arbitrary substrings of the raw bytes. + +#### Returns + +[`Index`](Index.md) + +*** + ### fts() ```ts diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index b117d2c1d..12406c402 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -1431,6 +1431,20 @@ describe("When creating an index", () => { expect(fs.readdirSync(indexDir)).toHaveLength(1); }); + test("create an FM index", async () => { + // FM-Index accelerates substring search on a string/binary column. + const db = await connect(tmpDir.name); + const fmTbl = await db.createTable("fm_table", [ + { id: 0, text: "hello world" }, + { id: 1, text: "foo bar" }, + ]); + await fmTbl.createIndex("text", { + config: Index.fm(), + }); + const indexDir = path.join(tmpDir.name, "fm_table.lance", "_indices"); + expect(fs.readdirSync(indexDir)).toHaveLength(1); + }); + test("should be able to get index stats", async () => { await tbl.createIndex("id"); diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts index b15106c20..c82c3edb8 100644 --- a/nodejs/lancedb/indices.ts +++ b/nodejs/lancedb/indices.ts @@ -702,6 +702,17 @@ export class Index { return new Index(LanceDbIndex.labelList()); } + /** + * Create an FM-Index. + * + * An FM-Index is a scalar index on string or binary columns that accelerates + * substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized + * full-text-search index, it matches arbitrary substrings of the raw bytes. + */ + static fm() { + return new Index(LanceDbIndex.fm()); + } + /** * Create a full text search index * diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs index c957cea78..868f15e89 100644 --- a/nodejs/src/index.rs +++ b/nodejs/src/index.rs @@ -4,7 +4,7 @@ use std::sync::Mutex; use lancedb::index::Index as LanceDbIndex; -use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder}; +use lancedb::index::scalar::{BTreeIndexBuilder, FmIndexBuilder, FtsIndexBuilder}; use lancedb::index::vector::{ IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, IvfRqIndexBuilder, @@ -143,6 +143,13 @@ impl Index { } } + #[napi(factory)] + pub fn fm() -> Self { + Self { + inner: Mutex::new(Some(LanceDbIndex::Fm(FmIndexBuilder::default()))), + } + } + #[napi(factory)] #[allow(clippy::too_many_arguments)] pub fn fts( diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index 7f0aaf9be..5683a9c9b 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -10,6 +10,7 @@ from .index import ( IvfSq, Bitmap, LabelList, + Fm, HnswPq, HnswSq, HnswFlat, @@ -186,6 +187,7 @@ class Table: BTree, Bitmap, LabelList, + Fm, FTS, ], replace: Optional[bool], diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index 67656d8a3..0fb6d45ba 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -93,6 +93,20 @@ class LabelList: pass +@dataclass +class Fm: + """Describe an FM-Index configuration. + + `Fm` is a scalar index on string or binary columns that accelerates + substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized + `FTS` index, it matches arbitrary substrings of the raw bytes. + + For example, it works with `url`, `path`, `content`, etc. + """ + + pass + + @dataclass class FTS: """Describe a FTS index configuration. @@ -828,4 +842,5 @@ __all__ = [ "FTS", "Bitmap", "LabelList", + "Fm", ] diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index cee19a29a..d25a34b80 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -55,6 +55,7 @@ from .index import ( Bitmap, IvfRq, LabelList, + Fm, HnswPq, HnswSq, HnswFlat, @@ -213,6 +214,7 @@ IndexConfigType = Union[ BTree, Bitmap, LabelList, + Fm, FTS, ] @@ -938,7 +940,7 @@ class Table(ABC): config : IndexConfigType, optional The index configuration object. If provided, uses the new unified API. Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq, - BTree, Bitmap, LabelList, FTS. + BTree, Bitmap, LabelList, Fm, FTS. replace : bool, default True Whether to replace an existing index on this column. wait_timeout : timedelta, optional @@ -2487,7 +2489,7 @@ class LanceTable(Table): config : IndexConfigType, optional The index configuration object. If provided, uses the new unified API. Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq, - BTree, Bitmap, LabelList, FTS. + BTree, Bitmap, LabelList, Fm, FTS. replace : bool, default True Whether to replace an existing index on this column. wait_timeout : timedelta, optional @@ -4487,6 +4489,7 @@ class AsyncTable: BTree, Bitmap, LabelList, + Fm, FTS, ] ] = None, @@ -4539,12 +4542,14 @@ class AsyncTable: BTree, Bitmap, LabelList, + Fm, FTS, ), ): raise TypeError( "config must be an instance of IvfSq, IvfPq, IvfRq, HnswPq, HnswSq," - " BTree, Bitmap, LabelList, or FTS, but got " + str(type(config)) + " BTree, Bitmap, LabelList, Fm, or FTS, but got " + + str(type(config)) ) try: await self._inner.create_index( diff --git a/python/python/tests/test_index.py b/python/python/tests/test_index.py index 4e8e4633a..ab5fd46f3 100644 --- a/python/python/tests/test_index.py +++ b/python/python/tests/test_index.py @@ -20,6 +20,7 @@ from lancedb.index import ( IvfRq, Bitmap, LabelList, + Fm, HnswPq, HnswSq, HnswFlat, @@ -203,6 +204,16 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable): assert indices[0].columns == ["fsb"] +@pytest.mark.asyncio +async def test_create_fm_index(some_table: AsyncTable): + # FM-Index accelerates substring search on string/binary columns. + await some_table.create_index("data", config=Fm()) + indices = await some_table.list_indices() + assert len(indices) == 1 + assert indices[0].index_type == "Fm" + assert indices[0].columns == ["data"] + + @pytest.mark.asyncio async def test_create_bitmap_index(some_table: AsyncTable): await some_table.create_index("id", config=Bitmap()) diff --git a/python/src/index.rs b/python/src/index.rs index 508b10f17..175f37093 100644 --- a/python/src/index.rs +++ b/python/src/index.rs @@ -7,7 +7,7 @@ use lancedb::index::vector::{ }; use lancedb::index::{ Index as LanceDbIndex, - scalar::{BTreeIndexBuilder, FtsIndexBuilder}, + scalar::{BTreeIndexBuilder, FmIndexBuilder, FtsIndexBuilder}, }; use pyo3::IntoPyObject; use pyo3::types::PyStringMethods; @@ -38,6 +38,7 @@ pub fn extract_index_params(source: &Option>) -> PyResult Ok(LanceDbIndex::BTree(BTreeIndexBuilder::default())), "Bitmap" => Ok(LanceDbIndex::Bitmap(Default::default())), "LabelList" => Ok(LanceDbIndex::LabelList(Default::default())), + "Fm" => Ok(LanceDbIndex::Fm(FmIndexBuilder::default())), "FTS" => { let params = source.extract::()?; let inner_opts = FtsIndexBuilder::default() @@ -183,7 +184,7 @@ pub fn extract_index_params(source: &Option>) -> PyResult Err(PyValueError::new_err(format!( - "Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat", + "Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, Fm, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat", not_supported ))), } diff --git a/rust/lancedb/src/index.rs b/rust/lancedb/src/index.rs index f7a53375b..69dbaf21b 100644 --- a/rust/lancedb/src/index.rs +++ b/rust/lancedb/src/index.rs @@ -12,7 +12,7 @@ use crate::index::vector::IvfRqIndexBuilder; use crate::{DistanceType, Error, Result, table::BaseTable}; use self::{ - scalar::{BTreeIndexBuilder, BitmapIndexBuilder, LabelListIndexBuilder}, + scalar::{BTreeIndexBuilder, BitmapIndexBuilder, FmIndexBuilder, LabelListIndexBuilder}, vector::{ IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, IvfSqIndexBuilder, @@ -48,6 +48,11 @@ pub enum Index { /// using an underlying bitmap index. LabelList(LabelListIndexBuilder), + /// An `FM` index is a scalar index on string/binary columns that accelerates + /// substring search (`contains(col, 'needle')`). It matches arbitrary + /// substrings of the raw bytes, unlike the tokenized [`Index::FTS`] index. + Fm(FmIndexBuilder), + /// Full text search index using bm25. FTS(FtsIndexBuilder), @@ -306,6 +311,8 @@ pub enum IndexType { Bitmap, #[serde(alias = "LABEL_LIST")] LabelList, + #[serde(alias = "FM", alias = "FMINDEX", alias = "FMIndex")] + Fm, // FTS #[serde(alias = "INVERTED", alias = "Inverted")] FTS, @@ -324,6 +331,7 @@ impl std::fmt::Display for IndexType { Self::BTree => write!(f, "BTREE"), Self::Bitmap => write!(f, "BITMAP"), Self::LabelList => write!(f, "LABEL_LIST"), + Self::Fm => write!(f, "FM"), Self::FTS => write!(f, "FTS"), } } @@ -337,6 +345,7 @@ impl std::str::FromStr for IndexType { "BTREE" => Ok(Self::BTree), "BITMAP" => Ok(Self::Bitmap), "LABEL_LIST" | "LABELLIST" => Ok(Self::LabelList), + "FM" | "FMINDEX" => Ok(Self::Fm), "FTS" | "INVERTED" => Ok(Self::FTS), "IVF_FLAT" => Ok(Self::IvfFlat), "IVF_SQ" => Ok(Self::IvfSq), diff --git a/rust/lancedb/src/index/scalar.rs b/rust/lancedb/src/index/scalar.rs index b9292105a..10d835bb1 100644 --- a/rust/lancedb/src/index/scalar.rs +++ b/rust/lancedb/src/index/scalar.rs @@ -51,6 +51,15 @@ pub struct BitmapIndexBuilder {} #[derive(Debug, Clone, Default, serde::Serialize)] pub struct LabelListIndexBuilder {} +/// Builder for an FM-Index. +/// +/// An FM-Index (Ferragina–Manzini) is a scalar index over string/binary columns +/// that accelerates substring search, i.e. `contains(col, 'needle')`. Unlike an +/// inverted (FTS) index it matches arbitrary substrings of the raw bytes rather +/// than tokenized words. +#[derive(Debug, Clone, Default, serde::Serialize)] +pub struct FmIndexBuilder {} + pub use lance_index::scalar::FullTextSearchQuery; pub use lance_index::scalar::InvertedIndexParams as FtsIndexBuilder; pub use lance_index::scalar::InvertedIndexParams; diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index 9a38b9bb3..6a7b5fe47 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -1750,6 +1750,7 @@ impl BaseTable for RemoteTable { Index::BTree(p) => ("BTREE", Some(to_json(p)?)), Index::Bitmap(p) => ("BITMAP", Some(to_json(p)?)), Index::LabelList(p) => ("LABEL_LIST", Some(to_json(p)?)), + Index::Fm(p) => ("FM", Some(to_json(p)?)), Index::FTS(p) => ("FTS", Some(to_json(p)?)), Index::Auto => { if supported_vector_data_type(field.data_type()) { diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 5006651ce..397f754da 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -62,7 +62,8 @@ use crate::query::{IntoQueryVector, Query, QueryExecutionOptions, TakeQuery, Vec use crate::table::datafusion::insert::InsertExec; use crate::utils::{ PatchReadParam, PatchWriteParam, supported_bitmap_data_type, supported_btree_data_type, - supported_fts_data_type, supported_label_list_data_type, supported_vector_data_type, + supported_fm_data_type, supported_fts_data_type, supported_label_list_data_type, + supported_vector_data_type, }; use self::dataset::DatasetConsistencyWrapper; @@ -2460,6 +2461,12 @@ impl NativeTable { BuiltinIndexType::LabelList, ))) } + Index::Fm(_) => { + Self::validate_index_type(field, "FM", supported_fm_data_type)?; + Ok(Box::new(ScalarIndexParams::for_builtin( + BuiltinIndexType::Fm, + ))) + } Index::FTS(fts_opts) => { Self::validate_index_type(field, "FTS", supported_fts_data_type)?; Ok(Box::new(fts_opts)) @@ -2618,6 +2625,7 @@ impl NativeTable { Index::BTree(_) => IndexType::BTree, Index::Bitmap(_) => IndexType::Bitmap, Index::LabelList(_) => IndexType::LabelList, + Index::Fm(_) => IndexType::Fm, Index::FTS(_) => IndexType::Inverted, Index::IvfFlat(_) | Index::IvfSq(_) @@ -3353,7 +3361,7 @@ mod tests { use super::*; use crate::connect; use crate::connection::ConnectBuilder; - use crate::index::scalar::{BTreeIndexBuilder, BitmapIndexBuilder}; + use crate::index::scalar::{BTreeIndexBuilder, BitmapIndexBuilder, FmIndexBuilder}; use crate::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder}; use crate::query::Select; use crate::query::{ExecutableQuery, QueryBase}; @@ -4303,6 +4311,56 @@ mod tests { assert_eq!(stats.num_unindexed_rows, 0); } + #[tokio::test] + async fn test_create_fm_index() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + + // FM-Index accelerates substring search, so it applies to a string column. + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)])), + vec![Arc::new(StringArray::from(vec!["hello world"]))], + ) + .unwrap(); + let conn = ConnectBuilder::new(uri).execute().await.unwrap(); + let table = conn + .create_table("my_table", batch.clone()) + .execute() + .await + .unwrap(); + + table + .create_index(&["text"], Index::Fm(FmIndexBuilder::default())) + .execute() + .await + .unwrap(); + table + .wait_for_index(&["text_idx"], Duration::from_millis(10)) + .await + .unwrap(); + + let index_configs = table.list_indices().await.unwrap(); + assert_eq!(index_configs.len(), 1); + let index = index_configs.into_iter().next().unwrap(); + assert_eq!(index.index_type, crate::index::IndexType::Fm); + assert_eq!(index.columns, vec!["text".to_string()]); + + // The committed FM-Index must answer a substring `contains` query. + let count = table + .query() + .only_if("contains(text, 'world')") + .execute() + .await + .unwrap() + .try_collect::>() + .await + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum::(); + assert_eq!(count, 1); + } + #[tokio::test] async fn test_create_index_nested_field_paths() { let tmp_dir = tempdir().unwrap(); diff --git a/rust/lancedb/src/utils/mod.rs b/rust/lancedb/src/utils/mod.rs index b435f4a0e..c0b4424ad 100644 --- a/rust/lancedb/src/utils/mod.rs +++ b/rust/lancedb/src/utils/mod.rs @@ -279,6 +279,15 @@ fn supported_fts_data_type_impl(dtype: &DataType, in_list: bool) -> bool { } } +/// FM-Index accelerates substring (`contains`) search over raw bytes, so it +/// applies to string and binary columns. +pub fn supported_fm_data_type(dtype: &DataType) -> bool { + matches!( + dtype, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary + ) +} + pub fn supported_vector_data_type(dtype: &DataType) -> bool { match dtype { DataType::FixedSizeList(field, _) => {