From 4b24e28539dc6103902a54a561d3281f5a4f8acc Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 20 Jan 2026 21:32:41 +0800 Subject: [PATCH] Add skip_merge option for FTS indexes --- nodejs/lancedb/indices.ts | 7 +++++++ nodejs/src/index.rs | 4 ++++ python/python/lancedb/index.py | 4 ++++ python/python/lancedb/remote/table.py | 2 ++ python/python/lancedb/table.py | 7 +++++++ python/python/tests/test_remote_db.py | 6 +++++- python/src/index.rs | 4 +++- rust/lancedb/src/index/scalar.rs | 3 +++ rust/lancedb/src/remote/table.rs | 5 +++++ 9 files changed, 40 insertions(+), 2 deletions(-) diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts index b15106c20..cbbfcaaf6 100644 --- a/nodejs/lancedb/indices.ts +++ b/nodejs/lancedb/indices.ts @@ -558,6 +558,12 @@ export interface FtsOptions { * whether to only index the prefix of the token for ngram tokenizer */ prefixOnly?: boolean; + + /** + * Whether to skip the partition merge stage after indexing. + * Useful for distributed indexing where merges are handled separately. 
+ */ + skipMerge?: boolean; } export class Index { @@ -726,6 +732,7 @@ export class Index { options?.ngramMinLength, options?.ngramMaxLength, options?.prefixOnly, + options?.skipMerge, ), ); } diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs index bf8b280a6..71c6eecbe 100644 --- a/nodejs/src/index.rs +++ b/nodejs/src/index.rs @@ -157,6 +157,7 @@ impl Index { ngram_min_length: Option<u32>, ngram_max_length: Option<u32>, prefix_only: Option<bool>, + skip_merge: Option<bool>, ) -> Self { let mut opts = FtsIndexBuilder::default(); if let Some(with_position) = with_position { @@ -192,6 +193,9 @@ impl Index { if let Some(prefix_only) = prefix_only { opts = opts.ngram_prefix_only(prefix_only); } + if let Some(skip_merge) = skip_merge { + opts = opts.skip_merge(skip_merge); + } Self { inner: Mutex::new(Some(LanceDbIndex::FTS(opts))), diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index d5e4bfcdc..40726b894 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -127,6 +127,9 @@ class FTS: ascii_folding : bool, default True Whether to fold ASCII characters. This converts accented characters to their ASCII equivalent. For example, "café" would be converted to "cafe". + skip_merge : bool, default False + Whether to skip the partition merge stage after indexing. This can be + useful for distributed indexing where merges are handled separately. 
""" with_position: bool = False @@ -140,6 +143,7 @@ class FTS: ngram_min_length: int = 3 ngram_max_length: int = 3 prefix_only: bool = False + skip_merge: bool = False @dataclass diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 7961995c6..0886ad66d 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -176,6 +176,7 @@ class RemoteTable(Table): ngram_min_length: int = 3, ngram_max_length: int = 3, prefix_only: bool = False, + skip_merge: bool = False, name: Optional[str] = None, ): config = FTS( @@ -190,6 +191,7 @@ class RemoteTable(Table): ngram_min_length=ngram_min_length, ngram_max_length=ngram_max_length, prefix_only=prefix_only, + skip_merge=skip_merge, ) LOOP.run( self._table.create_index( diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 72451d2be..fdbd3c05b 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -892,6 +892,7 @@ class Table(ABC): ngram_min_length: int = 3, ngram_max_length: int = 3, prefix_only: bool = False, + skip_merge: bool = False, wait_timeout: Optional[timedelta] = None, name: Optional[str] = None, ): @@ -956,6 +957,9 @@ class Table(ABC): The maximum length of an n-gram. prefix_only: bool, default False Whether to only index the prefix of the token for ngram tokenizer. + skip_merge: bool, default False + Only available with use_tantivy=False. + If True, skip the partition merge stage after indexing. wait_timeout: timedelta, optional The timeout to wait if indexing is asynchronous. 
name: str, optional @@ -2259,6 +2263,7 @@ class LanceTable(Table): ngram_min_length: int = 3, ngram_max_length: int = 3, prefix_only: bool = False, + skip_merge: bool = False, name: Optional[str] = None, ): if not use_tantivy: @@ -2282,6 +2287,8 @@ class LanceTable(Table): else: tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name) + tokenizer_configs["skip_merge"] = skip_merge + config = FTS( **tokenizer_configs, ) diff --git a/python/python/tests/test_remote_db.py b/python/python/tests/test_remote_db.py index 566e1fba4..5aeb527aa 100644 --- a/python/python/tests/test_remote_db.py +++ b/python/python/tests/test_remote_db.py @@ -405,7 +405,10 @@ def test_table_create_indices(): # Test create_fts_index with custom name table.create_fts_index( - "text", wait_timeout=timedelta(seconds=2), name="custom_fts_idx" + "text", + wait_timeout=timedelta(seconds=2), + name="custom_fts_idx", + skip_merge=True, ) # Test create_index with custom name @@ -427,6 +430,7 @@ def test_table_create_indices(): fts_req = received_requests[1] assert "name" in fts_req assert fts_req["name"] == "custom_fts_idx" + assert fts_req["skip_merge"] is True # Check vector index request has custom name vector_req = received_requests[2] diff --git a/python/src/index.rs b/python/src/index.rs index c93b23eb3..0f14328c1 100644 --- a/python/src/index.rs +++ b/python/src/index.rs @@ -50,7 +50,8 @@ pub fn extract_index_params(source: &Option>) -> PyResult { @@ -179,6 +180,7 @@ struct FtsParams { ngram_min_length: u32, ngram_max_length: u32, prefix_only: bool, + skip_merge: bool, } #[derive(FromPyObject)] diff --git a/rust/lancedb/src/index/scalar.rs b/rust/lancedb/src/index/scalar.rs index 980b57d25..d1fcd2ba7 100644 --- a/rust/lancedb/src/index/scalar.rs +++ b/rust/lancedb/src/index/scalar.rs @@ -53,5 +53,8 @@ pub struct LabelListIndexBuilder {} pub use lance_index::scalar::inverted::query::*; pub use lance_index::scalar::FullTextSearchQuery; +/// Builder for full text search (FTS) index 
parameters. +/// +/// Use [`FtsIndexBuilder::skip_merge`] to skip the partition merge stage after indexing. pub use lance_index::scalar::InvertedIndexParams as FtsIndexBuilder; pub use lance_index::scalar::InvertedIndexParams; diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index 2a941144e..85b3e5e7d 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -2624,6 +2624,11 @@ mod tests { serde_json::to_value(InvertedIndexParams::default()).unwrap(), Index::FTS(Default::default()), ), + ( + "FTS", + serde_json::to_value(InvertedIndexParams::default().skip_merge(true)).unwrap(), + Index::FTS(InvertedIndexParams::default().skip_merge(true)), + ), ]; for (index_type, expected_body, index) in cases {