From d617cdef4ab3cddb28aa1a6ffdf2a26ced8f9856 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 24 Sep 2025 12:50:21 -0700 Subject: [PATCH] feat: add use_index parameter to merge insert operations (#2674) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Exposes `use_index` Merge Insert parameter, which was created upstream in https://github.com/lancedb/lance/pull/4688. ## API Examples ### Python ```python # Force table scan table.merge_insert(["id"]) \ .when_not_matched_insert_all() \ .use_index(False) \ .execute(data) ``` ### Node.js/TypeScript ```typescript // Force table scan await table.mergeInsert("id") .whenNotMatchedInsertAll() .useIndex(false) .execute(data); ``` ### Rust ```rust // Force table scan let mut builder = table.merge_insert(&["id"]); builder.when_not_matched_insert_all() .use_index(false); builder.execute(data).await?; ``` 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude --- nodejs/__test__/table.test.ts | 26 +++++++++++++++++++++++++ nodejs/lancedb/merge.ts | 17 ++++++++++++++++ nodejs/src/merge.rs | 7 +++++++ python/python/lancedb/merge.py | 18 +++++++++++++++++ python/python/lancedb/table.py | 1 + python/src/table.rs | 4 ++++ rust/lancedb/src/remote/table.rs | 11 +++++++++++ rust/lancedb/src/table.rs | 33 ++++++++++++++++++++++++++++++++ rust/lancedb/src/table/merge.rs | 15 +++++++++++++++ 9 files changed, 132 insertions(+) diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index fa6f76b1..9e1946ee 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -487,6 +487,32 @@ describe("merge insert", () => { .execute(newData, { timeoutMs: 0 }), ).rejects.toThrow("merge insert timed out"); }); + + test("useIndex", async () => { + const newData = [ + { a: 2, b: "x" }, + { a: 4, b: "z" }, + ]; + + // Test with useIndex(true) - should work fine + const result1 = await table + .mergeInsert("a") + .whenNotMatchedInsertAll() + .useIndex(true) + .execute(newData); + + expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted + + // Test with useIndex(false) - should also work fine + const newData2 = [{ a: 5, b: "w" }]; + const result2 = await table + .mergeInsert("a") + .whenNotMatchedInsertAll() + .useIndex(false) + .execute(newData2); + + expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted + }); }); describe("When creating an index", () => { diff --git a/nodejs/lancedb/merge.ts b/nodejs/lancedb/merge.ts index d4823130..dc9144fd 100644 --- a/nodejs/lancedb/merge.ts +++ b/nodejs/lancedb/merge.ts @@ -70,6 +70,23 @@ export class MergeInsertBuilder { this.#schema, ); } + + /** + * Controls whether to use indexes for the merge operation. + * + * When set to `true` (the default), the operation will use an index if available + * on the join key for improved performance. When set to `false`, it forces a full + * table scan even if an index exists. This can be useful for benchmarking or when + * the query optimizer chooses a suboptimal path. + * + * @param useIndex - Whether to use indices for the merge operation. Defaults to `true`. + */ + useIndex(useIndex: boolean): MergeInsertBuilder { + return new MergeInsertBuilder( + this.#native.useIndex(useIndex), + this.#schema, + ); + } /** * Executes the merge insert operation * diff --git a/nodejs/src/merge.rs b/nodejs/src/merge.rs index 5b2c93a4..98d637fb 100644 --- a/nodejs/src/merge.rs +++ b/nodejs/src/merge.rs @@ -43,6 +43,13 @@ impl NativeMergeInsertBuilder { self.inner.timeout(Duration::from_millis(timeout as u64)); } + #[napi] + pub fn use_index(&self, use_index: bool) -> Self { + let mut this = self.clone(); + this.inner.use_index(use_index); + this + } + #[napi(catch_unwind)] pub async fn execute(&self, buf: Buffer) -> napi::Result { let data = ipc_file_to_batches(buf.to_vec()) diff --git a/python/python/lancedb/merge.py b/python/python/lancedb/merge.py index 3cf56a9d..b2564740 100644 --- a/python/python/lancedb/merge.py +++ b/python/python/lancedb/merge.py @@ -33,6 +33,7 @@ class LanceMergeInsertBuilder(object): self._when_not_matched_by_source_delete = False self._when_not_matched_by_source_condition = None self._timeout = None + self._use_index = True def when_matched_update_all( self, *, where: Optional[str] = None @@ -78,6 +79,23 @@ class LanceMergeInsertBuilder(object): self._when_not_matched_by_source_condition = condition return self + def use_index(self, use_index: bool) -> LanceMergeInsertBuilder: + """ + Controls whether to use indexes for the merge operation. + + When set to `True` (the default), the operation will use an index if available + on the join key for improved performance. When set to `False`, it forces a full + table scan even if an index exists. This can be useful for benchmarking or when + the query optimizer chooses a suboptimal path. + + Parameters + ---------- + use_index: bool + Whether to use indices for the merge operation. Defaults to `True`. + """ + self._use_index = use_index + return self + def execute( self, new_data: DATA, diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 7efae5d8..45133567 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -3920,6 +3920,7 @@ class AsyncTable: when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete, when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition, timeout=merge._timeout, + use_index=merge._use_index, ), ) diff --git a/python/src/table.rs b/python/src/table.rs index dafd79b5..f9f7f995 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -672,6 +672,9 @@ impl Table { if let Some(timeout) = parameters.timeout { builder.timeout(timeout); } + if let Some(use_index) = parameters.use_index { + builder.use_index(use_index); + } future_into_py(self_.py(), async move { let res = builder.execute(Box::new(batches)).await.infer_error()?; @@ -831,6 +834,7 @@ pub struct MergeInsertParams { when_not_matched_by_source_delete: bool, when_not_matched_by_source_condition: Option, timeout: Option, + use_index: Option, } #[pyclass] diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index 8338978a..3ed4c6da 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -1452,6 +1452,14 @@ struct MergeInsertRequest { when_not_matched_insert_all: bool, when_not_matched_by_source_delete: bool, when_not_matched_by_source_delete_filt: Option, + // For backwards compatibility, only serialize use_index when it's false + // (the default is true) + #[serde(skip_serializing_if = "is_true")] + use_index: bool, +} + +fn is_true(b: &bool) -> bool { + *b } impl TryFrom for MergeInsertRequest { @@ -1476,6 +1484,8 @@ impl TryFrom for MergeInsertRequest { when_not_matched_insert_all: value.when_not_matched_insert_all, when_not_matched_by_source_delete: value.when_not_matched_by_source_delete, when_not_matched_by_source_delete_filt: value.when_not_matched_by_source_delete_filt, + // Only serialize use_index when it's false for backwards compatibility + use_index: value.use_index, }) } } @@ -1942,6 +1952,7 @@ mod tests { assert_eq!(params["when_not_matched_by_source_delete"], "false"); assert!(!params.contains_key("when_matched_update_all_filt")); assert!(!params.contains_key("when_not_matched_by_source_delete_filt")); + assert!(!params.contains_key("use_index")); if old_server { http::Response::builder().status(200).body("{}").unwrap() diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index b4d6f4bb..d31ee2c3 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -2399,6 +2399,7 @@ impl BaseTable for NativeTable { } else { builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep); } + builder.use_index(params.use_index); let future = if let Some(timeout) = params.timeout { // The default retry timeout is 30s, so we pass the full timeout down @@ -2906,6 +2907,38 @@ mod tests { ); } + #[tokio::test] + async fn test_merge_insert_use_index() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + let conn = connect(uri).execute().await.unwrap(); + + // Create a dataset with i=0..10 + let batches = merge_insert_test_batches(0, 0); + let table = conn + .create_table("my_table", batches) + .execute() + .await + .unwrap(); + assert_eq!(table.count_rows(None).await.unwrap(), 10); + + // Test use_index=true (default behavior) + let new_batches = Box::new(merge_insert_test_batches(5, 1)); + let mut merge_insert_builder = table.merge_insert(&["i"]); + merge_insert_builder.when_not_matched_insert_all(); + merge_insert_builder.use_index(true); + merge_insert_builder.execute(new_batches).await.unwrap(); + assert_eq!(table.count_rows(None).await.unwrap(), 15); + + // Test use_index=false (force table scan) + let new_batches = Box::new(merge_insert_test_batches(15, 2)); + let mut merge_insert_builder = table.merge_insert(&["i"]); + merge_insert_builder.when_not_matched_insert_all(); + merge_insert_builder.use_index(false); + merge_insert_builder.execute(new_batches).await.unwrap(); + assert_eq!(table.count_rows(None).await.unwrap(), 25); + } + #[tokio::test] async fn test_add_overwrite() { let tmp_dir = tempdir().unwrap(); diff --git a/rust/lancedb/src/table/merge.rs b/rust/lancedb/src/table/merge.rs index 376bd3da..c61c5f1b 100644 --- a/rust/lancedb/src/table/merge.rs +++ b/rust/lancedb/src/table/merge.rs @@ -22,6 +22,7 @@ pub struct MergeInsertBuilder { pub(crate) when_not_matched_by_source_delete: bool, pub(crate) when_not_matched_by_source_delete_filt: Option, pub(crate) timeout: Option, + pub(crate) use_index: bool, } impl MergeInsertBuilder { @@ -35,6 +36,7 @@ impl MergeInsertBuilder { when_not_matched_by_source_delete: false, when_not_matched_by_source_delete_filt: None, timeout: None, + use_index: true, } } @@ -101,6 +103,19 @@ impl MergeInsertBuilder { self } + /// Controls whether to use indexes for the merge operation. + /// + /// When set to `true` (the default), the operation will use an index if available + /// on the join key for improved performance. When set to `false`, it forces a full + /// table scan even if an index exists. This can be useful for benchmarking or when + /// the query optimizer chooses a suboptimal path. + /// + /// If not set, defaults to `true` (use index if available). + pub fn use_index(&mut self, use_index: bool) -> &mut Self { + self.use_index = use_index; + self + } + /// Executes the merge insert operation /// /// Returns version and statistics about the merge operation including the number of rows