feat: add use_index parameter to merge insert operations (#2674)

## Summary

Exposes the `use_index` merge insert parameter, which was added upstream
in https://github.com/lancedb/lance/pull/4688.

## API Examples

### Python
```python
# Force table scan
table.merge_insert(["id"]) \
    .when_not_matched_insert_all() \
    .use_index(False) \
    .execute(data)
```

### Node.js/TypeScript
```typescript
// Force table scan  
await table.mergeInsert("id")
    .whenNotMatchedInsertAll()
    .useIndex(false)
    .execute(data);
```

### Rust
```rust
// Force table scan
let mut builder = table.merge_insert(&["id"]);
builder.when_not_matched_insert_all()
       .use_index(false);
builder.execute(data).await?;
```

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Will Jones
2025-09-24 12:50:21 -07:00
committed by GitHub
parent 356d7046fd
commit d617cdef4a
9 changed files with 132 additions and 0 deletions

View File

@@ -487,6 +487,32 @@ describe("merge insert", () => {
.execute(newData, { timeoutMs: 0 }),
).rejects.toThrow("merge insert timed out");
});
// Runs a merge insert keyed on column "a" twice, once with index usage
// explicitly enabled and once explicitly disabled, and checks the reported
// number of inserted rows each time.
test("useIndex", async () => {
const newData = [
{ a: 2, b: "x" },
{ a: 4, b: "z" },
];
// useIndex(true): the merge should succeed.
// NOTE(review): `table` is set up outside this test; the expected count of 1
// presumes a=2 already exists there and a=4 does not - confirm against the
// enclosing describe block.
const result1 = await table
.mergeInsert("a")
.whenNotMatchedInsertAll()
.useIndex(true)
.execute(newData);
expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted
// useIndex(false): the merge should succeed just the same.
const newData2 = [{ a: 5, b: "w" }];
const result2 = await table
.mergeInsert("a")
.whenNotMatchedInsertAll()
.useIndex(false)
.execute(newData2);
expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted
});
});
describe("When creating an index", () => {

View File

@@ -70,6 +70,23 @@ export class MergeInsertBuilder {
this.#schema,
);
}
/**
* Controls whether to use indexes for the merge operation.
*
* When set to `true` (the default), the operation will use an index if available
* on the join key for improved performance. When set to `false`, it forces a full
* table scan even if an index exists. This can be useful for benchmarking or when
* the query optimizer chooses a suboptimal path.
*
* @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
* @returns A new `MergeInsertBuilder` built from the native builder returned by
* `useIndex` and the same schema as this builder.
*/
useIndex(useIndex: boolean): MergeInsertBuilder {
return new MergeInsertBuilder(
this.#native.useIndex(useIndex),
this.#schema,
);
}
/**
* Executes the merge insert operation
*

View File

@@ -43,6 +43,13 @@ impl NativeMergeInsertBuilder {
self.inner.timeout(Duration::from_millis(timeout as u64));
}
#[napi]
pub fn use_index(&self, use_index: bool) -> Self {
    // Work on a copy so the receiver is left untouched; the copy carrying
    // the new `use_index` setting is what gets handed back to the caller.
    let mut updated = self.clone();
    updated.inner.use_index(use_index);
    updated
}
#[napi(catch_unwind)]
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
let data = ipc_file_to_batches(buf.to_vec())

View File

@@ -33,6 +33,7 @@ class LanceMergeInsertBuilder(object):
self._when_not_matched_by_source_delete = False
self._when_not_matched_by_source_condition = None
self._timeout = None
self._use_index = True
def when_matched_update_all(
self, *, where: Optional[str] = None
@@ -78,6 +79,23 @@ class LanceMergeInsertBuilder(object):
self._when_not_matched_by_source_condition = condition
return self
def use_index(self, use_index: bool) -> LanceMergeInsertBuilder:
"""
Controls whether to use indexes for the merge operation.
When set to `True` (the default), the operation will use an index if available
on the join key for improved performance. When set to `False`, it forces a full
table scan even if an index exists. This can be useful for benchmarking or when
the query optimizer chooses a suboptimal path.
Parameters
----------
use_index: bool
Whether to use indices for the merge operation. Defaults to `True`.
Returns
-------
LanceMergeInsertBuilder
This same builder, so that further calls can be chained.
"""
self._use_index = use_index
return self
def execute(
self,
new_data: DATA,

View File

@@ -3920,6 +3920,7 @@ class AsyncTable:
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
timeout=merge._timeout,
use_index=merge._use_index,
),
)

View File

@@ -672,6 +672,9 @@ impl Table {
if let Some(timeout) = parameters.timeout {
builder.timeout(timeout);
}
if let Some(use_index) = parameters.use_index {
builder.use_index(use_index);
}
future_into_py(self_.py(), async move {
let res = builder.execute(Box::new(batches)).await.infer_error()?;
@@ -831,6 +834,7 @@ pub struct MergeInsertParams {
when_not_matched_by_source_delete: bool,
when_not_matched_by_source_condition: Option<String>,
timeout: Option<std::time::Duration>,
use_index: Option<bool>,
}
#[pyclass]

View File

@@ -1452,6 +1452,14 @@ struct MergeInsertRequest {
when_not_matched_insert_all: bool,
when_not_matched_by_source_delete: bool,
when_not_matched_by_source_delete_filt: Option<String>,
// For backwards compatibility, only serialize use_index when it's false
// (the default is true)
#[serde(skip_serializing_if = "is_true")]
use_index: bool,
}
/// Returns the boolean behind the reference.
///
/// Takes `&bool` rather than `bool` because it is referenced by name from a
/// serde `skip_serializing_if` attribute, which calls the predicate with a
/// reference to the field.
fn is_true(&flag: &bool) -> bool {
    flag
}
impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
@@ -1476,6 +1484,8 @@ impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
when_not_matched_insert_all: value.when_not_matched_insert_all,
when_not_matched_by_source_delete: value.when_not_matched_by_source_delete,
when_not_matched_by_source_delete_filt: value.when_not_matched_by_source_delete_filt,
// Only serialize use_index when it's false for backwards compatibility
use_index: value.use_index,
})
}
}
@@ -1942,6 +1952,7 @@ mod tests {
assert_eq!(params["when_not_matched_by_source_delete"], "false");
assert!(!params.contains_key("when_matched_update_all_filt"));
assert!(!params.contains_key("when_not_matched_by_source_delete_filt"));
assert!(!params.contains_key("use_index"));
if old_server {
http::Response::builder().status(200).body("{}").unwrap()

View File

@@ -2399,6 +2399,7 @@ impl BaseTable for NativeTable {
} else {
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
}
builder.use_index(params.use_index);
let future = if let Some(timeout) = params.timeout {
// The default retry timeout is 30s, so we pass the full timeout down
@@ -2906,6 +2907,38 @@ mod tests {
);
}
// Runs an insert-only merge on column "i" twice, first with `use_index(true)`
// and then with `use_index(false)`, checking the table's row count after each.
#[tokio::test]
async fn test_merge_insert_use_index() {
    let tmp_dir = tempdir().unwrap();
    let uri = tmp_dir.path().to_str().unwrap();
    let conn = connect(uri).execute().await.unwrap();

    // Seed the table; it is expected to start out with 10 rows.
    let table = conn
        .create_table("my_table", merge_insert_test_batches(0, 0))
        .execute()
        .await
        .unwrap();
    assert_eq!(table.count_rows(None).await.unwrap(), 10);

    // Index usage explicitly enabled (same as the default): the row count
    // is expected to grow from 10 to 15.
    let incoming = Box::new(merge_insert_test_batches(5, 1));
    let mut with_index = table.merge_insert(&["i"]);
    with_index.when_not_matched_insert_all().use_index(true);
    with_index.execute(incoming).await.unwrap();
    assert_eq!(table.count_rows(None).await.unwrap(), 15);

    // Index usage disabled (forced table scan): the row count is expected
    // to grow from 15 to 25.
    let incoming = Box::new(merge_insert_test_batches(15, 2));
    let mut without_index = table.merge_insert(&["i"]);
    without_index.when_not_matched_insert_all().use_index(false);
    without_index.execute(incoming).await.unwrap();
    assert_eq!(table.count_rows(None).await.unwrap(), 25);
}
#[tokio::test]
async fn test_add_overwrite() {
let tmp_dir = tempdir().unwrap();

View File

@@ -22,6 +22,7 @@ pub struct MergeInsertBuilder {
pub(crate) when_not_matched_by_source_delete: bool,
pub(crate) when_not_matched_by_source_delete_filt: Option<String>,
pub(crate) timeout: Option<Duration>,
pub(crate) use_index: bool,
}
impl MergeInsertBuilder {
@@ -35,6 +36,7 @@ impl MergeInsertBuilder {
when_not_matched_by_source_delete: false,
when_not_matched_by_source_delete_filt: None,
timeout: None,
use_index: true,
}
}
@@ -101,6 +103,19 @@ impl MergeInsertBuilder {
self
}
/// Controls whether to use indexes for the merge operation.
///
/// When set to `true` (the default), the operation will use an index if available
/// on the join key for improved performance. When set to `false`, it forces a full
/// table scan even if an index exists. This can be useful for benchmarking or when
/// the query optimizer chooses a suboptimal path.
///
/// Returns the builder itself so that further calls can be chained.
pub fn use_index(&mut self, use_index: bool) -> &mut Self {
self.use_index = use_index;
self
}
/// Executes the merge insert operation
///
/// Returns version and statistics about the merge operation including the number of rows