feat: implement bindings to return merge stats (#2367)

Based on this comment:
https://github.com/lancedb/lancedb/issues/2228#issuecomment-2730463075
and https://github.com/lancedb/lance/pull/2357

Here is my attempt at implementing bindings for returning merge stats
from a `merge_insert.execute` call for lancedb.

Note: I have almost no idea what I am doing in Rust but tried to follow
existing code patterns and pay attention to compiler hints.
- The change in the nodejs binding appeared to be necessary to get
compilation to work; presumably this could actually work properly by
returning some kind of NAPI JS object of the stats data?
- I am unsure of what to do with the remote/table.rs changes, which were
necessary for compilation to work; I assume this is related to LanceDB
cloud, but I am unsure of the best way to handle that at this point.

Proof of function:

```python
# Demonstrates that `merge_insert(...).execute(...)` returns merge statistics.
import pandas as pd
import lancedb


# Connect to a LanceDB database at a local path.
db = lancedb.connect("/tmp/test.db")

# Initial contents of the table: five rows with ids 1 through 5.
test_data = pd.DataFrame(
    {
        "title": ["Hello", "Test Document", "Example", "Data Sample", "Last One"],
        "id": [1, 2, 3, 4, 5],
        "content": [
            "World",
            "This is a test",
            "Another example",
            "More test data",
            "Final entry",
        ],
    }
)

# Create the "documents" table from the initial data, using overwrite mode.
table = db.create_table("documents", data=test_data, exist_ok=True, mode="overwrite")

# Data to merge in: ids 1 through 5 are already in the table (two of them
# carry changed titles) and id 6 is new.
update_data = pd.DataFrame(
    {
        "title": [
            "Hello, World",
            "Test Document, it's good",
            "Example",
            "Data Sample",
            "Last One",
            "New One",
        ],
        "id": [1, 2, 3, 4, 5, 6],
        "content": [
            "World",
            "This is a test",
            "Another example",
            "More test data",
            "Final entry",
            "New content",
        ],
    }
)

# Merge on the "id" column: update every matched row, insert every unmatched
# row, and capture the value returned by `execute`.
stats = (
    table.merge_insert(on="id")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute(update_data)
)

# Print the returned statistics (inserted / updated / deleted row counts).
print(stats)
```

returns

```
{'num_inserted_rows': 1, 'num_updated_rows': 5, 'num_deleted_rows': 0}
```

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Merge-insert operations now return detailed statistics, including
counts of inserted, updated, and deleted rows.
- **Bug Fixes**
- Tests updated to validate returned merge-insert statistics for
accuracy.
- **Documentation**
- Method documentation improved to reflect new return values and clarify
merge operation results.
- Added documentation for the new `MergeStats` interface detailing
operation statistics.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
Alex Pilon
2025-05-01 13:00:20 -04:00
committed by GitHub
parent 5deb26bc8b
commit f315f9665a
13 changed files with 140 additions and 34 deletions

View File

@@ -338,11 +338,16 @@ describe("merge insert", () => {
{ a: 3, b: "y" },
{ a: 4, b: "z" },
];
await table
const stats = await table
.mergeInsert("a")
.whenMatchedUpdateAll()
.whenNotMatchedInsertAll()
.execute(newData);
expect(stats.numInsertedRows).toBe(1n);
expect(stats.numUpdatedRows).toBe(2n);
expect(stats.numDeletedRows).toBe(0n);
const expected = [
{ a: 1, b: "a" },
{ a: 2, b: "x" },

View File

@@ -28,6 +28,7 @@ export {
FragmentSummaryStats,
Tags,
TagContents,
MergeStats,
} from "./native.js";
export {

View File

@@ -1,7 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import { Data, Schema, fromDataToBuffer } from "./arrow";
import { NativeMergeInsertBuilder } from "./native";
import { MergeStats, NativeMergeInsertBuilder } from "./native";
/** A builder used to create and run a merge insert operation */
export class MergeInsertBuilder {
@@ -73,9 +73,9 @@ export class MergeInsertBuilder {
/**
* Executes the merge insert operation
*
* Nothing is returned but the `Table` is updated
* @returns Statistics about the merge operation: counts of inserted, updated, and deleted rows
*/
async execute(data: Data): Promise<void> {
async execute(data: Data): Promise<MergeStats> {
let schema: Schema;
if (this.#schema instanceof Promise) {
schema = await this.#schema;
@@ -84,6 +84,6 @@ export class MergeInsertBuilder {
schema = this.#schema;
}
const buffer = await fromDataToBuffer(data, undefined, schema);
await this.#native.execute(buffer);
return await this.#native.execute(buffer);
}
}

View File

@@ -37,7 +37,7 @@ impl NativeMergeInsertBuilder {
}
#[napi(catch_unwind)]
pub async fn execute(&self, buf: Buffer) -> napi::Result<()> {
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeStats> {
let data = ipc_file_to_batches(buf.to_vec())
.and_then(IntoArrow::into_arrow)
.map_err(|e| {
@@ -46,12 +46,14 @@ impl NativeMergeInsertBuilder {
let this = self.clone();
this.inner.execute(data).await.map_err(|e| {
let stats = this.inner.execute(data).await.map_err(|e| {
napi::Error::from_reason(format!(
"Failed to execute merge insert: {}",
convert_error(&e)
))
})
})?;
Ok(stats.into())
}
}
@@ -60,3 +62,20 @@ impl From<MergeInsertBuilder> for NativeMergeInsertBuilder {
Self { inner }
}
}
/// Statistics about a merge insert operation, returned to JavaScript as a
/// plain object.
///
/// Each count is a `BigInt`; the accompanying JS test compares the values
/// against bigint literals such as `1n`.
#[napi(object)]
pub struct MergeStats {
/// Number of rows inserted by the merge.
pub num_inserted_rows: BigInt,
/// Number of rows updated by the merge.
pub num_updated_rows: BigInt,
/// Number of rows deleted by the merge.
pub num_deleted_rows: BigInt,
}
impl From<lancedb::table::MergeStats> for MergeStats {
fn from(stats: lancedb::table::MergeStats) -> Self {
Self {
num_inserted_rows: stats.num_inserted_rows.into(),
num_updated_rows: stats.num_updated_rows.into(),
num_deleted_rows: stats.num_deleted_rows.into(),
}
}
}