Compare commits


9 Commits

Author SHA1 Message Date
David Myriel
9e278fc5a6 fix small details 2025-05-05 23:03:17 +02:00
David Myriel
09fed1f286 add quickstart doc 2025-05-05 22:02:11 +02:00
Will Jones
cee2b5ea42 chore: upgrade pyarrow pin (#2192)
Closes #2191



## Summary by CodeRabbit

- **Chores**
  - Updated the required version of the pyarrow package to version 16 or higher.
  - Adjusted automated testing workflows to install pyarrow version 16 for compatibility checks.

2025-05-05 11:23:13 -07:00
Alex Pilon
f315f9665a feat: implement bindings to return merge stats (#2367)
Based on this comment:
https://github.com/lancedb/lancedb/issues/2228#issuecomment-2730463075
and https://github.com/lancedb/lance/pull/2357

Here is my attempt at implementing bindings for returning merge stats
from a `merge_insert.execute` call for lancedb.

Note: I have almost no idea what I am doing in Rust but tried to follow
existing code patterns and pay attention to compiler hints.
- The change in the nodejs binding appeared to be necessary to get
compilation to work; presumably this could actually work properly by
returning some kind of NAPI JS object of the stats data?
- I am unsure what to do with the remote/table.rs changes, which were
necessary for compilation to work; I assume this is related to LanceDB
Cloud, but I'm unsure of the best way to handle that at this point.

Proof of function:

```python
import pandas as pd
import lancedb


db = lancedb.connect("/tmp/test.db")

test_data = pd.DataFrame(
    {
        "title": ["Hello", "Test Document", "Example", "Data Sample", "Last One"],
        "id": [1, 2, 3, 4, 5],
        "content": [
            "World",
            "This is a test",
            "Another example",
            "More test data",
            "Final entry",
        ],
    }
)

table = db.create_table("documents", data=test_data, exist_ok=True, mode="overwrite")

update_data = pd.DataFrame(
    {
        "title": [
            "Hello, World",
            "Test Document, it's good",
            "Example",
            "Data Sample",
            "Last One",
            "New One",
        ],
        "id": [1, 2, 3, 4, 5, 6],
        "content": [
            "World",
            "This is a test",
            "Another example",
            "More test data",
            "Final entry",
            "New content",
        ],
    }
)

stats = (
    table.merge_insert(on="id")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute(update_data)
)

print(stats)
```

returns

```
{'num_inserted_rows': 1, 'num_updated_rows': 5, 'num_deleted_rows': 0}
```
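
For the Node.js bindings, the equivalent flow would look roughly like this (a sketch based on the updated `execute()` signature and the new `MergeStats` interface; the table and data setup here is illustrative, and note the counts come back as `bigint`):

```ts
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("/tmp/test.db");
// Assumes a "documents" table like the one created in the Python example above
const table = await db.openTable("documents");

const updateData = [
  { id: 1, title: "Hello, World", content: "World" },
  { id: 6, title: "New One", content: "New content" },
];

// Upsert on "id" and capture the returned MergeStats
const stats = await table
  .mergeInsert("id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute(updateData);

// Fields are bigint values, e.g. 1n inserted, 1n updated, 0n deleted
console.log(stats.numInsertedRows, stats.numUpdatedRows, stats.numDeletedRows);
```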

## Summary by CodeRabbit

- **New Features**
  - Merge-insert operations now return detailed statistics, including counts of inserted, updated, and deleted rows.
- **Bug Fixes**
  - Tests updated to validate returned merge-insert statistics for accuracy.
- **Documentation**
  - Method documentation improved to reflect new return values and clarify merge operation results.
  - Added documentation for the new `MergeStats` interface detailing operation statistics.

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2025-05-01 10:00:20 -07:00
Andrew C. Oliver
5deb26bc8b fix: prevent embedded objects from returning null in all of their fields (#2355)
With a nested field like metadata{filename=xyz}, filename would be there
structurally, but ALWAYS null.

I didn't include the script below as a file, but it may be useful for
understanding the problem for people searching on this issue, so I'm
including it here as documentation. Before this patch, any field more
than one level deep was accepted but returned null values for its
subfields when queried.

```js
const lancedb = require('@lancedb/lancedb');

// Debug logger
function debug(message, data) {
  console.log(`[TEST] ${message}`, data !== undefined ? data : '');
}

// Symbols used by the row proxy implementation (defined for reference; not used directly below)
const kParent = Symbol.for("parent");
const kRowIndex = Symbol.for("rowIndex");

// Override console.log for our test
const originalConsoleLog = console.log;
console.log = function() {
  // Filter out noisy LanceDB info logs
  if (arguments[0] && typeof arguments[0] === 'string' && arguments[0].includes('[INFO] [LanceDB]')) {
    return;
  }
  originalConsoleLog.apply(console, arguments);
};

async function main() {
  debug('Starting test...');
  
  // Connect to the database
  debug('Connecting to database...');
  const db = await lancedb.connect('./.lancedb');
  
  // Try to open an existing table, or create a new one if it doesn't exist
  let table;
  try {
    table = await db.openTable('test_nested_fields');
    debug('Opened existing table');
  } catch (e) {
    debug('Creating new table...');
    
    // Create test data with nested metadata structure
    const data = [
      {
        id: 'test1',
        vector: [1, 2, 3],
        metadata: {
          filePath: "/path/to/file1.ts",
          startLine: 10,
          endLine: 20,
          text: "function test() { return true; }"
        }
      },
      {
        id: 'test2',
        vector: [4, 5, 6],
        metadata: {
          filePath: "/path/to/file2.ts",
          startLine: 30,
          endLine: 40,
          text: "function test2() { return false; }"
        }
      }
    ];
    
    debug('Data to be inserted:', JSON.stringify(data, null, 2));
    
    // Create the table
    table = await db.createTable('test_nested_fields', data);
    debug('Table created successfully');
  }
  
  // Query the table and get results
  debug('Querying table...');
  const results = await table.search([1, 2, 3]).limit(10).toArray();
  
  // Log the results
  debug('Number of results:', results.length);
  
  if (results.length > 0) {
    const firstResult = results[0];
    debug('First result properties:', Object.keys(firstResult));
    
    // Check if metadata is accessible and what properties it has
    if (firstResult.metadata) {
      debug('Metadata properties:', Object.keys(firstResult.metadata));
      debug('Metadata filePath:', firstResult.metadata.filePath);
      debug('Metadata startLine:', firstResult.metadata.startLine);
      
      // Destructure to see if that helps
      const { filePath, startLine, endLine, text } = firstResult.metadata;
      debug('Destructured values:', { filePath, startLine, endLine, text });
      
      // Check if it's a proxy object
      debug('Result is proxy?', Object.getPrototypeOf(firstResult) === Object.prototype ? false : true);
      debug('Metadata is proxy?', Object.getPrototypeOf(firstResult.metadata) === Object.prototype ? false : true);
    } else {
      debug('Metadata is not accessible!');
    }
  }
  
  // Close the database
  await db.close();
}

main().catch(e => {
  console.error('Error:', e);
}); 
```
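
The root cause is visible in the `transposeData` change further down: Arrow's rows are proxy-backed, so an `Object.hasOwn` check can report `false` even though the property is reachable through the proxy's traps, and every nested value was dropped to `null`. A minimal stand-in proxy (illustrative only, not Arrow's actual row implementation) shows the difference between the two checks:

```ts
// A stand-in object whose properties exist only via proxy traps,
// roughly like a proxy-backed Arrow row (illustrative, not Arrow's code).
const row = new Proxy({} as Record<string, unknown>, {
  has: (_target, key) => key === "filePath",
  get: (_target, key) => (key === "filePath" ? "/path/to/file1.ts" : undefined),
});

console.log(Object.hasOwn(row, "filePath")); // false: no own-property descriptor on the target
console.log("filePath" in row);              // true: answered by the `has` trap
```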

## Summary by CodeRabbit

- **Bug Fixes**
  - Improved handling of nested struct fields to ensure accurate preservation of values during serialization and deserialization.
  - Enhanced robustness when accessing nested object properties, reducing errors with missing or null values.

- **Tests**
  - Added tests to verify correct handling of nested struct fields through serialization and deserialization.

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2025-05-01 09:38:55 -07:00
Lance Release
3cc670ac38 Updating package-lock.json 2025-04-29 23:21:19 +00:00
Lance Release
4ade3e31e2 Updating package-lock.json 2025-04-29 22:19:46 +00:00
Lance Release
a222d2cd91 Updating package-lock.json 2025-04-29 22:19:30 +00:00
Lance Release
508e621f3d Bump version: 0.19.1-beta.0 → 0.19.1-beta.1 2025-04-29 22:19:14 +00:00
39 changed files with 373 additions and 89 deletions

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.19.1-beta.0"
+current_version = "0.19.1-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

@@ -228,6 +228,7 @@ jobs:
       - name: Install lancedb
         run: |
           pip install "pydantic<2"
+          pip install pyarrow==16
           pip install --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
           pip install tantivy
       - name: Run tests

Cargo.lock (generated)

@@ -4135,7 +4135,7 @@ dependencies = [
 [[package]]
 name = "lancedb"
-version = "0.19.1-beta.0"
+version = "0.19.1-beta.1"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4222,7 +4222,7 @@ dependencies = [
 [[package]]
 name = "lancedb-node"
-version = "0.19.1-beta.0"
+version = "0.19.1-beta.1"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
@@ -4247,7 +4247,7 @@ dependencies = [
 [[package]]
 name = "lancedb-nodejs"
-version = "0.19.1-beta.0"
+version = "0.19.1-beta.1"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
@@ -4266,7 +4266,7 @@ dependencies = [
 [[package]]
 name = "lancedb-python"
-version = "0.22.1-beta.0"
+version = "0.22.1-beta.1"
 dependencies = [
  "arrow",
  "env_logger",

@@ -105,7 +105,8 @@ markdown_extensions:
 nav:
 - Home:
   - LanceDB: index.md
-  - 🏃🏼‍♂️ Quick start: basic.md
+  - 👉 Quickstart: quickstart.md
+  - 🏃🏼‍♂️ Basic Usage: basic.md
   - 📚 Concepts:
     - Vector search: concepts/vector_search.md
     - Indexing:
@@ -237,7 +238,9 @@ nav:
   - 👾 JavaScript (lancedb): js/globals.md
   - 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
-  - Quick start: basic.md
+  - Getting Started:
+    - Quickstart: quickstart.md
+    - Basic Usage: basic.md
   - Concepts:
     - Vector search: concepts/vector_search.md
     - Indexing:

@@ -1,4 +1,4 @@
-# Quick start
+# Basic Usage

 !!! info "LanceDB can be run in a number of ways:"

@@ -33,20 +33,20 @@ Construct a MergeInsertBuilder. __Internal use only.__
 ### execute()

 ```ts
-execute(data): Promise<void>
+execute(data): Promise<MergeStats>
 ```

 Executes the merge insert operation

-Nothing is returned but the `Table` is updated
-
 #### Parameters

 * **data**: [`Data`](../type-aliases/Data.md)

 #### Returns

-`Promise`&lt;`void`&gt;
+`Promise`&lt;[`MergeStats`](../interfaces/MergeStats.md)&gt;
+
+Statistics about the merge operation: counts of inserted, updated, and deleted rows

 ***

@@ -54,6 +54,7 @@
 - [IndexStatistics](interfaces/IndexStatistics.md)
 - [IvfFlatOptions](interfaces/IvfFlatOptions.md)
 - [IvfPqOptions](interfaces/IvfPqOptions.md)
+- [MergeStats](interfaces/MergeStats.md)
 - [OpenTableOptions](interfaces/OpenTableOptions.md)
 - [OptimizeOptions](interfaces/OptimizeOptions.md)
 - [OptimizeStats](interfaces/OptimizeStats.md)


@@ -0,0 +1,31 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / MergeStats
# Interface: MergeStats
## Properties
### numDeletedRows
```ts
numDeletedRows: bigint;
```
***
### numInsertedRows
```ts
numInsertedRows: bigint;
```
***
### numUpdatedRows
```ts
numUpdatedRows: bigint;
```

docs/src/quickstart.md (new file)

@@ -0,0 +1,101 @@
# Getting Started with LanceDB: A Minimal Vector Search Tutorial
Let's set up a LanceDB database, insert vector data, and perform a simple vector search. We'll use simple character classes like "knight" and "rogue" to illustrate semantic relevance.
## 1. Install Dependencies
Before starting, make sure you have the necessary packages:
```bash
pip install lancedb pandas numpy
```
## 2. Import Required Libraries
```python
import lancedb
import pandas as pd
import numpy as np
```
## 3. Connect to LanceDB
You can use a local directory to store your database:
```python
db = lancedb.connect("./lancedb")
```
## 4. Create Sample Data
Add sample text data and corresponding 4D vectors:
```python
data = pd.DataFrame([
{"id": "1", "vector": [1.0, 0.0, 0.0, 0.0], "text": "knight"},
{"id": "2", "vector": [0.9, 0.1, 0.0, 0.0], "text": "warrior"},
{"id": "3", "vector": [0.0, 1.0, 0.0, 0.0], "text": "rogue"},
{"id": "4", "vector": [0.0, 0.9, 0.1, 0.0], "text": "thief"},
{"id": "5", "vector": [0.5, 0.5, 0.0, 0.0], "text": "ranger"},
])
```
## 5. Create a Table in LanceDB
```python
table = db.create_table("rpg_classes", data=data, mode="overwrite")
```
Let's see how the table looks:
```python
print(data)
```
| id | vector | text |
|----|--------|------|
| 1 | [1.0, 0.0, 0.0, 0.0] | knight |
| 2 | [0.9, 0.1, 0.0, 0.0] | warrior |
| 3 | [0.0, 1.0, 0.0, 0.0] | rogue |
| 4 | [0.0, 0.9, 0.1, 0.0] | thief |
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger |
## 6. Perform a Vector Search
Search for the most similar character classes to our query vector:
```python
# Query as if we are searching for "rogue"
results = table.search([0.0, 1.0, 0.0, 0.0]).limit(3).to_pandas()
print(results)
```
This will return the top 3 closest classes to the vector, effectively showing how LanceDB can be used for semantic search.
| id | vector | text | _distance |
|------|------------------------|----------|-----------|
| 3 | [0.0, 1.0, 0.0, 0.0] | rogue | 0.00 |
| 4 | [0.0, 0.9, 0.1, 0.0] | thief | 0.02 |
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger | 0.50 |
Let's try searching for "knight"
```python
query_vector = [1.0, 0.0, 0.0, 0.0]
results = table.search(query_vector).limit(3).to_pandas()
print(results)
```
| id | vector | text | _distance |
|------|------------------------|----------|-----------|
| 1 | [1.0, 0.0, 0.0, 0.0] | knight | 0.00 |
| 2 | [0.9, 0.1, 0.0, 0.0] | warrior | 0.02 |
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger | 0.50 |
## Next Steps
That's it - you just conducted vector search!
For more beginner tips, check out the [Basic Usage](basic.md) guide.

@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.19.1-beta.0</version>
+    <version>0.19.1-beta.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

@@ -6,7 +6,7 @@
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.19.1-beta.0</version>
+  <version>0.19.1-beta.1</version>
   <packaging>pom</packaging>
   <name>LanceDB Parent</name>

node/package-lock.json (generated)

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.19.1-beta.0",
+      "version": "0.19.1-beta.1",
       "cpu": [
         "x64",
         "arm64"
@@ -52,11 +52,11 @@
       "uuid": "^9.0.0"
     },
     "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.19.1-beta.0",
-      "@lancedb/vectordb-darwin-x64": "0.19.1-beta.0",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.0",
-      "@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.0",
-      "@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.0"
+      "@lancedb/vectordb-darwin-arm64": "0.19.1-beta.1",
+      "@lancedb/vectordb-darwin-x64": "0.19.1-beta.1",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.1",
+      "@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.1",
+      "@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.1"
     },
     "peerDependencies": {
       "@apache-arrow/ts": "^14.0.2",
@@ -327,9 +327,9 @@
       }
     },
     "node_modules/@lancedb/vectordb-darwin-arm64": {
-      "version": "0.19.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.1-beta.0.tgz",
-      "integrity": "sha512-LXdyFLniwciIslw487JRmYB+PpuwBhq4jT9hNxKKMrnXfBvUEQ8HN9kxkLFy+pwFaY1wmFH0EfujnXlt8qXgqA==",
+      "version": "0.19.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.1-beta.1.tgz",
+      "integrity": "sha512-Epvel0pF5TM6MtIWQ2KhqezqSSHTL3Wr7a2rGAwz6X/XY23i6DbMPpPs0HyeIDzDrhxNfE3cz3S+SiCA6xpR0g==",
       "cpu": [
         "arm64"
       ],
@@ -340,9 +340,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-darwin-x64": {
-      "version": "0.19.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.1-beta.0.tgz",
-      "integrity": "sha512-kVZzc6j778vg3AftFdu8pSg/3J9YHCID70bf3QpMnl43b9ZSl4Sh8WNVYWiqfwOAElD+kOaTq9MC4f7jkW/DEQ==",
+      "version": "0.19.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.1-beta.1.tgz",
+      "integrity": "sha512-hOiUSlIoISbiXytp46hToi/r6sF5pImAsfbzCsIq8ExDV4TPa8fjbhcIT80vxxOwc2mpSSK4HsVJYod95RSbEQ==",
       "cpu": [
         "x64"
       ],
@@ -353,9 +353,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.19.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.1-beta.0.tgz",
-      "integrity": "sha512-EOrAwqUkCXBp7GVwzhyUoFIUbV+7eXDbEHo6mHgZN6E1SOAgBuPiwRqDlqA4uCYU8YhZJZabXdiLTP+EuSjpgw==",
+      "version": "0.19.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.1-beta.1.tgz",
+      "integrity": "sha512-/1JhGVDEngwrlM8o2TNW8G6nJ9U/VgHKAORmj/cTA7O30helJIoo9jfvUAUy+vZ4VoEwRXQbMI+gaYTg0l3MTg==",
       "cpu": [
         "arm64"
       ],
@@ -366,9 +366,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.19.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.1-beta.0.tgz",
-      "integrity": "sha512-Xs7wlM6LJS4/jps9Vi1TMtR9kYedzShZQ1sepJzn2cuI1jnbCrlLsFSMUfjhxy7/0+E8QO1f7cBO6gwOTunrog==",
+      "version": "0.19.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.1-beta.1.tgz",
+      "integrity": "sha512-zNRGSSUt8nTJMmll4NdxhQjwxR8Rezq3T4dsRoiDts5ienMam5HFjYiZ3FkDZQo16rgq2BcbFuH1G8u1chywlg==",
       "cpu": [
         "x64"
       ],
@@ -379,9 +379,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.19.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.1-beta.0.tgz",
-      "integrity": "sha512-tS9g2S4hxdsQfVd6S6N+HuX1Mt4iJEQjpHmmfz+WEfdc5JP6eyKoqqQzFw2oL3TJAG7HXfD88/24nCUUCHDKoA==",
+      "version": "0.19.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.1-beta.1.tgz",
+      "integrity": "sha512-yV550AJGlsIFdm1KoHQPJ1TZx121ZXCIdebBtBZj3wOObIhyB/i0kZAtGvwjkmr7EYyfzt1EHZzbjSGVdehIAA==",
       "cpu": [
         "x64"
       ],

@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "description": " Serverless, low-latency vector database for AI applications",
   "private": false,
   "main": "dist/index.js",
@@ -89,10 +89,10 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.19.1-beta.0",
-    "@lancedb/vectordb-darwin-arm64": "0.19.1-beta.0",
-    "@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.0",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.0",
-    "@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.0"
+    "@lancedb/vectordb-darwin-x64": "0.19.1-beta.1",
+    "@lancedb/vectordb-darwin-arm64": "0.19.1-beta.1",
+    "@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.1",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.1",
+    "@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.1"
   }
 }

@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.19.1-beta.0"
+version = "0.19.1-beta.1"
 license.workspace = true
 description.workspace = true
 repository.workspace = true

@@ -374,6 +374,71 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
     expect(table2.numRows).toBe(4);
     expect(table2.schema).toEqual(schema);
   });
+
+  it("should correctly retain values in nested struct fields", async function () {
+    // Define test data with nested struct
+    const testData = [
+      {
+        id: "doc1",
+        vector: [1, 2, 3],
+        metadata: {
+          filePath: "/path/to/file1.ts",
+          startLine: 10,
+          endLine: 20,
+          text: "function test() { return true; }",
+        },
+      },
+      {
+        id: "doc2",
+        vector: [4, 5, 6],
+        metadata: {
+          filePath: "/path/to/file2.ts",
+          startLine: 30,
+          endLine: 40,
+          text: "function test2() { return false; }",
+        },
+      },
+    ];
+
+    // Create Arrow table from the data
+    const table = makeArrowTable(testData);
+
+    // Verify schema has the nested struct fields
+    const metadataField = table.schema.fields.find(
+      (f) => f.name === "metadata",
+    );
+    expect(metadataField).toBeDefined();
+    // biome-ignore lint/suspicious/noExplicitAny: accessing fields in different Arrow versions
+    const childNames = metadataField?.type.children.map((c: any) => c.name);
+    expect(childNames).toEqual([
+      "filePath",
+      "startLine",
+      "endLine",
+      "text",
+    ]);
+
+    // Convert to buffer and back (simulating storage and retrieval)
+    const buf = await fromTableToBuffer(table);
+    const retrievedTable = tableFromIPC(buf);
+
+    // Verify the retrieved table has the same structure
+    const rows = [];
+    for (let i = 0; i < retrievedTable.numRows; i++) {
+      rows.push(retrievedTable.get(i));
+    }
+
+    // Check values in the first row
+    const firstRow = rows[0];
+    expect(firstRow.id).toBe("doc1");
+    expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
+
+    // Verify metadata values are preserved (this is where the bug is)
+    expect(firstRow.metadata).toBeDefined();
+    expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
+    expect(firstRow.metadata.startLine).toBe(10);
+    expect(firstRow.metadata.endLine).toBe(20);
+    expect(firstRow.metadata.text).toBe("function test() { return true; }");
+  });
 });

 class DummyEmbedding extends EmbeddingFunction<string> {

@@ -338,11 +338,16 @@ describe("merge insert", () => {
       { a: 3, b: "y" },
       { a: 4, b: "z" },
     ];
-    await table
+    const stats = await table
       .mergeInsert("a")
       .whenMatchedUpdateAll()
       .whenNotMatchedInsertAll()
       .execute(newData);
+    expect(stats.numInsertedRows).toBe(1n);
+    expect(stats.numUpdatedRows).toBe(2n);
+    expect(stats.numDeletedRows).toBe(0n);
+
     const expected = [
       { a: 1, b: "a" },
       { a: 2, b: "x" },

@@ -639,8 +639,9 @@ function transposeData(
 ): Vector {
   if (field.type instanceof Struct) {
     const childFields = field.type.children;
+    const fullPath = [...path, field.name];
     const childVectors = childFields.map((child) => {
-      return transposeData(data, child, [...path, child.name]);
+      return transposeData(data, child, fullPath);
     });
     const structData = makeData({
       type: field.type,
@@ -652,7 +653,14 @@ function transposeData(
     const values = data.map((datum) => {
       let current: unknown = datum;
       for (const key of valuesPath) {
-        if (isObject(current) && Object.hasOwn(current, key)) {
+        if (current == null) {
+          return null;
+        }
+        if (
+          isObject(current) &&
+          (Object.hasOwn(current, key) || key in current)
+        ) {
           current = current[key];
         } else {
           return null;

@@ -28,6 +28,7 @@ export {
   FragmentSummaryStats,
   Tags,
   TagContents,
+  MergeStats,
 } from "./native.js";

 export {

@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The LanceDB Authors

 import { Data, Schema, fromDataToBuffer } from "./arrow";
-import { NativeMergeInsertBuilder } from "./native";
+import { MergeStats, NativeMergeInsertBuilder } from "./native";

 /** A builder used to create and run a merge insert operation */
 export class MergeInsertBuilder {
@@ -73,9 +73,9 @@ export class MergeInsertBuilder {
   /**
    * Executes the merge insert operation
    *
-   * Nothing is returned but the `Table` is updated
+   * @returns Statistics about the merge operation: counts of inserted, updated, and deleted rows
    */
-  async execute(data: Data): Promise<void> {
+  async execute(data: Data): Promise<MergeStats> {
     let schema: Schema;
     if (this.#schema instanceof Promise) {
       schema = await this.#schema;
@@ -84,6 +84,6 @@ export class MergeInsertBuilder {
       schema = this.#schema;
     }
     const buffer = await fromDataToBuffer(data, undefined, schema);
-    await this.#native.execute(buffer);
+    return await this.#native.execute(buffer);
   }
 }

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": [
     "win32"
   ],

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",

@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.19.1-beta.0",
+      "version": "0.19.1-beta.1",
       "cpu": [
         "x64",
         "arm64"

@@ -11,7 +11,7 @@
     "ann"
   ],
   "private": false,
-  "version": "0.19.1-beta.0",
+  "version": "0.19.1-beta.1",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",

@@ -37,7 +37,7 @@ impl NativeMergeInsertBuilder {
     }

     #[napi(catch_unwind)]
-    pub async fn execute(&self, buf: Buffer) -> napi::Result<()> {
+    pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeStats> {
         let data = ipc_file_to_batches(buf.to_vec())
             .and_then(IntoArrow::into_arrow)
             .map_err(|e| {
@@ -46,12 +46,14 @@ impl NativeMergeInsertBuilder {
         let this = self.clone();

-        this.inner.execute(data).await.map_err(|e| {
+        let stats = this.inner.execute(data).await.map_err(|e| {
             napi::Error::from_reason(format!(
                 "Failed to execute merge insert: {}",
                 convert_error(&e)
             ))
-        })
+        })?;
+
+        Ok(stats.into())
     }
 }
@@ -60,3 +62,20 @@ impl From<MergeInsertBuilder> for NativeMergeInsertBuilder {
         Self { inner }
     }
 }
+
+#[napi(object)]
+pub struct MergeStats {
+    pub num_inserted_rows: BigInt,
+    pub num_updated_rows: BigInt,
+    pub num_deleted_rows: BigInt,
+}
+
+impl From<lancedb::table::MergeStats> for MergeStats {
+    fn from(stats: lancedb::table::MergeStats) -> Self {
+        Self {
+            num_inserted_rows: stats.num_inserted_rows.into(),
+            num_updated_rows: stats.num_updated_rows.into(),
+            num_deleted_rows: stats.num_deleted_rows.into(),
+        }
+    }
+}

@@ -7,7 +7,7 @@ dependencies = [
     "numpy",
     "overrides>=0.7",
     "packaging",
-    "pyarrow>=14",
+    "pyarrow>=16",
     "pydantic>=1.10",
     "tqdm>=4.27.0",
 ]

@@ -962,10 +962,12 @@ class Table(ABC):
         >>> table = db.create_table("my_table", data)
         >>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
         >>> # Perform a "upsert" operation
-        >>> table.merge_insert("a")     \\
+        >>> stats = table.merge_insert("a")     \\
         ...     .when_matched_update_all()     \\
         ...     .when_not_matched_insert_all() \\
         ...     .execute(new_data)
+        >>> stats
+        {'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
         >>> # The order of new rows is non-deterministic since we use
         >>> # a hash-join as part of this operation and so we sort here
         >>> table.to_arrow().sort_by("a").to_pandas()
@@ -2489,7 +2491,9 @@ class LanceTable(Table):
         on_bad_vectors: OnBadVectorsType,
         fill_value: float,
     ):
-        LOOP.run(self._table._do_merge(merge, new_data, on_bad_vectors, fill_value))
+        return LOOP.run(
+            self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)
+        )

     @deprecation.deprecated(
         deprecated_in="0.21.0",
@@ -3277,10 +3281,12 @@ class AsyncTable:
         >>> table = db.create_table("my_table", data)
         >>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
         >>> # Perform a "upsert" operation
-        >>> table.merge_insert("a")     \\
+        >>> stats = table.merge_insert("a")     \\
         ...     .when_matched_update_all()     \\
         ...     .when_not_matched_insert_all() \\
         ...     .execute(new_data)
+        >>> stats
+        {'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
         >>> # The order of new rows is non-deterministic since we use
         >>> # a hash-join as part of this operation and so we sort here
         >>> table.to_arrow().sort_by("a").to_pandas()
@@ -3636,7 +3642,7 @@ class AsyncTable:
         )
         if isinstance(data, pa.Table):
             data = pa.RecordBatchReader.from_batches(data.schema, data.to_batches())
-        await self._inner.execute_merge_insert(
+        return await self._inner.execute_merge_insert(
             data,
             dict(
                 on=merge._on,

@@ -18,15 +18,19 @@ def test_upsert(mem_db):
         {"id": 1, "name": "Bobby"},
         {"id": 2, "name": "Charlie"},
     ]
-    (
+    stats = (
         table.merge_insert("id")
         .when_matched_update_all()
         .when_not_matched_insert_all()
         .execute(new_users)
     )
     table.count_rows()  # 3
+    stats  # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}
     # --8<-- [end:upsert_basic]
     assert table.count_rows() == 3
+    assert stats["num_inserted_rows"] == 1
+    assert stats["num_updated_rows"] == 1
+    assert stats["num_deleted_rows"] == 0


 @pytest.mark.asyncio
@@ -44,15 +48,19 @@ async def test_upsert_async(mem_db_async):
         {"id": 1, "name": "Bobby"},
         {"id": 2, "name": "Charlie"},
     ]
-    await (
+    stats = await (
         table.merge_insert("id")
        .when_matched_update_all()
         .when_not_matched_insert_all()
         .execute(new_users)
     )
     await table.count_rows()  # 3
+    stats  # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}
     # --8<-- [end:upsert_basic_async]
     assert await table.count_rows() == 3
+    assert stats["num_inserted_rows"] == 1
+    assert stats["num_updated_rows"] == 1
+    assert stats["num_deleted_rows"] == 0


 def test_insert_if_not_exists(mem_db):
@@ -69,10 +77,16 @@ def test_insert_if_not_exists(mem_db):
         {"domain": "google.com", "name": "Google"},
         {"domain": "facebook.com", "name": "Facebook"},
     ]
-    (table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains))
+    stats = (
+        table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
+    )
     table.count_rows()  # 3
+    stats  # {'num_inserted_rows': 1, 'num_updated_rows': 0, 'num_deleted_rows': 0}
     # --8<-- [end:insert_if_not_exists]
     assert table.count_rows() == 3
+    assert stats["num_inserted_rows"] == 1
+    assert stats["num_updated_rows"] == 0
+    assert stats["num_deleted_rows"] == 0


 @pytest.mark.asyncio
@@ -90,12 +104,16 @@ async def test_insert_if_not_exists_async(mem_db_async):
         {"domain": "google.com", "name": "Google"},
         {"domain": "facebook.com", "name": "Facebook"},
     ]
-    await (
+    stats = await (
         table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
     )
     await table.count_rows()  # 3
+    stats  # {'num_inserted_rows': 1, 'num_updated_rows': 0, 'num_deleted_rows': 0}
     # --8<-- [end:insert_if_not_exists_async]
     assert await table.count_rows() == 3
+    assert stats["num_inserted_rows"] == 1
+    assert stats["num_updated_rows"] == 0
+    assert stats["num_deleted_rows"] == 0


 def test_replace_range(mem_db):
@@ -113,7 +131,7 @@ def test_replace_range(mem_db):
     new_chunks = [
         {"doc_id": 1, "chunk_id": 0, "text": "Baz"},
     ]
-    (
+    stats = (
         table.merge_insert(["doc_id", "chunk_id"])
         .when_matched_update_all()
         .when_not_matched_insert_all()
@@ -121,8 +139,12 @@ def test_replace_range(mem_db):
         .execute(new_chunks)
     )
     table.count_rows("doc_id = 1")  # 1
+    stats  # {'num_inserted_rows': 0, 'num_updated_rows': 1, 'num_deleted_rows': 1}
     # --8<-- [end:replace_range]
     assert table.count_rows("doc_id = 1") == 1
+    assert stats["num_inserted_rows"] == 0
+    assert stats["num_updated_rows"] == 1
+    assert stats["num_deleted_rows"] == 1


 @pytest.mark.asyncio
@@ -141,7 +163,7 @@ async def test_replace_range_async(mem_db_async):
     new_chunks = [
         {"doc_id": 1, "chunk_id": 0, "text": "Baz"},
     ]
-    await (
+    stats = await (
         table.merge_insert(["doc_id", "chunk_id"])
         .when_matched_update_all()
         .when_not_matched_insert_all()
@@ -149,5 +171,9 @@ async def test_replace_range_async(mem_db_async):
         .execute(new_chunks)
     )
     await table.count_rows("doc_id = 1")  # 1
+    stats  # {'num_inserted_rows': 0, 'num_updated_rows': 1, 'num_deleted_rows': 1}
     # --8<-- [end:replace_range_async]
     assert await table.count_rows("doc_id = 1") == 1
+    assert stats["num_inserted_rows"] == 0
+    assert stats["num_updated_rows"] == 1
+    assert stats["num_deleted_rows"] == 1

@@ -489,8 +489,14 @@ impl Table {
         }

         future_into_py(self_.py(), async move {
-            builder.execute(Box::new(batches)).await.infer_error()?;
-            Ok(())
+            let stats = builder.execute(Box::new(batches)).await.infer_error()?;
+            Python::with_gil(|py| {
+                let dict = PyDict::new(py);
+                dict.set_item("num_inserted_rows", stats.num_inserted_rows)?;
+                dict.set_item("num_updated_rows", stats.num_updated_rows)?;
+                dict.set_item("num_deleted_rows", stats.num_deleted_rows)?;
+                Ok(dict.unbind())
+            })
         })
     }

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.19.1-beta.0"
+version = "0.19.1-beta.1"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.19.1-beta.0"
+version = "0.19.1-beta.1"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true

@@ -47,6 +47,7 @@ use crate::{
         TableDefinition, UpdateBuilder,
     },
 };
+use lance::dataset::MergeStats;

 const REQUEST_TIMEOUT_HEADER: HeaderName = HeaderName::from_static("x-request-timeout-ms");
@@ -1022,7 +1023,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
         &self,
         params: MergeInsertBuilder,
         new_data: Box<dyn RecordBatchReader + Send>,
-    ) -> Result<()> {
+    ) -> Result<MergeStats> {
         self.check_mutable().await?;
         let query = MergeInsertRequest::try_from(params)?;
@@ -1034,9 +1035,11 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
         let (request_id, response) = self.send_streaming(request, new_data, true).await?;

+        // TODO: the server can respond with these stats in the response body.
+        // We should test that we can handle both an empty response from an old server
+        // and a response with stats from a new server.
         self.check_table_response(&request_id, response).await?;

-        Ok(())
+        Ok(MergeStats::default())
     }

     async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
@@ -1348,7 +1351,12 @@ mod tests {
             Box::pin(table.count_rows(None).map_ok(|_| ())),
             Box::pin(table.update().column("a", "a + 1").execute().map_ok(|_| ())),
             Box::pin(table.add(example_data()).execute().map_ok(|_| ())),
-            Box::pin(table.merge_insert(&["test"]).execute(example_data())),
+            Box::pin(
+                table
+                    .merge_insert(&["test"])
+                    .execute(example_data())
+                    .map_ok(|_| ()),
+            ),
             Box::pin(table.delete("false")),
             Box::pin(table.add_columns(
                 NewColumnTransform::SqlExpressions(vec![("x".into(), "y".into())]),

@@ -20,6 +20,7 @@ use lance::dataset::cleanup::RemovalStats;
 use lance::dataset::optimize::{compact_files, CompactionMetrics, IndexRemapperOptions};
 use lance::dataset::scanner::Scanner;
 pub use lance::dataset::ColumnAlteration;
+pub use lance::dataset::MergeStats;
 pub use lance::dataset::NewColumnTransform;
 pub use lance::dataset::ReadParams;
 pub use lance::dataset::Version;
@@ -487,7 +488,7 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
         &self,
         params: MergeInsertBuilder,
         new_data: Box<dyn RecordBatchReader + Send>,
-    ) -> Result<()>;
+    ) -> Result<MergeStats>;

     /// Gets the table tag manager.
     async fn tags(&self) -> Result<Box<dyn Tags + '_>>;

     /// Optimize the dataset.
@@ -2367,7 +2368,7 @@ impl BaseTable for NativeTable {
         &self,
         params: MergeInsertBuilder,
         new_data: Box<dyn RecordBatchReader + Send>,
-    ) -> Result<()> {
+    ) -> Result<MergeStats> {
         let dataset = Arc::new(self.dataset.get().await?.clone());
         let mut builder = LanceMergeInsertBuilder::try_new(dataset.clone(), params.on)?;
         match (
@@ -2394,9 +2395,9 @@ impl BaseTable for NativeTable {
             builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
         }
         let job = builder.try_build()?;
-        let (new_dataset, _stats) = job.execute_reader(new_data).await?;
+        let (new_dataset, stats) = job.execute_reader(new_data).await?;
         self.dataset.set_latest(new_dataset.as_ref().clone()).await;
-        Ok(())
+        Ok(stats)
     }

     /// Delete rows from the table

@@ -4,6 +4,7 @@
 use std::sync::Arc;

 use arrow_array::RecordBatchReader;
+use lance::dataset::MergeStats;

 use crate::Result;
@@ -86,8 +87,9 @@ impl MergeInsertBuilder {
     /// Executes the merge insert operation
     ///
-    /// Nothing is returned but the [`super::Table`] is updated
-    pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<()> {
+    /// Returns statistics about the merge operation including the number of rows
+    /// inserted, updated, and deleted.
+    pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<MergeStats> {
         self.table.clone().merge_insert(self, new_data).await
     }
 }
} }