mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
8 Commits
v0.19.1-be
...
docs/quick
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9e278fc5a6 | ||
|
|
09fed1f286 | ||
|
|
cee2b5ea42 | ||
|
|
f315f9665a | ||
|
|
5deb26bc8b | ||
|
|
3cc670ac38 | ||
|
|
4ade3e31e2 | ||
|
|
a222d2cd91 |
1
.github/workflows/python.yml
vendored
1
.github/workflows/python.yml
vendored
@@ -228,6 +228,7 @@ jobs:
|
||||
- name: Install lancedb
|
||||
run: |
|
||||
pip install "pydantic<2"
|
||||
pip install pyarrow==16
|
||||
pip install --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
|
||||
pip install tantivy
|
||||
- name: Run tests
|
||||
|
||||
8
Cargo.lock
generated
8
Cargo.lock
generated
@@ -4135,7 +4135,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb"
|
||||
version = "0.19.1-beta.0"
|
||||
version = "0.19.1-beta.1"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4222,7 +4222,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-node"
|
||||
version = "0.19.1-beta.0"
|
||||
version = "0.19.1-beta.1"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ipc",
|
||||
@@ -4247,7 +4247,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-nodejs"
|
||||
version = "0.19.1-beta.0"
|
||||
version = "0.19.1-beta.1"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ipc",
|
||||
@@ -4266,7 +4266,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-python"
|
||||
version = "0.22.1-beta.0"
|
||||
version = "0.22.1-beta.1"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"env_logger",
|
||||
|
||||
@@ -105,7 +105,8 @@ markdown_extensions:
|
||||
nav:
|
||||
- Home:
|
||||
- LanceDB: index.md
|
||||
- 🏃🏼♂️ Quick start: basic.md
|
||||
- 👉 Quickstart: quickstart.md
|
||||
- 🏃🏼♂️ Basic Usage: basic.md
|
||||
- 📚 Concepts:
|
||||
- Vector search: concepts/vector_search.md
|
||||
- Indexing:
|
||||
@@ -237,7 +238,9 @@ nav:
|
||||
- 👾 JavaScript (lancedb): js/globals.md
|
||||
- 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
|
||||
|
||||
- Quick start: basic.md
|
||||
- Getting Started:
|
||||
- Quickstart: quickstart.md
|
||||
- Basic Usage: basic.md
|
||||
- Concepts:
|
||||
- Vector search: concepts/vector_search.md
|
||||
- Indexing:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Quick start
|
||||
# Basic Usage
|
||||
|
||||
!!! info "LanceDB can be run in a number of ways:"
|
||||
|
||||
|
||||
@@ -33,20 +33,20 @@ Construct a MergeInsertBuilder. __Internal use only.__
|
||||
### execute()
|
||||
|
||||
```ts
|
||||
execute(data): Promise<void>
|
||||
execute(data): Promise<MergeStats>
|
||||
```
|
||||
|
||||
Executes the merge insert operation
|
||||
|
||||
Nothing is returned but the `Table` is updated
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: [`Data`](../type-aliases/Data.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`MergeStats`](../interfaces/MergeStats.md)>
|
||||
|
||||
Statistics about the merge operation: counts of inserted, updated, and deleted rows
|
||||
|
||||
***
|
||||
|
||||
|
||||
@@ -54,6 +54,7 @@
|
||||
- [IndexStatistics](interfaces/IndexStatistics.md)
|
||||
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
|
||||
- [IvfPqOptions](interfaces/IvfPqOptions.md)
|
||||
- [MergeStats](interfaces/MergeStats.md)
|
||||
- [OpenTableOptions](interfaces/OpenTableOptions.md)
|
||||
- [OptimizeOptions](interfaces/OptimizeOptions.md)
|
||||
- [OptimizeStats](interfaces/OptimizeStats.md)
|
||||
|
||||
31
docs/src/js/interfaces/MergeStats.md
Normal file
31
docs/src/js/interfaces/MergeStats.md
Normal file
@@ -0,0 +1,31 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / MergeStats
|
||||
|
||||
# Interface: MergeStats
|
||||
|
||||
## Properties
|
||||
|
||||
### numDeletedRows
|
||||
|
||||
```ts
|
||||
numDeletedRows: bigint;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### numInsertedRows
|
||||
|
||||
```ts
|
||||
numInsertedRows: bigint;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### numUpdatedRows
|
||||
|
||||
```ts
|
||||
numUpdatedRows: bigint;
|
||||
```
|
||||
101
docs/src/quickstart.md
Normal file
101
docs/src/quickstart.md
Normal file
@@ -0,0 +1,101 @@
|
||||
|
||||
# Getting Started with LanceDB: A Minimal Vector Search Tutorial
|
||||
|
||||
Let's set up a LanceDB database, insert vector data, and perform a simple vector search. We'll use simple character classes like "knight" and "rogue" to illustrate semantic relevance.
|
||||
|
||||
## 1. Install Dependencies
|
||||
|
||||
Before starting, make sure you have the necessary packages:
|
||||
|
||||
```bash
|
||||
pip install lancedb pandas numpy
|
||||
```
|
||||
|
||||
## 2. Import Required Libraries
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
```
|
||||
|
||||
## 3. Connect to LanceDB
|
||||
|
||||
You can use a local directory to store your database:
|
||||
|
||||
```python
|
||||
db = lancedb.connect("./lancedb")
|
||||
```
|
||||
|
||||
## 4. Create Sample Data
|
||||
|
||||
Add sample text data and corresponding 4D vectors:
|
||||
|
||||
```python
|
||||
data = pd.DataFrame([
|
||||
{"id": "1", "vector": [1.0, 0.0, 0.0, 0.0], "text": "knight"},
|
||||
{"id": "2", "vector": [0.9, 0.1, 0.0, 0.0], "text": "warrior"},
|
||||
{"id": "3", "vector": [0.0, 1.0, 0.0, 0.0], "text": "rogue"},
|
||||
{"id": "4", "vector": [0.0, 0.9, 0.1, 0.0], "text": "thief"},
|
||||
{"id": "5", "vector": [0.5, 0.5, 0.0, 0.0], "text": "ranger"},
|
||||
])
|
||||
```
|
||||
|
||||
## 5. Create a Table in LanceDB
|
||||
|
||||
```python
|
||||
table = db.create_table("rpg_classes", data=data, mode="overwrite")
|
||||
```
|
||||
|
||||
Let's see how the table looks:
|
||||
```python
|
||||
print(data)
|
||||
```
|
||||
|
||||
| id | vector | text |
|
||||
|----|--------|------|
|
||||
| 1 | [1.0, 0.0, 0.0, 0.0] | knight |
|
||||
| 2 | [0.9, 0.1, 0.0, 0.0] | warrior |
|
||||
| 3 | [0.0, 1.0, 0.0, 0.0] | rogue |
|
||||
| 4 | [0.0, 0.9, 0.1, 0.0] | thief |
|
||||
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger |
|
||||
|
||||
|
||||
|
||||
## 6. Perform a Vector Search
|
||||
|
||||
Search for the most similar character classes to our query vector:
|
||||
|
||||
```python
|
||||
# Query as if we are searching for "rogue"
|
||||
results = table.search([0.95, 0.05, 0.0, 0.0]).limit(3).to_df()
|
||||
print(results)
|
||||
```
|
||||
|
||||
This will return the top 3 closest classes to the vector, effectively showing how LanceDB can be used for semantic search.
|
||||
|
||||
| id | vector | text | _distance |
|
||||
|------|------------------------|----------|-----------|
|
||||
| 3 | [0.0, 1.0, 0.0, 0.0] | rogue | 0.00 |
|
||||
| 4 | [0.0, 0.9, 0.1, 0.0] | thief | 0.02 |
|
||||
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger | 0.50 |
|
||||
|
||||
Let's try searching for "knight"
|
||||
|
||||
```python
|
||||
query_vector = [1.0, 0.0, 0.0, 0.0]
|
||||
results = table.search(query_vector).limit(3).to_pandas()
|
||||
print(results)
|
||||
```
|
||||
|
||||
| id | vector | text | _distance |
|
||||
|------|------------------------|----------|-----------|
|
||||
| 1 | [1.0, 0.0, 0.0, 0.0] | knight | 0.00 |
|
||||
| 2 | [0.9, 0.1, 0.0, 0.0] | warrior | 0.02 |
|
||||
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger | 0.50 |
|
||||
|
||||
## Next Steps
|
||||
|
||||
That's it - you just conducted vector search!
|
||||
|
||||
For more beginner tips, check out the [Basic Usage](basic.md) guide.
|
||||
44
node/package-lock.json
generated
44
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.19.1-beta.0",
|
||||
"version": "0.19.1-beta.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.19.1-beta.0",
|
||||
"version": "0.19.1-beta.1",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,11 +52,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.1-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.1-beta.1",
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.1-beta.1",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.1",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.1",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -327,9 +327,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.19.1-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.1-beta.0.tgz",
|
||||
"integrity": "sha512-LXdyFLniwciIslw487JRmYB+PpuwBhq4jT9hNxKKMrnXfBvUEQ8HN9kxkLFy+pwFaY1wmFH0EfujnXlt8qXgqA==",
|
||||
"version": "0.19.1-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.1-beta.1.tgz",
|
||||
"integrity": "sha512-Epvel0pF5TM6MtIWQ2KhqezqSSHTL3Wr7a2rGAwz6X/XY23i6DbMPpPs0HyeIDzDrhxNfE3cz3S+SiCA6xpR0g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -340,9 +340,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.19.1-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.1-beta.0.tgz",
|
||||
"integrity": "sha512-kVZzc6j778vg3AftFdu8pSg/3J9YHCID70bf3QpMnl43b9ZSl4Sh8WNVYWiqfwOAElD+kOaTq9MC4f7jkW/DEQ==",
|
||||
"version": "0.19.1-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.1-beta.1.tgz",
|
||||
"integrity": "sha512-hOiUSlIoISbiXytp46hToi/r6sF5pImAsfbzCsIq8ExDV4TPa8fjbhcIT80vxxOwc2mpSSK4HsVJYod95RSbEQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -353,9 +353,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.19.1-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.1-beta.0.tgz",
|
||||
"integrity": "sha512-EOrAwqUkCXBp7GVwzhyUoFIUbV+7eXDbEHo6mHgZN6E1SOAgBuPiwRqDlqA4uCYU8YhZJZabXdiLTP+EuSjpgw==",
|
||||
"version": "0.19.1-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.1-beta.1.tgz",
|
||||
"integrity": "sha512-/1JhGVDEngwrlM8o2TNW8G6nJ9U/VgHKAORmj/cTA7O30helJIoo9jfvUAUy+vZ4VoEwRXQbMI+gaYTg0l3MTg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -366,9 +366,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.19.1-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.1-beta.0.tgz",
|
||||
"integrity": "sha512-Xs7wlM6LJS4/jps9Vi1TMtR9kYedzShZQ1sepJzn2cuI1jnbCrlLsFSMUfjhxy7/0+E8QO1f7cBO6gwOTunrog==",
|
||||
"version": "0.19.1-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.1-beta.1.tgz",
|
||||
"integrity": "sha512-zNRGSSUt8nTJMmll4NdxhQjwxR8Rezq3T4dsRoiDts5ienMam5HFjYiZ3FkDZQo16rgq2BcbFuH1G8u1chywlg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -379,9 +379,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.19.1-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.1-beta.0.tgz",
|
||||
"integrity": "sha512-tS9g2S4hxdsQfVd6S6N+HuX1Mt4iJEQjpHmmfz+WEfdc5JP6eyKoqqQzFw2oL3TJAG7HXfD88/24nCUUCHDKoA==",
|
||||
"version": "0.19.1-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.1-beta.1.tgz",
|
||||
"integrity": "sha512-yV550AJGlsIFdm1KoHQPJ1TZx121ZXCIdebBtBZj3wOObIhyB/i0kZAtGvwjkmr7EYyfzt1EHZzbjSGVdehIAA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
|
||||
@@ -374,6 +374,71 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(table2.numRows).toBe(4);
|
||||
expect(table2.schema).toEqual(schema);
|
||||
});
|
||||
|
||||
it("should correctly retain values in nested struct fields", async function () {
|
||||
// Define test data with nested struct
|
||||
const testData = [
|
||||
{
|
||||
id: "doc1",
|
||||
vector: [1, 2, 3],
|
||||
metadata: {
|
||||
filePath: "/path/to/file1.ts",
|
||||
startLine: 10,
|
||||
endLine: 20,
|
||||
text: "function test() { return true; }",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "doc2",
|
||||
vector: [4, 5, 6],
|
||||
metadata: {
|
||||
filePath: "/path/to/file2.ts",
|
||||
startLine: 30,
|
||||
endLine: 40,
|
||||
text: "function test2() { return false; }",
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
// Create Arrow table from the data
|
||||
const table = makeArrowTable(testData);
|
||||
|
||||
// Verify schema has the nested struct fields
|
||||
const metadataField = table.schema.fields.find(
|
||||
(f) => f.name === "metadata",
|
||||
);
|
||||
expect(metadataField).toBeDefined();
|
||||
// biome-ignore lint/suspicious/noExplicitAny: accessing fields in different Arrow versions
|
||||
const childNames = metadataField?.type.children.map((c: any) => c.name);
|
||||
expect(childNames).toEqual([
|
||||
"filePath",
|
||||
"startLine",
|
||||
"endLine",
|
||||
"text",
|
||||
]);
|
||||
|
||||
// Convert to buffer and back (simulating storage and retrieval)
|
||||
const buf = await fromTableToBuffer(table);
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
|
||||
// Verify the retrieved table has the same structure
|
||||
const rows = [];
|
||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
||||
rows.push(retrievedTable.get(i));
|
||||
}
|
||||
|
||||
// Check values in the first row
|
||||
const firstRow = rows[0];
|
||||
expect(firstRow.id).toBe("doc1");
|
||||
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
|
||||
|
||||
// Verify metadata values are preserved (this is where the bug is)
|
||||
expect(firstRow.metadata).toBeDefined();
|
||||
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
|
||||
expect(firstRow.metadata.startLine).toBe(10);
|
||||
expect(firstRow.metadata.endLine).toBe(20);
|
||||
expect(firstRow.metadata.text).toBe("function test() { return true; }");
|
||||
});
|
||||
});
|
||||
|
||||
class DummyEmbedding extends EmbeddingFunction<string> {
|
||||
|
||||
@@ -338,11 +338,16 @@ describe("merge insert", () => {
|
||||
{ a: 3, b: "y" },
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
await table
|
||||
const stats = await table
|
||||
.mergeInsert("a")
|
||||
.whenMatchedUpdateAll()
|
||||
.whenNotMatchedInsertAll()
|
||||
.execute(newData);
|
||||
|
||||
expect(stats.numInsertedRows).toBe(1n);
|
||||
expect(stats.numUpdatedRows).toBe(2n);
|
||||
expect(stats.numDeletedRows).toBe(0n);
|
||||
|
||||
const expected = [
|
||||
{ a: 1, b: "a" },
|
||||
{ a: 2, b: "x" },
|
||||
|
||||
@@ -639,8 +639,9 @@ function transposeData(
|
||||
): Vector {
|
||||
if (field.type instanceof Struct) {
|
||||
const childFields = field.type.children;
|
||||
const fullPath = [...path, field.name];
|
||||
const childVectors = childFields.map((child) => {
|
||||
return transposeData(data, child, [...path, child.name]);
|
||||
return transposeData(data, child, fullPath);
|
||||
});
|
||||
const structData = makeData({
|
||||
type: field.type,
|
||||
@@ -652,7 +653,14 @@ function transposeData(
|
||||
const values = data.map((datum) => {
|
||||
let current: unknown = datum;
|
||||
for (const key of valuesPath) {
|
||||
if (isObject(current) && Object.hasOwn(current, key)) {
|
||||
if (current == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (
|
||||
isObject(current) &&
|
||||
(Object.hasOwn(current, key) || key in current)
|
||||
) {
|
||||
current = current[key];
|
||||
} else {
|
||||
return null;
|
||||
|
||||
@@ -28,6 +28,7 @@ export {
|
||||
FragmentSummaryStats,
|
||||
Tags,
|
||||
TagContents,
|
||||
MergeStats,
|
||||
} from "./native.js";
|
||||
|
||||
export {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
import { Data, Schema, fromDataToBuffer } from "./arrow";
|
||||
import { NativeMergeInsertBuilder } from "./native";
|
||||
import { MergeStats, NativeMergeInsertBuilder } from "./native";
|
||||
|
||||
/** A builder used to create and run a merge insert operation */
|
||||
export class MergeInsertBuilder {
|
||||
@@ -73,9 +73,9 @@ export class MergeInsertBuilder {
|
||||
/**
|
||||
* Executes the merge insert operation
|
||||
*
|
||||
* Nothing is returned but the `Table` is updated
|
||||
* @returns Statistics about the merge operation: counts of inserted, updated, and deleted rows
|
||||
*/
|
||||
async execute(data: Data): Promise<void> {
|
||||
async execute(data: Data): Promise<MergeStats> {
|
||||
let schema: Schema;
|
||||
if (this.#schema instanceof Promise) {
|
||||
schema = await this.#schema;
|
||||
@@ -84,6 +84,6 @@ export class MergeInsertBuilder {
|
||||
schema = this.#schema;
|
||||
}
|
||||
const buffer = await fromDataToBuffer(data, undefined, schema);
|
||||
await this.#native.execute(buffer);
|
||||
return await this.#native.execute(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.19.1-beta.0",
|
||||
"version": "0.19.1-beta.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.19.1-beta.0",
|
||||
"version": "0.19.1-beta.1",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -37,7 +37,7 @@ impl NativeMergeInsertBuilder {
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(&self, buf: Buffer) -> napi::Result<()> {
|
||||
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeStats> {
|
||||
let data = ipc_file_to_batches(buf.to_vec())
|
||||
.and_then(IntoArrow::into_arrow)
|
||||
.map_err(|e| {
|
||||
@@ -46,12 +46,14 @@ impl NativeMergeInsertBuilder {
|
||||
|
||||
let this = self.clone();
|
||||
|
||||
this.inner.execute(data).await.map_err(|e| {
|
||||
let stats = this.inner.execute(data).await.map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to execute merge insert: {}",
|
||||
convert_error(&e)
|
||||
))
|
||||
})
|
||||
})?;
|
||||
|
||||
Ok(stats.into())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,3 +62,20 @@ impl From<MergeInsertBuilder> for NativeMergeInsertBuilder {
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct MergeStats {
|
||||
pub num_inserted_rows: BigInt,
|
||||
pub num_updated_rows: BigInt,
|
||||
pub num_deleted_rows: BigInt,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::MergeStats> for MergeStats {
|
||||
fn from(stats: lancedb::table::MergeStats) -> Self {
|
||||
Self {
|
||||
num_inserted_rows: stats.num_inserted_rows.into(),
|
||||
num_updated_rows: stats.num_updated_rows.into(),
|
||||
num_deleted_rows: stats.num_deleted_rows.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ dependencies = [
|
||||
"numpy",
|
||||
"overrides>=0.7",
|
||||
"packaging",
|
||||
"pyarrow>=14",
|
||||
"pyarrow>=16",
|
||||
"pydantic>=1.10",
|
||||
"tqdm>=4.27.0",
|
||||
]
|
||||
|
||||
@@ -962,10 +962,12 @@ class Table(ABC):
|
||||
>>> table = db.create_table("my_table", data)
|
||||
>>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
|
||||
>>> # Perform a "upsert" operation
|
||||
>>> table.merge_insert("a") \\
|
||||
>>> stats = table.merge_insert("a") \\
|
||||
... .when_matched_update_all() \\
|
||||
... .when_not_matched_insert_all() \\
|
||||
... .execute(new_data)
|
||||
>>> stats
|
||||
{'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
|
||||
>>> # The order of new rows is non-deterministic since we use
|
||||
>>> # a hash-join as part of this operation and so we sort here
|
||||
>>> table.to_arrow().sort_by("a").to_pandas()
|
||||
@@ -2489,7 +2491,9 @@ class LanceTable(Table):
|
||||
on_bad_vectors: OnBadVectorsType,
|
||||
fill_value: float,
|
||||
):
|
||||
LOOP.run(self._table._do_merge(merge, new_data, on_bad_vectors, fill_value))
|
||||
return LOOP.run(
|
||||
self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)
|
||||
)
|
||||
|
||||
@deprecation.deprecated(
|
||||
deprecated_in="0.21.0",
|
||||
@@ -3277,10 +3281,12 @@ class AsyncTable:
|
||||
>>> table = db.create_table("my_table", data)
|
||||
>>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
|
||||
>>> # Perform a "upsert" operation
|
||||
>>> table.merge_insert("a") \\
|
||||
>>> stats = table.merge_insert("a") \\
|
||||
... .when_matched_update_all() \\
|
||||
... .when_not_matched_insert_all() \\
|
||||
... .execute(new_data)
|
||||
>>> stats
|
||||
{'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
|
||||
>>> # The order of new rows is non-deterministic since we use
|
||||
>>> # a hash-join as part of this operation and so we sort here
|
||||
>>> table.to_arrow().sort_by("a").to_pandas()
|
||||
@@ -3636,7 +3642,7 @@ class AsyncTable:
|
||||
)
|
||||
if isinstance(data, pa.Table):
|
||||
data = pa.RecordBatchReader.from_batches(data.schema, data.to_batches())
|
||||
await self._inner.execute_merge_insert(
|
||||
return await self._inner.execute_merge_insert(
|
||||
data,
|
||||
dict(
|
||||
on=merge._on,
|
||||
|
||||
@@ -18,15 +18,19 @@ def test_upsert(mem_db):
|
||||
{"id": 1, "name": "Bobby"},
|
||||
{"id": 2, "name": "Charlie"},
|
||||
]
|
||||
(
|
||||
stats = (
|
||||
table.merge_insert("id")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.execute(new_users)
|
||||
)
|
||||
table.count_rows() # 3
|
||||
stats # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}
|
||||
# --8<-- [end:upsert_basic]
|
||||
assert table.count_rows() == 3
|
||||
assert stats["num_inserted_rows"] == 1
|
||||
assert stats["num_updated_rows"] == 1
|
||||
assert stats["num_deleted_rows"] == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -44,15 +48,19 @@ async def test_upsert_async(mem_db_async):
|
||||
{"id": 1, "name": "Bobby"},
|
||||
{"id": 2, "name": "Charlie"},
|
||||
]
|
||||
await (
|
||||
stats = await (
|
||||
table.merge_insert("id")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.execute(new_users)
|
||||
)
|
||||
await table.count_rows() # 3
|
||||
stats # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}
|
||||
# --8<-- [end:upsert_basic_async]
|
||||
assert await table.count_rows() == 3
|
||||
assert stats["num_inserted_rows"] == 1
|
||||
assert stats["num_updated_rows"] == 1
|
||||
assert stats["num_deleted_rows"] == 0
|
||||
|
||||
|
||||
def test_insert_if_not_exists(mem_db):
|
||||
@@ -69,10 +77,16 @@ def test_insert_if_not_exists(mem_db):
|
||||
{"domain": "google.com", "name": "Google"},
|
||||
{"domain": "facebook.com", "name": "Facebook"},
|
||||
]
|
||||
(table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains))
|
||||
stats = (
|
||||
table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
|
||||
)
|
||||
table.count_rows() # 3
|
||||
stats # {'num_inserted_rows': 1, 'num_updated_rows': 0, 'num_deleted_rows': 0}
|
||||
# --8<-- [end:insert_if_not_exists]
|
||||
assert table.count_rows() == 3
|
||||
assert stats["num_inserted_rows"] == 1
|
||||
assert stats["num_updated_rows"] == 0
|
||||
assert stats["num_deleted_rows"] == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -90,12 +104,16 @@ async def test_insert_if_not_exists_async(mem_db_async):
|
||||
{"domain": "google.com", "name": "Google"},
|
||||
{"domain": "facebook.com", "name": "Facebook"},
|
||||
]
|
||||
await (
|
||||
stats = await (
|
||||
table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
|
||||
)
|
||||
await table.count_rows() # 3
|
||||
stats # {'num_inserted_rows': 1, 'num_updated_rows': 0, 'num_deleted_rows': 0}
|
||||
# --8<-- [end:insert_if_not_exists_async]
|
||||
assert await table.count_rows() == 3
|
||||
assert stats["num_inserted_rows"] == 1
|
||||
assert stats["num_updated_rows"] == 0
|
||||
assert stats["num_deleted_rows"] == 0
|
||||
|
||||
|
||||
def test_replace_range(mem_db):
|
||||
@@ -113,7 +131,7 @@ def test_replace_range(mem_db):
|
||||
new_chunks = [
|
||||
{"doc_id": 1, "chunk_id": 0, "text": "Baz"},
|
||||
]
|
||||
(
|
||||
stats = (
|
||||
table.merge_insert(["doc_id", "chunk_id"])
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
@@ -121,8 +139,12 @@ def test_replace_range(mem_db):
|
||||
.execute(new_chunks)
|
||||
)
|
||||
table.count_rows("doc_id = 1") # 1
|
||||
stats # {'num_inserted_rows': 0, 'num_updated_rows': 1, 'num_deleted_rows': 1}
|
||||
# --8<-- [end:replace_range]
|
||||
assert table.count_rows("doc_id = 1") == 1
|
||||
assert stats["num_inserted_rows"] == 0
|
||||
assert stats["num_updated_rows"] == 1
|
||||
assert stats["num_deleted_rows"] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -141,7 +163,7 @@ async def test_replace_range_async(mem_db_async):
|
||||
new_chunks = [
|
||||
{"doc_id": 1, "chunk_id": 0, "text": "Baz"},
|
||||
]
|
||||
await (
|
||||
stats = await (
|
||||
table.merge_insert(["doc_id", "chunk_id"])
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
@@ -149,5 +171,9 @@ async def test_replace_range_async(mem_db_async):
|
||||
.execute(new_chunks)
|
||||
)
|
||||
await table.count_rows("doc_id = 1") # 1
|
||||
stats # {'num_inserted_rows': 0, 'num_updated_rows': 1, 'num_deleted_rows': 1}
|
||||
# --8<-- [end:replace_range_async]
|
||||
assert await table.count_rows("doc_id = 1") == 1
|
||||
assert stats["num_inserted_rows"] == 0
|
||||
assert stats["num_updated_rows"] == 1
|
||||
assert stats["num_deleted_rows"] == 1
|
||||
|
||||
@@ -489,8 +489,14 @@ impl Table {
|
||||
}
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
builder.execute(Box::new(batches)).await.infer_error()?;
|
||||
Ok(())
|
||||
let stats = builder.execute(Box::new(batches)).await.infer_error()?;
|
||||
Python::with_gil(|py| {
|
||||
let dict = PyDict::new(py);
|
||||
dict.set_item("num_inserted_rows", stats.num_inserted_rows)?;
|
||||
dict.set_item("num_updated_rows", stats.num_updated_rows)?;
|
||||
dict.set_item("num_deleted_rows", stats.num_deleted_rows)?;
|
||||
Ok(dict.unbind())
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ use crate::{
|
||||
TableDefinition, UpdateBuilder,
|
||||
},
|
||||
};
|
||||
use lance::dataset::MergeStats;
|
||||
|
||||
const REQUEST_TIMEOUT_HEADER: HeaderName = HeaderName::from_static("x-request-timeout-ms");
|
||||
|
||||
@@ -1022,7 +1023,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
&self,
|
||||
params: MergeInsertBuilder,
|
||||
new_data: Box<dyn RecordBatchReader + Send>,
|
||||
) -> Result<()> {
|
||||
) -> Result<MergeStats> {
|
||||
self.check_mutable().await?;
|
||||
|
||||
let query = MergeInsertRequest::try_from(params)?;
|
||||
@@ -1034,9 +1035,11 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
|
||||
let (request_id, response) = self.send_streaming(request, new_data, true).await?;
|
||||
|
||||
// TODO: server can response with these stats in response body.
|
||||
// We should test that we can handle both empty response from old server
|
||||
// and response with stats from new server.
|
||||
self.check_table_response(&request_id, response).await?;
|
||||
|
||||
Ok(())
|
||||
Ok(MergeStats::default())
|
||||
}
|
||||
|
||||
async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
|
||||
@@ -1348,7 +1351,12 @@ mod tests {
|
||||
Box::pin(table.count_rows(None).map_ok(|_| ())),
|
||||
Box::pin(table.update().column("a", "a + 1").execute().map_ok(|_| ())),
|
||||
Box::pin(table.add(example_data()).execute().map_ok(|_| ())),
|
||||
Box::pin(table.merge_insert(&["test"]).execute(example_data())),
|
||||
Box::pin(
|
||||
table
|
||||
.merge_insert(&["test"])
|
||||
.execute(example_data())
|
||||
.map_ok(|_| ()),
|
||||
),
|
||||
Box::pin(table.delete("false")),
|
||||
Box::pin(table.add_columns(
|
||||
NewColumnTransform::SqlExpressions(vec![("x".into(), "y".into())]),
|
||||
|
||||
@@ -20,6 +20,7 @@ use lance::dataset::cleanup::RemovalStats;
|
||||
use lance::dataset::optimize::{compact_files, CompactionMetrics, IndexRemapperOptions};
|
||||
use lance::dataset::scanner::Scanner;
|
||||
pub use lance::dataset::ColumnAlteration;
|
||||
pub use lance::dataset::MergeStats;
|
||||
pub use lance::dataset::NewColumnTransform;
|
||||
pub use lance::dataset::ReadParams;
|
||||
pub use lance::dataset::Version;
|
||||
@@ -487,7 +488,7 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
|
||||
&self,
|
||||
params: MergeInsertBuilder,
|
||||
new_data: Box<dyn RecordBatchReader + Send>,
|
||||
) -> Result<()>;
|
||||
) -> Result<MergeStats>;
|
||||
/// Gets the table tag manager.
|
||||
async fn tags(&self) -> Result<Box<dyn Tags + '_>>;
|
||||
/// Optimize the dataset.
|
||||
@@ -2367,7 +2368,7 @@ impl BaseTable for NativeTable {
|
||||
&self,
|
||||
params: MergeInsertBuilder,
|
||||
new_data: Box<dyn RecordBatchReader + Send>,
|
||||
) -> Result<()> {
|
||||
) -> Result<MergeStats> {
|
||||
let dataset = Arc::new(self.dataset.get().await?.clone());
|
||||
let mut builder = LanceMergeInsertBuilder::try_new(dataset.clone(), params.on)?;
|
||||
match (
|
||||
@@ -2394,9 +2395,9 @@ impl BaseTable for NativeTable {
|
||||
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
|
||||
}
|
||||
let job = builder.try_build()?;
|
||||
let (new_dataset, _stats) = job.execute_reader(new_data).await?;
|
||||
let (new_dataset, stats) = job.execute_reader(new_data).await?;
|
||||
self.dataset.set_latest(new_dataset.as_ref().clone()).await;
|
||||
Ok(())
|
||||
Ok(stats)
|
||||
}
|
||||
|
||||
/// Delete rows from the table
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_array::RecordBatchReader;
|
||||
use lance::dataset::MergeStats;
|
||||
|
||||
use crate::Result;
|
||||
|
||||
@@ -86,8 +87,9 @@ impl MergeInsertBuilder {
|
||||
|
||||
/// Executes the merge insert operation
|
||||
///
|
||||
/// Nothing is returned but the [`super::Table`] is updated
|
||||
pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<()> {
|
||||
/// Returns statistics about the merge operation including the number of rows
|
||||
/// inserted, updated, and deleted.
|
||||
pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<MergeStats> {
|
||||
self.table.clone().merge_insert(self, new_data).await
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user