Compare commits

...

7 Commits

Author SHA1 Message Date
qzhu
c3be2e3962 small fix for the guides/table page 2024-02-01 14:41:00 -08:00
Weston Pace
d77e95a4f4 feat: upgrade to lance 0.9.11 and expose merge_insert (#906)
This adds the python bindings requested in #870 The javascript/rust
bindings will be added in a future PR.
2024-02-01 11:36:29 -08:00
Lei Xu
62f053ac92 ci: bump to new version of python action to use node 20 gIthub action runtime (#909)
Github action is deprecating old node-16 runtime.
2024-02-01 11:36:03 -08:00
JacobLinCool
34e10caad2 fix the repo link on npm, add links for homepage and bug report (#910)
- fix the repo link on npm
- add links for homepage and bug report
2024-01-31 21:07:11 -08:00
QianZhu
f5726e2d0c arrow table/f16 example (#907) 2024-01-31 14:41:28 -08:00
Lance Release
12b4fb42fc Updating package-lock.json 2024-01-31 21:18:24 +00:00
Lance Release
1328cd46f1 Updating package-lock.json 2024-01-31 20:29:38 +00:00
17 changed files with 427 additions and 144 deletions

View File

@@ -29,7 +29,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: "pip"

View File

@@ -29,7 +29,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.11
cache: "pip"

View File

@@ -37,10 +37,10 @@ jobs:
run: |
git config user.name 'Lance Release'
git config user.email 'lance-dev@lancedb.com'
- name: Set up Python 3.10
uses: actions/setup-python@v4
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Bump version, create tag and commit
run: |
pip install bump2version

View File

@@ -16,7 +16,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Build distribution

View File

@@ -37,10 +37,10 @@ jobs:
run: |
git config user.name 'Lance Release'
git config user.email 'lance-dev@lancedb.com'
- name: Set up Python 3.10
uses: actions/setup-python@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Bump version, create tag and commit
working-directory: python
run: |

View File

@@ -30,7 +30,7 @@ jobs:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.${{ matrix.python-minor-version }}
- name: Install lancedb
@@ -69,7 +69,7 @@ jobs:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install lancedb
@@ -92,7 +92,7 @@ jobs:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Install lancedb

View File

@@ -1,6 +1,6 @@
// --8<-- [start:import]
import * as lancedb from "vectordb";
import { Schema, Field, Float32, FixedSizeList, Int32 } from "apache-arrow";
import { Schema, Field, Float32, FixedSizeList, Int32, Float16 } from "apache-arrow";
// --8<-- [end:import]
import * as fs from "fs";
import { Table as ArrowTable, Utf8 } from "apache-arrow";
@@ -8,6 +8,7 @@ import { Table as ArrowTable, Utf8 } from "apache-arrow";
const example = async () => {
fs.rmSync("data/sample-lancedb", { recursive: true, force: true });
// --8<-- [start:open_db]
const lancedb = require("vectordb");
const uri = "data/sample-lancedb";
const db = await lancedb.connect(uri);
// --8<-- [end:open_db]
@@ -48,6 +49,27 @@ const example = async () => {
const empty_tbl = await db.createTable({ name: "empty_table", schema });
// --8<-- [end:create_empty_table]
// --8<-- [start:create_f16_table]
const dim = 16
const total = 10
const f16_schema = new Schema([
new Field('id', new Int32()),
new Field(
'vector',
new FixedSizeList(dim, new Field('item', new Float16(), true)),
false
)
])
const data = lancedb.makeArrowTable(
Array.from(Array(total), (_, i) => ({
id: i,
vector: Array.from(Array(dim), Math.random)
})),
{ f16_schema }
)
const table = await db.createTable('f16_tbl', data)
// --8<-- [end:create_f16_table]
// --8<-- [start:search]
const query = await tbl.search([100, 100]).limit(2).execute();
// --8<-- [end:search]

View File

@@ -16,9 +16,22 @@ This guide will show how to create tables, insert data into them, and update the
db = lancedb.connect("./.lancedb")
```
=== "Javascript"
Initialize a VectorDB connection and create a table using one of the many methods listed below.
```javascript
const lancedb = require("vectordb");
const uri = "data/sample-lancedb";
const db = await lancedb.connect(uri);
```
LanceDB allows ingesting data from various sources - `dict`, `list[dict]`, `pd.DataFrame`, `pa.Table` or a `Iterator[pa.RecordBatch]`. Let's take a look at some of the these.
### From list of tuples or dictionaries
### From list of tuples or dictionaries
=== "Python"
```python
import lancedb
@@ -32,7 +45,6 @@ This guide will show how to create tables, insert data into them, and update the
db["my_table"].head()
```
!!! info "Note"
If the table already exists, LanceDB will raise an error by default.
@@ -51,6 +63,27 @@ This guide will show how to create tables, insert data into them, and update the
db.create_table("name", data, mode="overwrite")
```
=== "Javascript"
You can create a LanceDB table in JavaScript using an array of JSON records as follows.
```javascript
const tb = await db.createTable("my_table", [{
"vector": [3.1, 4.1],
"item": "foo",
"price": 10.0
}, {
"vector": [5.9, 26.5],
"item": "bar",
"price": 20.0
}]);
```
!!! info "Note"
If the table already exists, LanceDB will raise an error by default. If you want to overwrite the table, you need to specify the `WriteMode` in the createTable function.
```javascript
const table = await con.createTable(tableName, data, { writeMode: WriteMode.Overwrite })
```
### From a Pandas DataFrame
```python
@@ -79,7 +112,7 @@ This guide will show how to create tables, insert data into them, and update the
table = db.create_table("my_table", data, schema=custom_schema)
```
### From a Polars DataFrame
### From a Polars DataFrame
LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
@@ -97,26 +130,44 @@ This guide will show how to create tables, insert data into them, and update the
table = db.create_table("pl_table", data=data)
```
### From PyArrow Tables
You can also create LanceDB tables directly from PyArrow tables
### From an Arrow Table
=== "Python"
You can also create LanceDB tables directly from Arrow tables.
LanceDB supports float16 data type!
```python
table = pa.Table.from_arrays(
[
pa.array([[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]],
pa.list_(pa.float32(), 4)),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"],
)
import pyarrows as pa
import numpy as np
dim = 16
total = 2
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float16(), dim)),
pa.field("text", pa.string())
]
)
data = pa.Table.from_arrays(
[
pa.array([np.random.randn(dim).astype(np.float16) for _ in range(total)],
pa.list_(pa.float16(), dim)),
pa.array(["foo", "bar"])
],
["vector", "text"],
)
tbl = db.create_table("f16_tbl", data, schema=schema)
```
db = lancedb.connect("db")
=== "Javascript"
You can also create LanceDB tables directly from Arrow tables.
LanceDB supports Float16 data type!
tbl = db.create_table("my_table", table)
```javascript
--8<-- "docs/src/basic_legacy.ts:create_f16_table"
```
### From Pydantic Models
When you create an empty table without data, you must specify the table schema.
LanceDB supports creating tables by specifying a PyArrow schema or a specialized
Pydantic model called `LanceModel`.
@@ -261,37 +312,6 @@ This guide will show how to create tables, insert data into them, and update the
You can also use iterators of other types like Pandas DataFrame or Pylists directly in the above example.
=== "JavaScript"
Initialize a VectorDB connection and create a table using one of the many methods listed below.
```javascript
const lancedb = require("vectordb");
const uri = "data/sample-lancedb";
const db = await lancedb.connect(uri);
```
You can create a LanceDB table in JavaScript using an array of JSON records as follows.
```javascript
const tb = await db.createTable("my_table", [{
"vector": [3.1, 4.1],
"item": "foo",
"price": 10.0
}, {
"vector": [5.9, 26.5],
"item": "bar",
"price": 20.0
}]);
```
!!! info "Note"
If the table already exists, LanceDB will raise an error by default. If you want to overwrite the table, you need to specify the `WriteMode` in the createTable function.
```javascript
const table = await con.createTable(tableName, data, { writeMode: WriteMode.Overwrite })
```
## Open existing tables
=== "Python"

View File

@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 2,
"id": "c1b4e34b-a49c-471d-a343-a5940bb5138a",
"metadata": {},
"outputs": [],
@@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "4e5a8d07-d9a1-48c1-913a-8e0629289579",
"metadata": {},
"outputs": [],
@@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "5df12f66-8d99-43ad-8d0b-22189ec0a6b9",
"metadata": {},
"outputs": [
@@ -62,7 +62,7 @@
"long: [[-122.7,-74.1]]"
]
},
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -90,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "f4d87ae9-0ccb-48eb-b31d-bb8f2370e47e",
"metadata": {},
"outputs": [
@@ -108,7 +108,7 @@
"long: [[-122.7,-74.1]]"
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -135,10 +135,17 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"id": "25f34bcf-fca0-4431-8601-eac95d1bd347",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2024-01-31T18:59:33Z WARN lance::dataset] No existing dataset at /Users/qian/Work/LanceDB/lancedb/docs/src/notebooks/.lancedb/table3.lance, it will be created\n"
]
},
{
"data": {
"text/plain": [
@@ -148,7 +155,7 @@
"long: float"
]
},
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -171,45 +178,51 @@
"id": "4df51925-7ca2-4005-9c72-38b3d26240c6",
"metadata": {},
"source": [
"### From PyArrow Tables\n",
"### From an Arrow Table\n",
"\n",
"You can also create LanceDB tables directly from pyarrow tables"
"You can also create LanceDB tables directly from pyarrow tables. LanceDB supports float16 type."
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"id": "90a880f6-be43-4c9d-ba65-0b05197c0f6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"vector: fixed_size_list<item: float>[2]\n",
" child 0, item: float\n",
"item: string\n",
"price: double"
"vector: fixed_size_list<item: halffloat>[16]\n",
" child 0, item: halffloat\n",
"text: string"
]
},
"execution_count": 12,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table = pa.Table.from_arrays(\n",
" [\n",
" pa.array([[3.1, 4.1], [5.9, 26.5]],\n",
" pa.list_(pa.float32(), 2)),\n",
" pa.array([\"foo\", \"bar\"]),\n",
" pa.array([10.0, 20.0]),\n",
" ],\n",
" [\"vector\", \"item\", \"price\"],\n",
" )\n",
"import numpy as np\n",
"\n",
"db = lancedb.connect(\"db\")\n",
"dim = 16\n",
"total = 2\n",
"schema = pa.schema(\n",
" [\n",
" pa.field(\"vector\", pa.list_(pa.float16(), dim)),\n",
" pa.field(\"text\", pa.string())\n",
" ]\n",
")\n",
"data = pa.Table.from_arrays(\n",
" [\n",
" pa.array([np.random.randn(dim).astype(np.float16) for _ in range(total)],\n",
" pa.list_(pa.float16(), dim)),\n",
" pa.array([\"foo\", \"bar\"])\n",
" ],\n",
" [\"vector\", \"text\"],\n",
")\n",
"\n",
"tbl = db.create_table(\"test1\", table, mode=\"overwrite\")\n",
"tbl = db.create_table(\"f16_tbl\", data, schema=schema)\n",
"tbl.schema"
]
},
@@ -225,7 +238,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 8,
"id": "d81121d7-e4b7-447c-a48c-974b6ebb464a",
"metadata": {},
"outputs": [
@@ -240,7 +253,7 @@
"imdb_id: int64 not null"
]
},
"execution_count": 13,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -282,7 +295,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 9,
"id": "bc247142-4e3c-41a2-b94c-8e00d2c2a508",
"metadata": {},
"outputs": [
@@ -292,7 +305,7 @@
"LanceTable(table4)"
]
},
"execution_count": 14,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -333,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 10,
"id": "25ad3523-e0c9-4c28-b3df-38189c4e0e5f",
"metadata": {},
"outputs": [
@@ -346,7 +359,7 @@
"price: double not null"
]
},
"execution_count": 16,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -385,7 +398,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 11,
"id": "2814173a-eacc-4dd8-a64d-6312b44582cc",
"metadata": {},
"outputs": [],
@@ -411,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 12,
"id": "df9e13c0-41f6-437f-9dfa-2fd71d3d9c45",
"metadata": {},
"outputs": [
@@ -421,7 +434,7 @@
"['table6', 'table4', 'table5', 'movielens_small']"
]
},
"execution_count": 18,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -432,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 13,
"id": "9343f5ad-6024-42ee-ac2f-6c1471df8679",
"metadata": {},
"outputs": [
@@ -541,7 +554,7 @@
"9 [5.9, 26.5] bar 20.0"
]
},
"execution_count": 20,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -564,7 +577,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 14,
"id": "8a56250f-73a1-4c26-a6ad-5c7a0ce3a9ab",
"metadata": {},
"outputs": [],
@@ -590,7 +603,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 15,
"id": "030c7057-b98e-4e2f-be14-b8c1f927f83c",
"metadata": {},
"outputs": [],
@@ -621,7 +634,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 16,
"id": "e7a17de2-08d2-41b7-bd05-f63d1045ab1f",
"metadata": {},
"outputs": [
@@ -629,16 +642,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"32\n"
"22\n"
]
},
{
"data": {
"text/plain": [
"17"
"12"
]
},
"execution_count": 24,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -661,7 +674,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 17,
"id": "fe3310bd-08f4-4a22-a63b-b3127d22f9f7",
"metadata": {},
"outputs": [
@@ -681,25 +694,20 @@
"8 [3.1, 4.1] foo 10.0\n",
"9 [3.1, 4.1] foo 10.0\n",
"10 [3.1, 4.1] foo 10.0\n",
"11 [3.1, 4.1] foo 10.0\n",
"12 [3.1, 4.1] foo 10.0\n",
"13 [3.1, 4.1] foo 10.0\n",
"14 [3.1, 4.1] foo 10.0\n",
"15 [3.1, 4.1] foo 10.0\n",
"16 [3.1, 4.1] foo 10.0\n"
"11 [3.1, 4.1] foo 10.0\n"
]
},
{
"ename": "OSError",
"evalue": "LanceError(IO): Error during planning: column foo does not exist",
"evalue": "LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[30], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m to_remove \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m to_remove)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(tbl\u001b[38;5;241m.\u001b[39mto_pandas())\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtbl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mitem IN (\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mto_remove\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m tbl\u001b[38;5;241m.\u001b[39mto_pandas()\n",
"File \u001b[0;32m~/Documents/lancedb/lancedb/python/lancedb/table.py:610\u001b[0m, in \u001b[0;36mLanceTable.delete\u001b[0;34m(self, where)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdelete\u001b[39m(\u001b[38;5;28mself\u001b[39m, where: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 610\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Documents/lancedb/lancedb/env/lib/python3.11/site-packages/lance/dataset.py:489\u001b[0m, in \u001b[0;36mLanceDataset.delete\u001b[0;34m(self, predicate)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(predicate, pa\u001b[38;5;241m.\u001b[39mcompute\u001b[38;5;241m.\u001b[39mExpression):\n\u001b[1;32m 488\u001b[0m predicate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(predicate)\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mOSError\u001b[0m: LanceError(IO): Error during planning: column foo does not exist"
"Cell \u001b[0;32mIn[17], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m to_remove \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m to_remove)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(tbl\u001b[38;5;241m.\u001b[39mto_pandas())\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtbl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mitem IN (\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mto_remove\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lancedb/table.py:872\u001b[0m, in \u001b[0;36mLanceTable.delete\u001b[0;34m(self, where)\u001b[0m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdelete\u001b[39m(\u001b[38;5;28mself\u001b[39m, where: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 872\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lance/dataset.py:596\u001b[0m, in \u001b[0;36mLanceDataset.delete\u001b[0;34m(self, predicate)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(predicate, pa\u001b[38;5;241m.\u001b[39mcompute\u001b[38;5;241m.\u001b[39mExpression):\n\u001b[1;32m 595\u001b[0m predicate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(predicate)\n\u001b[0;32m--> 596\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mOSError\u001b[0m: LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23"
]
}
],
@@ -712,7 +720,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": null,
"id": "87d5bc21-847f-4c81-b56e-f6dbe5d05aac",
"metadata": {},
"outputs": [],
@@ -729,7 +737,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": null,
"id": "9cba4519-eb3a-4941-ab7e-873d762e750f",
"metadata": {},
"outputs": [],
@@ -742,7 +750,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": null,
"id": "5bdc9801-d5ed-4871-92d0-88b27108e788",
"metadata": {},
"outputs": [
@@ -817,7 +825,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.11.7"
}
},
"nbformat": 4,

View File

@@ -58,6 +58,8 @@ pip install lancedb
::: lancedb.schema.vector
::: lancedb.merge.LanceMergeInsertBuilder
## Integrations
### Pydantic

44
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.4.6",
"version": "0.4.7",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.4.6",
"version": "0.4.7",
"cpu": [
"x64",
"arm64"
@@ -53,11 +53,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.6",
"@lancedb/vectordb-darwin-x64": "0.4.6",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.6",
"@lancedb/vectordb-linux-x64-gnu": "0.4.6",
"@lancedb/vectordb-win32-x64-msvc": "0.4.6"
"@lancedb/vectordb-darwin-arm64": "0.4.7",
"@lancedb/vectordb-darwin-x64": "0.4.7",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.7",
"@lancedb/vectordb-linux-x64-gnu": "0.4.7",
"@lancedb/vectordb-win32-x64-msvc": "0.4.7"
}
},
"node_modules/@75lb/deep-merge": {
@@ -329,9 +329,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.4.6",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.6.tgz",
"integrity": "sha512-p6w/BXBxgFHR87phxvfBPPbvz4wDGmG2guRSQPEriwrc8h/gQ3wuexHhyzi7SWcV2E25vyUO9QcFL3vYKhIJRg==",
"version": "0.4.7",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.7.tgz",
"integrity": "sha512-kACOIytgjBfX8NRwjPKe311XRN3lbSN13B7avT5htMd3kYm3AnnMag9tZhlwoO7lIuvGaXhy7mApygJrjhfJ4g==",
"cpu": [
"arm64"
],
@@ -341,9 +341,9 @@
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.4.6",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.6.tgz",
"integrity": "sha512-7Fmg63Ky783ROpaQEL6I1uTrO//YDi4MgG0pjWAkDKsdHQ8QisFF8kd+JvjPh4PhMScC/rtB0SXpY/Y4zZvLfw==",
"version": "0.4.7",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.7.tgz",
"integrity": "sha512-vb74iK5uPWCwz5E60r3yWp/R/HSg54/Z9AZWYckYXqsPv4w/nfbkM5iZhfRqqR/9uE6JClWJKOtjbk7b8CFRFg==",
"cpu": [
"x64"
],
@@ -353,9 +353,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.4.6",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.6.tgz",
"integrity": "sha512-2wM+BKnjtZyKhiQPvldpfORH2JdKy6AuLFJ7AQtuyly57mkvgZRJeqK0DsRi/hyyZPRUOvWaDp/LfAxZvhLWgA==",
"version": "0.4.7",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.7.tgz",
"integrity": "sha512-jHp7THm6S9sB8RaCxGoZXLAwGAUHnawUUilB1K3mvQsRdfB2bBs0f7wDehW+PDhr+Iog4LshaWbcnoQEUJWR+Q==",
"cpu": [
"arm64"
],
@@ -365,9 +365,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.4.6",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.6.tgz",
"integrity": "sha512-1BK9i3DnnFHyBVLxOfsIW2i800o9exDEHm5onikvfoa5Ot5tXwIwAw86+0HGsBm5YbJnKKxZmbAM6Pr9qfMKiQ==",
"version": "0.4.7",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.7.tgz",
"integrity": "sha512-LKbVe6Wrp/AGqCCjKliNDmYoeTNgY/wfb2DTLjrx41Jko/04ywLrJ6xSEAn3XD5RDCO5u3fyUdXHHHv5a3VAAQ==",
"cpu": [
"x64"
],
@@ -377,9 +377,9 @@
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.4.6",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.6.tgz",
"integrity": "sha512-Fh/fw+HRf/LDZKCDQTvpWoacFfmLXGwQpcqxxlwIZ0vy45eCNYvnZrpjQBjej0uh3tEVC6OHh6Jhn7Pr9k8r2w==",
"version": "0.4.7",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.7.tgz",
"integrity": "sha512-C5ln4+wafeY1Sm4PeV0Ios9lUaQVVip5Mjl9XU7ngioSEMEuXI/XMVfIdVfDPppVNXPeQxg33wLA272uw88D1Q==",
"cpu": [
"x64"
],

View File

@@ -17,7 +17,11 @@
},
"repository": {
"type": "git",
"url": "https://github.com/lancedb/lancedb/node"
"url": "https://github.com/lancedb/lancedb.git"
},
"homepage": "https://lancedb.github.io/lancedb/",
"bugs": {
"url": "https://github.com/lancedb/lancedb/issues"
},
"keywords": [
"data-format",

86
python/lancedb/merge.py Normal file
View File

@@ -0,0 +1,86 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import TYPE_CHECKING, Iterable, Optional
if TYPE_CHECKING:
from .common import DATA
class LanceMergeInsertBuilder(object):
"""Builder for a LanceDB merge insert operation
See [`merge_insert`][lancedb.table.Table.merge_insert] for
more context
"""
def __init__(self, table: "Table", on: Iterable[str]): # noqa: F821
# Do not put a docstring here. This method should be hidden
# from API docs. Users should use merge_insert to create
# this object.
self._table = table
self._on = on
self._when_matched_update_all = False
self._when_not_matched_insert_all = False
self._when_not_matched_by_source_delete = False
self._when_not_matched_by_source_condition = None
def when_matched_update_all(self) -> LanceMergeInsertBuilder:
"""
Rows that exist in both the source table (new data) and
the target table (old data) will be updated, replacing
the old row with the corresponding matching row.
If there are multiple matches then the behavior is undefined.
Currently this causes multiple copies of the row to be created
but that behavior is subject to change.
"""
self._when_matched_update_all = True
return self
def when_not_matched_insert_all(self) -> LanceMergeInsertBuilder:
"""
Rows that exist only in the source table (new data) should
be inserted into the target table.
"""
self._when_not_matched_insert_all = True
return self
def when_not_matched_by_source_delete(
self, condition: Optional[str] = None
) -> LanceMergeInsertBuilder:
"""
Rows that exist only in the target table (old data) will be
deleted. An optional condition can be provided to limit what
data is deleted.
Parameters
----------
condition: Optional[str], default None
If None then all such rows will be deleted. Otherwise the
condition will be used as an SQL filter to limit what rows
are deleted.
"""
self._when_not_matched_by_source_delete = True
if condition is not None:
self._when_not_matched_by_source_condition = condition
return self
def execute(self, new_data: DATA):
"""
Executes the merge insert operation
Nothing is returned but the [`Table`][lancedb.table.Table] is updated
"""
self._table._do_merge(self, new_data)

View File

@@ -244,6 +244,10 @@ class RemoteTable(Table):
result = self._conn._client.query(self._name, query)
return result.to_arrow()
def _do_merge(self, *_args):
"""_do_merge() is not supported on the LanceDB cloud yet"""
return NotImplementedError("_do_merge() is not supported on the LanceDB cloud")
def delete(self, predicate: str):
"""Delete rows from the table.

View File

@@ -28,6 +28,7 @@ from lance.vector import vec_to_table
from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
from .merge import LanceMergeInsertBuilder
from .pydantic import LanceModel, model_to_dict
from .query import LanceQueryBuilder, Query
from .util import (
@@ -334,6 +335,64 @@ class Table(ABC):
"""
raise NotImplementedError
def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
"""
Returns a [`LanceMergeInsertBuilder`][lancedb.merge.LanceMergeInsertBuilder]
that can be used to create a "merge insert" operation
This operation can add rows, update rows, and remove rows all in a single
transaction. It is a very generic tool that can be used to create
behaviors like "insert if not exists", "update or insert (i.e. upsert)",
or even replace a portion of existing data with new data (e.g. replace
all data where month="january")
The merge insert operation works by combining new data from a
**source table** with existing data in a **target table** by using a
join. There are three categories of records.
"Matched" records are records that exist in both the source table and
the target table. "Not matched" records exist only in the source table
(e.g. these are new data) "Not matched by source" records exist only
in the target table (this is old data)
The builder returned by this method can be used to customize what
should happen for each category of data.
Please note that the data may appear to be reordered as part of this
operation. This is because updated rows will be deleted from the
dataset and then reinserted at the end with the new values.
Parameters
----------
on: Union[str, Iterable[str]]
A column (or columns) to join on. This is how records from the
source table and target table are matched. Typically this is some
kind of key or id column.
Examples
--------
>>> import lancedb
>>> data = pa.table({"a": [2, 1, 3], "b": ["a", "b", "c"]})
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", data)
>>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
>>> # Perform a "upsert" operation
>>> table.merge_insert("a") \\
... .when_matched_update_all() \\
... .when_not_matched_insert_all() \\
... .execute(new_data)
>>> # The order of new rows is non-deterministic since we use
>>> # a hash-join as part of this operation and so we sort here
>>> table.to_arrow().sort_by("a").to_pandas()
a b
0 1 b
1 2 x
2 3 y
3 4 z
"""
return LanceMergeInsertBuilder(self, on)
@abstractmethod
def search(
self,
@@ -414,6 +473,16 @@ class Table(ABC):
def _execute_query(self, query: Query) -> pa.Table:
pass
@abstractmethod
def _do_merge(
self,
merge: LanceMergeInsertBuilder,
new_data: DATA,
*,
schema: Optional[pa.Schema] = None,
):
pass
@abstractmethod
def delete(self, where: str):
"""Delete rows from the table.
@@ -1196,6 +1265,18 @@ class LanceTable(Table):
with_row_id=query.with_row_id,
)
def _do_merge(self, merge: LanceMergeInsertBuilder, new_data: DATA, *, schema=None):
ds = self.to_lance()
builder = ds.merge_insert(merge._on)
if merge._when_matched_update_all:
builder.when_matched_update_all()
if merge._when_not_matched_insert_all:
builder.when_not_matched_insert_all()
if merge._when_not_matched_by_source_delete:
cond = merge._when_not_matched_by_source_condition
builder.when_not_matched_by_source_delete(cond)
builder.execute(new_data, schema=schema)
def cleanup_old_versions(
self,
older_than: Optional[timedelta] = None,

View File

@@ -3,7 +3,7 @@ name = "lancedb"
version = "0.5.1"
dependencies = [
"deprecation",
"pylance==0.9.10",
"pylance==0.9.11",
"ratelimiter~=1.0",
"retry>=0.9.2",
"tqdm>=4.27.0",

View File

@@ -493,6 +493,62 @@ def test_update_types(db):
assert actual == expected
def test_merge_insert(db):
table = LanceTable.create(
db,
"my_table",
data=pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}),
)
assert len(table) == 3
version = table.version
new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
# upsert
table.merge_insert(
"a"
).when_matched_update_all().when_not_matched_insert_all().execute(new_data)
expected = pa.table({"a": [1, 2, 3, 4], "b": ["a", "x", "y", "z"]})
# These `sort_by` calls can be removed once lance#1892
# is merged (it fixes the ordering)
assert table.to_arrow().sort_by("a") == expected
table.restore(version)
# insert-if-not-exists
table.merge_insert("a").when_not_matched_insert_all().execute(new_data)
expected = pa.table({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "z"]})
assert table.to_arrow().sort_by("a") == expected
table.restore(version)
new_data = pa.table({"a": [2, 4], "b": ["x", "z"]})
# replace-range
table.merge_insert(
"a"
).when_matched_update_all().when_not_matched_insert_all().when_not_matched_by_source_delete(
"a > 2"
).execute(new_data)
expected = pa.table({"a": [1, 2, 4], "b": ["a", "x", "z"]})
assert table.to_arrow().sort_by("a") == expected
table.restore(version)
# replace-range no condition
table.merge_insert(
"a"
).when_matched_update_all().when_not_matched_insert_all().when_not_matched_by_source_delete().execute(
new_data
)
expected = pa.table({"a": [2, 4], "b": ["x", "z"]})
assert table.to_arrow().sort_by("a") == expected
def test_create_with_embedding_function(db):
class MyTable(LanceModel):
text: str