mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 13:59:58 +00:00
Compare commits
11 Commits
reproducib
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8dc2315479 | ||
|
|
f6bfb5da11 | ||
|
|
661fcecf38 | ||
|
|
07fe284810 | ||
|
|
800bb691c3 | ||
|
|
ec24e09add | ||
|
|
0554db03b3 | ||
|
|
b315ea3978 | ||
|
|
aa7806cf0d | ||
|
|
6799613109 | ||
|
|
0f26915d22 |
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.2.4
|
||||
current_version = 0.2.5
|
||||
commit = True
|
||||
message = Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
53
.github/workflows/node.yml
vendored
53
.github/workflows/node.yml
vendored
@@ -107,3 +107,56 @@ jobs:
|
||||
- name: Test
|
||||
run: |
|
||||
npm run test
|
||||
aws-integtest:
|
||||
timeout-minutes: 45
|
||||
runs-on: "ubuntu-22.04"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
working-directory: node
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ACCESSKEY
|
||||
AWS_SECRET_ACCESS_KEY: SECRETKEY
|
||||
AWS_DEFAULT_REGION: us-west-2
|
||||
# this one is for s3
|
||||
AWS_ENDPOINT: http://localhost:4566
|
||||
# this one is for dynamodb
|
||||
DYNAMODB_ENDPOINT: http://localhost:4566
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: 18
|
||||
cache: 'npm'
|
||||
cache-dependency-path: node/package-lock.json
|
||||
- name: start local stack
|
||||
run: docker compose -f ../docker-compose.yml up -d
|
||||
- name: create s3
|
||||
run: aws s3 mb s3://lancedb-integtest --endpoint $AWS_ENDPOINT
|
||||
- name: create ddb
|
||||
run: |
|
||||
aws dynamodb create-table \
|
||||
--table-name lancedb-integtest \
|
||||
--attribute-definitions '[{"AttributeName": "base_uri", "AttributeType": "S"}, {"AttributeName": "version", "AttributeType": "N"}]' \
|
||||
--key-schema '[{"AttributeName": "base_uri", "KeyType": "HASH"}, {"AttributeName": "version", "KeyType": "RANGE"}]' \
|
||||
--provisioned-throughput '{"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}' \
|
||||
--endpoint-url $DYNAMODB_ENDPOINT
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
- name: Build
|
||||
run: |
|
||||
npm ci
|
||||
npm run tsc
|
||||
npm run build
|
||||
npm run pack-build
|
||||
npm install --no-save ./dist/lancedb-vectordb-*.tgz
|
||||
# Remove index.node to test with dependency installed
|
||||
rm index.node
|
||||
- name: Test
|
||||
run: npm run integration-test
|
||||
|
||||
23
Cargo.toml
23
Cargo.toml
@@ -1,16 +1,25 @@
|
||||
[workspace]
|
||||
members = [
|
||||
"rust/vectordb",
|
||||
"rust/ffi/node"
|
||||
]
|
||||
members = ["rust/ffi/node", "rust/vectordb"]
|
||||
# Python package needs to be built by maturin.
|
||||
exclude = ["python"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = "=0.6.5"
|
||||
lance = { "version" = "=0.7.4", "features" = ["dynamodb"] }
|
||||
lance-linalg = { "version" = "=0.7.4" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "43.0.0", optional = false }
|
||||
arrow-array = "43.0"
|
||||
arrow-data = "43.0"
|
||||
arrow-schema = "43.0"
|
||||
arrow-ipc = "43.0"
|
||||
half = { "version" = "=2.2.1", default-features = false }
|
||||
arrow-ord = "43.0"
|
||||
arrow-schema = "43.0"
|
||||
arrow-arith = "43.0"
|
||||
arrow-cast = "43.0"
|
||||
half = { "version" = "=2.2.1", default-features = false, features = [
|
||||
"num-traits"
|
||||
] }
|
||||
log = "0.4"
|
||||
object_store = "0.6.1"
|
||||
snafu = "0.7.4"
|
||||
url = "2"
|
||||
|
||||
15
docker-compose.yml
Normal file
15
docker-compose.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
version: "3.9"
|
||||
services:
|
||||
localstack:
|
||||
image: localstack/localstack:0.14
|
||||
ports:
|
||||
- 4566:4566
|
||||
environment:
|
||||
- SERVICES=s3,dynamodb
|
||||
- DEBUG=1
|
||||
- LS_LOG=trace
|
||||
- DOCKER_HOST=unix:///var/run/docker.sock
|
||||
- AWS_ACCESS_KEY_ID=ACCESSKEY
|
||||
- AWS_SECRET_ACCESS_KEY=SECRETKEY
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ]
|
||||
@@ -49,11 +49,11 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
db.create_table("table2", data)
|
||||
|
||||
db["table2"].head()
|
||||
db["table2"].head()
|
||||
```
|
||||
!!! info "Note"
|
||||
Data is converted to Arrow before being written to disk. For maximum control over how data is saved, either provide the PyArrow schema to convert to or else provide a PyArrow Table directly.
|
||||
|
||||
|
||||
```python
|
||||
custom_schema = pa.schema([
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
@@ -66,7 +66,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
### From PyArrow Tables
|
||||
You can also create LanceDB tables directly from pyarrow tables
|
||||
|
||||
|
||||
```python
|
||||
table = pa.Table.from_arrays(
|
||||
[
|
||||
@@ -87,15 +87,15 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
LanceDB supports creating an Apache Arrow schema from a Pydantic BaseModel via the pydantic_to_schema() method.
|
||||
|
||||
```python
|
||||
from lancedb.pydantic import vector, LanceModel
|
||||
from lancedb.pydantic import Vector, LanceModel
|
||||
|
||||
class Content(LanceModel):
|
||||
movie_id: int
|
||||
vector: vector(128)
|
||||
vector: Vector(128)
|
||||
genres: str
|
||||
title: str
|
||||
imdb_id: int
|
||||
|
||||
|
||||
@property
|
||||
def imdb_url(self) -> str:
|
||||
return f"https://www.imdb.com/title/tt{self.imdb_id}"
|
||||
@@ -103,7 +103,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
import pyarrow as pa
|
||||
db = lancedb.connect("~/.lancedb")
|
||||
table_name = "movielens_small"
|
||||
table = db.create_table(table_name, schema=Content.to_arrow_schema())
|
||||
table = db.create_table(table_name, schema=Content)
|
||||
```
|
||||
|
||||
### Using Iterators / Writing Large Datasets
|
||||
@@ -113,7 +113,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
LanceDB additionally supports pyarrow's `RecordBatch` Iterators or other generators producing supported data types.
|
||||
|
||||
Here's an example using a `RecordBatch` iterator for creating tables.
|
||||
|
||||
|
||||
```python
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -142,11 +142,11 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
## Creating Empty Table
|
||||
You can also create empty tables in Python. Initialize a table with a schema and ingest data into it later.
|
||||
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
import pyarrow as pa
|
||||
|
||||
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
@@ -168,8 +168,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
|
||||
class Model(LanceModel):
|
||||
vector: vector(2)
|
||||
|
||||
vector: Vector(2)
|
||||
|
||||
tbl = db.create_table("table5", schema=Model.to_arrow_schema())
|
||||
```
|
||||
|
||||
@@ -249,7 +249,7 @@ After a table has been created, you can always add more data to it using
|
||||
You can also add a large dataset batch in one go using Iterator of any supported data types.
|
||||
|
||||
### Adding to table using Iterator
|
||||
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
@@ -261,10 +261,10 @@ After a table has been created, you can always add more data to it using
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0],
|
||||
})
|
||||
|
||||
|
||||
tbl.add(make_batches())
|
||||
```
|
||||
|
||||
|
||||
The other arguments accepted:
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
@@ -274,7 +274,7 @@ After a table has been created, you can always add more data to it using
|
||||
| on_bad_vectors | str | What to do if any of the vectors are not the same size or contain NaNs. One of "error", "drop", "fill". | drop |
|
||||
| fill_value | float | The value to use when filling vectors. Only used if on_bad_vectors="fill". | 0.0 |
|
||||
|
||||
|
||||
|
||||
=== "Javascript/Typescript"
|
||||
|
||||
```javascript
|
||||
@@ -312,7 +312,7 @@ Use the `delete()` method on tables to delete rows from a table. To choose which
|
||||
# x vector
|
||||
# 0 1 [1.0, 2.0]
|
||||
# 1 3 [5.0, 6.0]
|
||||
```
|
||||
```
|
||||
|
||||
### Delete from a list of values
|
||||
|
||||
@@ -325,7 +325,7 @@ Use the `delete()` method on tables to delete rows from a table. To choose which
|
||||
# x vector
|
||||
# 0 3 [5.0, 6.0]
|
||||
```
|
||||
|
||||
|
||||
=== "Javascript/Typescript"
|
||||
|
||||
```javascript
|
||||
|
||||
@@ -249,11 +249,11 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from lancedb.pydantic import vector, LanceModel\n",
|
||||
"from lancedb.pydantic import Vector, LanceModel\n",
|
||||
"\n",
|
||||
"class Content(LanceModel):\n",
|
||||
" movie_id: int\n",
|
||||
" vector: vector(128)\n",
|
||||
" vector: Vector(128)\n",
|
||||
" genres: str\n",
|
||||
" title: str\n",
|
||||
" imdb_id: int\n",
|
||||
@@ -359,7 +359,7 @@
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"class PydanticSchema(LanceModel):\n",
|
||||
" vector: vector(2)\n",
|
||||
" vector: Vector(2)\n",
|
||||
" item: str\n",
|
||||
" price: float\n",
|
||||
"\n",
|
||||
@@ -394,10 +394,10 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import lancedb\n",
|
||||
"from lancedb.pydantic import LanceModel, vector\n",
|
||||
"from lancedb.pydantic import LanceModel, Vector\n",
|
||||
"\n",
|
||||
"class Model(LanceModel):\n",
|
||||
" vector: vector(2)\n",
|
||||
" vector: Vector(2)\n",
|
||||
"\n",
|
||||
"tbl = db.create_table(\"table6\", schema=Model.to_arrow_schema())"
|
||||
]
|
||||
|
||||
@@ -13,10 +13,10 @@ via [pydantic_to_schema()](python.md##lancedb.pydantic.pydantic_to_schema) metho
|
||||
|
||||
## Vector Field
|
||||
|
||||
LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a
|
||||
LanceDB provides a [`Vector(dim)`](python.md#lancedb.pydantic.Vector) method to define a
|
||||
vector Field in a Pydantic Model.
|
||||
|
||||
::: lancedb.pydantic.vector
|
||||
::: lancedb.pydantic.Vector
|
||||
|
||||
## Type Conversion
|
||||
|
||||
@@ -33,4 +33,4 @@ Current supported type conversions:
|
||||
| `str` | `pyarrow.utf8()` |
|
||||
| `list` | `pyarrow.List` |
|
||||
| `BaseModel` | `pyarrow.Struct` |
|
||||
| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
|
||||
| `Vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
|
||||
|
||||
105
node/package-lock.json
generated
105
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.2.4",
|
||||
"version": "0.2.5",
|
||||
"lockfileVersion": 2,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.2.4",
|
||||
"version": "0.2.5",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -31,6 +31,7 @@
|
||||
"@types/node": "^18.16.2",
|
||||
"@types/sinon": "^10.0.15",
|
||||
"@types/temp": "^0.9.1",
|
||||
"@types/uuid": "^9.0.3",
|
||||
"@typescript-eslint/eslint-plugin": "^5.59.1",
|
||||
"cargo-cp-artifact": "^0.1",
|
||||
"chai": "^4.3.7",
|
||||
@@ -48,14 +49,15 @@
|
||||
"ts-node-dev": "^2.0.0",
|
||||
"typedoc": "^0.24.7",
|
||||
"typedoc-plugin-markdown": "^3.15.3",
|
||||
"typescript": "*"
|
||||
"typescript": "*",
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.4",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.4",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.4"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.5",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.5",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.5"
|
||||
}
|
||||
},
|
||||
"node_modules/@apache-arrow/ts": {
|
||||
@@ -315,9 +317,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.4.tgz",
|
||||
"integrity": "sha512-MqiZXamHYEOfguPsHWLBQ56IabIN6Az8u2Hx8LCyXcxW9gcyJZMSAfJc+CcA4KYHKotv0KsVBhgxZ3kaZQQyiw==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.5.tgz",
|
||||
"integrity": "sha512-V4206SajkMN3o+bBFBAYJq5emlrjevitP0g8RFfVlmj/LS38i8k4uvSe1bICQ2amUrYkL/Jw4ktYn19NRfTU+g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -327,9 +329,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.4.tgz",
|
||||
"integrity": "sha512-DzL+mw5WhKDwXdEFlPh8M9zSDhGnfks7NvEh6ZqKbU6znH206YB7g3OA4WfFyV579IIEQ8jd4v/XDthNzQKuSA==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.5.tgz",
|
||||
"integrity": "sha512-orePizgXCbTJbDJ4bMMnYh/4OgmWDBbHShNxHKQobcX+NgWTexmR0lV1WNOG+DtczBiGH422e3gHJ+xhTO13vg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -339,9 +341,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-LP1nNfIpFxCgcCMlIQdseDX9dZU27TNhCL41xar8euqcetY5uKvi0YqhiVlpNO85Ss1FRQBgQ/GtnOM6Bo7oBQ==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-xIMNwsFGOHeY9EUWCHhUAcA2sCHZ5Lim0sc42uuUOeWayyH+HeR6ZWReptDQRuAoJHqQeag9qcqteE0AZPDTEw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -351,9 +353,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-m4RhOI5JJWPU9Ip2LlRIzXu4mwIv9M//OyAuTLiLKRm8726jQHhYi5VFUEtNzqY0o0p6pS0b3XbifYQ+cyJn3Q==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-Qr8dbHavtE+Zfd45kEORJQe01kRWhMF703gk8zhtZhskDUBCfqm3ap22JIux58tASxVcBqY8EtUFojfYGnQVvA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -363,9 +365,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.4.tgz",
|
||||
"integrity": "sha512-lMF/2e3YkKWnTYv0R7cUCfjMkAqepNaHSc/dvJzCNsFVEhfDsFdScQFLToARs5GGxnq4fOf+MKpaHg/W6QTxiA==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.5.tgz",
|
||||
"integrity": "sha512-jTqkR9HRfbjxhUrlTfveNkJ78tlpVXeNn3BS4wBm4VIsPd75jminKBRYtrlQCWyHusqrUQedKny4hhG1CuNUkg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -596,6 +598,12 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/uuid": {
|
||||
"version": "9.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.3.tgz",
|
||||
"integrity": "sha512-taHQQH/3ZyI3zP8M/puluDEIEvtQHVYcC6y3N8ijFtAd28+Ey/G4sg1u2gB01S8MwybLOKAp9/yCMu/uR5l3Ug==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@typescript-eslint/eslint-plugin": {
|
||||
"version": "5.59.1",
|
||||
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.59.1.tgz",
|
||||
@@ -4451,6 +4459,15 @@
|
||||
"punycode": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/uuid": {
|
||||
"version": "9.0.0",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.0.tgz",
|
||||
"integrity": "sha512-MXcSTerfPa4uqyzStbRoTgt5XIe3x5+42+q1sDuy3R5MDk66URdLMOZe5aPX/SQd+kuYAh0FdP/pO28IkQyTeg==",
|
||||
"dev": true,
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/v8-compile-cache-lib": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
||||
@@ -4852,33 +4869,33 @@
|
||||
}
|
||||
},
|
||||
"@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.4.tgz",
|
||||
"integrity": "sha512-MqiZXamHYEOfguPsHWLBQ56IabIN6Az8u2Hx8LCyXcxW9gcyJZMSAfJc+CcA4KYHKotv0KsVBhgxZ3kaZQQyiw==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.5.tgz",
|
||||
"integrity": "sha512-V4206SajkMN3o+bBFBAYJq5emlrjevitP0g8RFfVlmj/LS38i8k4uvSe1bICQ2amUrYkL/Jw4ktYn19NRfTU+g==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.4.tgz",
|
||||
"integrity": "sha512-DzL+mw5WhKDwXdEFlPh8M9zSDhGnfks7NvEh6ZqKbU6znH206YB7g3OA4WfFyV579IIEQ8jd4v/XDthNzQKuSA==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.5.tgz",
|
||||
"integrity": "sha512-orePizgXCbTJbDJ4bMMnYh/4OgmWDBbHShNxHKQobcX+NgWTexmR0lV1WNOG+DtczBiGH422e3gHJ+xhTO13vg==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-LP1nNfIpFxCgcCMlIQdseDX9dZU27TNhCL41xar8euqcetY5uKvi0YqhiVlpNO85Ss1FRQBgQ/GtnOM6Bo7oBQ==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-xIMNwsFGOHeY9EUWCHhUAcA2sCHZ5Lim0sc42uuUOeWayyH+HeR6ZWReptDQRuAoJHqQeag9qcqteE0AZPDTEw==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-m4RhOI5JJWPU9Ip2LlRIzXu4mwIv9M//OyAuTLiLKRm8726jQHhYi5VFUEtNzqY0o0p6pS0b3XbifYQ+cyJn3Q==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-Qr8dbHavtE+Zfd45kEORJQe01kRWhMF703gk8zhtZhskDUBCfqm3ap22JIux58tASxVcBqY8EtUFojfYGnQVvA==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.4.tgz",
|
||||
"integrity": "sha512-lMF/2e3YkKWnTYv0R7cUCfjMkAqepNaHSc/dvJzCNsFVEhfDsFdScQFLToARs5GGxnq4fOf+MKpaHg/W6QTxiA==",
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.5.tgz",
|
||||
"integrity": "sha512-jTqkR9HRfbjxhUrlTfveNkJ78tlpVXeNn3BS4wBm4VIsPd75jminKBRYtrlQCWyHusqrUQedKny4hhG1CuNUkg==",
|
||||
"optional": true
|
||||
},
|
||||
"@neon-rs/cli": {
|
||||
@@ -5093,6 +5110,12 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"@types/uuid": {
|
||||
"version": "9.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.3.tgz",
|
||||
"integrity": "sha512-taHQQH/3ZyI3zP8M/puluDEIEvtQHVYcC6y3N8ijFtAd28+Ey/G4sg1u2gB01S8MwybLOKAp9/yCMu/uR5l3Ug==",
|
||||
"dev": true
|
||||
},
|
||||
"@typescript-eslint/eslint-plugin": {
|
||||
"version": "5.59.1",
|
||||
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.59.1.tgz",
|
||||
@@ -7844,6 +7867,12 @@
|
||||
"punycode": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"uuid": {
|
||||
"version": "9.0.0",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.0.tgz",
|
||||
"integrity": "sha512-MXcSTerfPa4uqyzStbRoTgt5XIe3x5+42+q1sDuy3R5MDk66URdLMOZe5aPX/SQd+kuYAh0FdP/pO28IkQyTeg==",
|
||||
"dev": true
|
||||
},
|
||||
"v8-compile-cache-lib": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.2.4",
|
||||
"version": "0.2.5",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
@@ -9,6 +9,7 @@
|
||||
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
|
||||
"build-release": "npm run build -- --release",
|
||||
"test": "npm run tsc && mocha -recursive dist/test",
|
||||
"integration-test": "npm run tsc && mocha -recursive dist/integration_test",
|
||||
"lint": "eslint native.js src --ext .js,.ts",
|
||||
"clean": "rm -rf node_modules *.node dist/",
|
||||
"pack-build": "neon pack-build",
|
||||
@@ -34,6 +35,7 @@
|
||||
"@types/node": "^18.16.2",
|
||||
"@types/sinon": "^10.0.15",
|
||||
"@types/temp": "^0.9.1",
|
||||
"@types/uuid": "^9.0.3",
|
||||
"@typescript-eslint/eslint-plugin": "^5.59.1",
|
||||
"cargo-cp-artifact": "^0.1",
|
||||
"chai": "^4.3.7",
|
||||
@@ -51,7 +53,8 @@
|
||||
"ts-node-dev": "^2.0.0",
|
||||
"typedoc": "^0.24.7",
|
||||
"typedoc-plugin-markdown": "^3.15.3",
|
||||
"typescript": "*"
|
||||
"typescript": "*",
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@apache-arrow/ts": "^12.0.0",
|
||||
@@ -78,10 +81,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.4",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.4",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.4"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.5",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.5",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.5"
|
||||
}
|
||||
}
|
||||
|
||||
43
node/src/integration_test/test.ts
Normal file
43
node/src/integration_test/test.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
// Copyright 2023 LanceDB Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import { describe } from 'mocha'
|
||||
import * as chai from 'chai'
|
||||
import * as chaiAsPromised from 'chai-as-promised'
|
||||
import { v4 as uuidv4 } from 'uuid'
|
||||
|
||||
import * as lancedb from '../index'
|
||||
|
||||
const assert = chai.assert
|
||||
chai.use(chaiAsPromised)
|
||||
|
||||
describe('LanceDB AWS Integration test', function () {
|
||||
it('s3+ddb schema is processed correctly', async function () {
|
||||
this.timeout(5000)
|
||||
|
||||
// WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
|
||||
// THE API WILL CHANGE
|
||||
const conn = await lancedb.connect('s3://lancedb-integtest?engine=ddb&ddbTableName=lancedb-integtest')
|
||||
const data = [{ vector: Array(128).fill(1.0) }]
|
||||
|
||||
const tableName = uuidv4()
|
||||
let table = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })
|
||||
|
||||
const futs = [table.add(data), table.add(data), table.add(data), table.add(data), table.add(data)]
|
||||
await Promise.allSettled(futs)
|
||||
|
||||
table = await conn.openTable(tableName)
|
||||
assert.equal(await table.countRows(), 6)
|
||||
})
|
||||
})
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.2.2
|
||||
current_version = 0.2.4
|
||||
commit = True
|
||||
message = [python] Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
@@ -46,7 +46,19 @@ class FixedSizeListMixin(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def vector(
|
||||
def vector(dim: int, value_type: pa.DataType = pa.float32()):
|
||||
# TODO: remove in future release
|
||||
from warnings import warn
|
||||
|
||||
warn(
|
||||
"lancedb.pydantic.vector() is deprecated, use lancedb.pydantic.Vector instead. "
|
||||
"This function will be removed in future release",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return Vector(dim, value_type)
|
||||
|
||||
|
||||
def Vector(
|
||||
dim: int, value_type: pa.DataType = pa.float32()
|
||||
) -> Type[FixedSizeListMixin]:
|
||||
"""Pydantic Vector Type.
|
||||
@@ -65,12 +77,12 @@ def vector(
|
||||
--------
|
||||
|
||||
>>> import pydantic
|
||||
>>> from lancedb.pydantic import vector
|
||||
>>> from lancedb.pydantic import Vector
|
||||
...
|
||||
>>> class MyModel(pydantic.BaseModel):
|
||||
... id: int
|
||||
... url: str
|
||||
... embeddings: vector(768)
|
||||
... embeddings: Vector(768)
|
||||
>>> schema = pydantic_to_schema(MyModel)
|
||||
>>> assert schema == pa.schema([
|
||||
... pa.field("id", pa.int64(), False),
|
||||
@@ -258,11 +270,11 @@ class LanceModel(pydantic.BaseModel):
|
||||
Examples
|
||||
--------
|
||||
>>> import lancedb
|
||||
>>> from lancedb.pydantic import LanceModel, vector
|
||||
>>> from lancedb.pydantic import LanceModel, Vector
|
||||
>>>
|
||||
>>> class TestModel(LanceModel):
|
||||
... name: str
|
||||
... vector: vector(2)
|
||||
... vector: Vector(2)
|
||||
...
|
||||
>>> db = lancedb.connect("/tmp")
|
||||
>>> table = db.create_table("test", schema=TestModel.to_arrow_schema())
|
||||
|
||||
@@ -102,7 +102,8 @@ def _to_record_batch_generator(
|
||||
table = _sanitize_data(batch, schema, metadata, on_bad_vectors, fill_value)
|
||||
for batch in table.to_batches():
|
||||
yield batch
|
||||
yield batch
|
||||
else:
|
||||
yield batch
|
||||
|
||||
|
||||
class Table(ABC):
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
[project]
|
||||
name = "lancedb"
|
||||
version = "0.2.2"
|
||||
version = "0.2.4"
|
||||
dependencies = [
|
||||
"pylance==0.6.5",
|
||||
"pylance==0.7.4",
|
||||
"ratelimiter",
|
||||
"retry",
|
||||
"tqdm",
|
||||
|
||||
@@ -17,7 +17,7 @@ import pyarrow as pa
|
||||
import pytest
|
||||
|
||||
import lancedb
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
|
||||
|
||||
def test_basic(tmp_path):
|
||||
@@ -79,7 +79,7 @@ def test_ingest_pd(tmp_path):
|
||||
|
||||
def test_ingest_iterator(tmp_path):
|
||||
class PydanticSchema(LanceModel):
|
||||
vector: vector(2)
|
||||
vector: Vector(2)
|
||||
item: str
|
||||
price: float
|
||||
|
||||
@@ -143,6 +143,7 @@ def test_ingest_iterator(tmp_path):
|
||||
|
||||
tbl_len = len(tbl)
|
||||
tbl.add(make_batches())
|
||||
assert tbl_len == 50
|
||||
assert len(tbl) == tbl_len * 2
|
||||
assert len(tbl.list_versions()) == 3
|
||||
db.drop_database()
|
||||
|
||||
@@ -19,8 +19,9 @@ from typing import List, Optional
|
||||
import pyarrow as pa
|
||||
import pydantic
|
||||
import pytest
|
||||
from pydantic import Field
|
||||
|
||||
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, pydantic_to_schema, vector
|
||||
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, Vector, pydantic_to_schema
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -107,7 +108,7 @@ def test_pydantic_to_arrow_py38():
|
||||
|
||||
def test_fixed_size_list_field():
|
||||
class TestModel(pydantic.BaseModel):
|
||||
vec: vector(16)
|
||||
vec: Vector(16)
|
||||
li: List[int]
|
||||
|
||||
data = TestModel(vec=list(range(16)), li=[1, 2, 3])
|
||||
@@ -154,7 +155,7 @@ def test_fixed_size_list_field():
|
||||
|
||||
def test_fixed_size_list_validation():
|
||||
class TestModel(pydantic.BaseModel):
|
||||
vec: vector(8)
|
||||
vec: Vector(8)
|
||||
|
||||
with pytest.raises(pydantic.ValidationError):
|
||||
TestModel(vec=range(9))
|
||||
@@ -167,9 +168,12 @@ def test_fixed_size_list_validation():
|
||||
|
||||
def test_lance_model():
|
||||
class TestModel(LanceModel):
|
||||
vec: vector(16)
|
||||
li: List[int]
|
||||
vector: Vector(16) = Field(default=[0.0] * 16)
|
||||
li: List[int] = Field(default=[1, 2, 3])
|
||||
|
||||
schema = pydantic_to_schema(TestModel)
|
||||
assert schema == TestModel.to_arrow_schema()
|
||||
assert TestModel.field_names() == ["vec", "li"]
|
||||
assert TestModel.field_names() == ["vector", "li"]
|
||||
|
||||
t = TestModel()
|
||||
assert t == TestModel(vec=[0.0] * 16, li=[1, 2, 3])
|
||||
|
||||
@@ -20,7 +20,7 @@ import pyarrow as pa
|
||||
import pytest
|
||||
|
||||
from lancedb.db import LanceDBConnection
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.query import LanceVectorQueryBuilder, Query
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
@@ -67,7 +67,7 @@ def table(tmp_path) -> MockTable:
|
||||
|
||||
def test_cast(table):
|
||||
class TestModel(LanceModel):
|
||||
vector: vector(2)
|
||||
vector: Vector(2)
|
||||
id: int
|
||||
str_field: str
|
||||
float_field: float
|
||||
|
||||
@@ -24,7 +24,7 @@ import pytest
|
||||
|
||||
from lancedb.conftest import MockEmbeddingFunction
|
||||
from lancedb.db import LanceDBConnection
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ def test_add(db):
|
||||
|
||||
def test_add_pydantic_model(db):
|
||||
class TestModel(LanceModel):
|
||||
vector: vector(16)
|
||||
vector: Vector(16)
|
||||
li: List[int]
|
||||
|
||||
data = TestModel(vector=list(range(16)), li=[1, 2, 3])
|
||||
@@ -354,7 +354,7 @@ def test_update(db):
|
||||
def test_create_with_embedding_function(db):
|
||||
class MyTable(LanceModel):
|
||||
text: str
|
||||
vector: vector(10)
|
||||
vector: Vector(10)
|
||||
|
||||
func = MockEmbeddingFunction(source_column="text", vector_column="vector")
|
||||
texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
|
||||
@@ -379,7 +379,7 @@ def test_create_with_embedding_function(db):
|
||||
def test_add_with_embedding_function(db):
|
||||
class MyTable(LanceModel):
|
||||
text: str
|
||||
vector: vector(10)
|
||||
vector: Vector(10)
|
||||
|
||||
func = MockEmbeddingFunction(source_column="text", vector_column="vector")
|
||||
table = LanceTable.create(
|
||||
@@ -407,8 +407,8 @@ def test_add_with_embedding_function(db):
|
||||
def test_multiple_vector_columns(db):
|
||||
class MyTable(LanceModel):
|
||||
text: str
|
||||
vector1: vector(10)
|
||||
vector2: vector(10)
|
||||
vector1: Vector(10)
|
||||
vector2: Vector(10)
|
||||
|
||||
table = LanceTable.create(
|
||||
db,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "vectordb-node"
|
||||
version = "0.2.4"
|
||||
version = "0.2.5"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license = "Apache-2.0"
|
||||
edition = "2018"
|
||||
@@ -18,6 +18,7 @@ once_cell = "1"
|
||||
futures = "0.3"
|
||||
half = { workspace = true }
|
||||
lance = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
vectordb = { path = "../../vectordb" }
|
||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||
neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }
|
||||
|
||||
@@ -28,7 +28,9 @@ fn validate_vector_column(record_batch: &RecordBatch) -> Result<()> {
|
||||
record_batch
|
||||
.column_by_name(VECTOR_COLUMN_NAME)
|
||||
.map(|_| ())
|
||||
.context(MissingColumnSnafu { name: VECTOR_COLUMN_NAME })
|
||||
.context(MissingColumnSnafu {
|
||||
name: VECTOR_COLUMN_NAME,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn arrow_buffer_to_record_batch(slice: &[u8]) -> Result<(Vec<RecordBatch>, SchemaRef)> {
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::index::vector::MetricType;
|
||||
use lance_linalg::distance::MetricType;
|
||||
use neon::context::FunctionContext;
|
||||
use neon::prelude::*;
|
||||
use std::convert::TryFrom;
|
||||
|
||||
@@ -183,11 +183,9 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||
let aws_region = get_aws_region(&mut cx, 4)?;
|
||||
|
||||
let params = ReadParams {
|
||||
store_options: Some(ObjectStoreParams {
|
||||
aws_credentials: aws_creds,
|
||||
aws_region,
|
||||
..ObjectStoreParams::default()
|
||||
}),
|
||||
store_options: Some(ObjectStoreParams::with_aws_credentials(
|
||||
aws_creds, aws_region,
|
||||
)),
|
||||
..ReadParams::default()
|
||||
};
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::ops::Deref;
|
||||
|
||||
use arrow_array::Float32Array;
|
||||
use futures::{TryFutureExt, TryStreamExt};
|
||||
use lance::index::vector::MetricType;
|
||||
use lance_linalg::distance::MetricType;
|
||||
use neon::context::FunctionContext;
|
||||
use neon::handle::Handle;
|
||||
use neon::prelude::*;
|
||||
|
||||
@@ -43,7 +43,8 @@ impl JsTable {
|
||||
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
|
||||
let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||
let buffer = cx.argument::<JsBuffer>(1)?;
|
||||
let (batches, schema) = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
let (batches, schema) =
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
|
||||
// Write mode
|
||||
let mode = match cx.argument::<JsString>(2)?.value(&mut cx).as_str() {
|
||||
@@ -65,11 +66,9 @@ impl JsTable {
|
||||
let aws_region = get_aws_region(&mut cx, 6)?;
|
||||
|
||||
let params = WriteParams {
|
||||
store_params: Some(ObjectStoreParams {
|
||||
aws_credentials: aws_creds,
|
||||
aws_region,
|
||||
..ObjectStoreParams::default()
|
||||
}),
|
||||
store_params: Some(ObjectStoreParams::with_aws_credentials(
|
||||
aws_creds, aws_region,
|
||||
)),
|
||||
mode: mode,
|
||||
..WriteParams::default()
|
||||
};
|
||||
@@ -92,7 +91,8 @@ impl JsTable {
|
||||
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||
let buffer = cx.argument::<JsBuffer>(0)?;
|
||||
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
|
||||
let (batches, schema) = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
let (batches, schema) =
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
let rt = runtime(&mut cx)?;
|
||||
let channel = cx.channel();
|
||||
let mut table = js_table.table.clone();
|
||||
@@ -108,11 +108,9 @@ impl JsTable {
|
||||
let aws_region = get_aws_region(&mut cx, 5)?;
|
||||
|
||||
let params = WriteParams {
|
||||
store_params: Some(ObjectStoreParams {
|
||||
aws_credentials: aws_creds,
|
||||
aws_region,
|
||||
..ObjectStoreParams::default()
|
||||
}),
|
||||
store_params: Some(ObjectStoreParams::with_aws_credentials(
|
||||
aws_creds, aws_region,
|
||||
)),
|
||||
mode: write_mode,
|
||||
..WriteParams::default()
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "vectordb"
|
||||
version = "0.2.4"
|
||||
version = "0.2.5"
|
||||
edition = "2021"
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license = "Apache-2.0"
|
||||
@@ -10,14 +10,21 @@ categories = ["database-implementations"]
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
[dependencies]
|
||||
arrow = { workspace = true }
|
||||
arrow-array = { workspace = true }
|
||||
arrow-data = { workspace = true }
|
||||
arrow-schema = { workspace = true }
|
||||
arrow-ord = { workspace = true }
|
||||
arrow-cast = { workspace = true }
|
||||
object_store = { workspace = true }
|
||||
snafu = { workspace = true }
|
||||
half = { workspace = true }
|
||||
lance = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||
log = { workspace = true }
|
||||
num-traits = "0"
|
||||
url = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.5.0"
|
||||
|
||||
15
rust/vectordb/src/arrow.rs
Normal file
15
rust/vectordb/src/arrow.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub use lance::arrow::*;
|
||||
18
rust/vectordb/src/data.rs
Normal file
18
rust/vectordb/src/data.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Data types, schema coercion, data cleaning, etc.
|
||||
|
||||
pub mod inspect;
|
||||
pub mod sanitize;
|
||||
180
rust/vectordb/src/data/inspect.rs
Normal file
180
rust/vectordb/src/data/inspect.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use arrow::compute::kernels::{aggregate::bool_and, length::length};
|
||||
use arrow_array::{
|
||||
cast::AsArray,
|
||||
types::{ArrowPrimitiveType, Int32Type, Int64Type},
|
||||
Array, GenericListArray, OffsetSizeTrait, RecordBatchReader,
|
||||
};
|
||||
use arrow_ord::comparison::eq_dyn_scalar;
|
||||
use arrow_schema::DataType;
|
||||
use num_traits::{ToPrimitive, Zero};
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
|
||||
pub(crate) fn infer_dimension<T: ArrowPrimitiveType>(
|
||||
list_arr: &GenericListArray<T::Native>,
|
||||
) -> Result<Option<T::Native>>
|
||||
where
|
||||
T::Native: OffsetSizeTrait + ToPrimitive,
|
||||
{
|
||||
let len_arr = length(list_arr)?;
|
||||
if len_arr.is_empty() {
|
||||
return Ok(Some(Zero::zero()));
|
||||
}
|
||||
|
||||
let dim = len_arr.as_primitive::<T>().value(0);
|
||||
if bool_and(&eq_dyn_scalar(len_arr.as_primitive::<T>(), dim)?) != Some(true) {
|
||||
Ok(None)
|
||||
} else {
|
||||
Ok(Some(dim))
|
||||
}
|
||||
}
|
||||
|
||||
/// Infer the vector columns from a dataset.
|
||||
///
|
||||
/// Parameters
|
||||
/// ----------
|
||||
/// - reader: RecordBatchReader
|
||||
/// - strict: if set true, only fixed_size_list<float> is considered as vector column. If set to false,
|
||||
/// a list<float> column with same length is also considered as vector column.
|
||||
pub fn infer_vector_columns(
|
||||
reader: impl RecordBatchReader + Send,
|
||||
strict: bool,
|
||||
) -> Result<Vec<String>> {
|
||||
let mut columns = vec![];
|
||||
|
||||
let mut columns_to_infer: HashMap<String, Option<i64>> = HashMap::new();
|
||||
for field in reader.schema().fields() {
|
||||
match field.data_type() {
|
||||
DataType::FixedSizeList(sub_field, _) if sub_field.data_type().is_floating() => {
|
||||
columns.push(field.name().to_string());
|
||||
}
|
||||
DataType::List(sub_field) if sub_field.data_type().is_floating() && !strict => {
|
||||
columns_to_infer.insert(field.name().to_string(), None);
|
||||
}
|
||||
DataType::LargeList(sub_field) if sub_field.data_type().is_floating() && !strict => {
|
||||
columns_to_infer.insert(field.name().to_string(), None);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
for batch in reader {
|
||||
let batch = batch?;
|
||||
let col_names = columns_to_infer.keys().cloned().collect::<Vec<_>>();
|
||||
for col_name in col_names {
|
||||
let col = batch.column_by_name(&col_name).ok_or(Error::Schema {
|
||||
message: format!("Column {} not found", col_name),
|
||||
})?;
|
||||
if let Some(dim) = match *col.data_type() {
|
||||
DataType::List(_) => {
|
||||
infer_dimension::<Int32Type>(col.as_list::<i32>())?.map(|d| d as i64)
|
||||
}
|
||||
DataType::LargeList(_) => infer_dimension::<Int64Type>(col.as_list::<i64>())?,
|
||||
_ => {
|
||||
return Err(Error::Schema {
|
||||
message: format!("Column {} is not a list", col_name),
|
||||
})
|
||||
}
|
||||
} {
|
||||
if let Some(Some(prev_dim)) = columns_to_infer.get(&col_name) {
|
||||
if prev_dim != &dim {
|
||||
columns_to_infer.remove(&col_name);
|
||||
}
|
||||
} else {
|
||||
columns_to_infer.insert(col_name, Some(dim));
|
||||
}
|
||||
} else {
|
||||
columns_to_infer.remove(&col_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
columns.extend(columns_to_infer.keys().cloned());
|
||||
Ok(columns)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use arrow_array::{
        types::{Float32Type, Float64Type},
        FixedSizeListArray, Float32Array, ListArray, RecordBatch, RecordBatchIterator, StringArray,
    };
    use arrow_schema::{DataType, Field, Schema};
    use std::{sync::Arc, vec};

    /// A uniform-length float list and a fixed-size float list are detected as
    /// vector columns; a ragged list is not, and `strict` keeps only the
    /// fixed-size list.
    #[test]
    fn test_infer_vector_columns() {
        let f32_item = || Arc::new(Field::new("item", DataType::Float32, true));
        let f64_item = || Arc::new(Field::new("item", DataType::Float64, true));
        let schema = Arc::new(Schema::new(vec![
            Field::new("f", DataType::Float32, false),
            Field::new("s", DataType::Utf8, false),
            Field::new("l1", DataType::List(f32_item()), false),
            Field::new("l2", DataType::List(f64_item()), false),
            Field::new("fl", DataType::FixedSizeList(f32_item(), 32), true),
        ]));

        let scalars = Float32Array::from(vec![1.0, 2.0, 3.0]);
        let strings = StringArray::from(vec!["a", "b", "c"]);
        // Every list has four elements.
        let uniform_lists = ListArray::from_iter_primitive::<Float32Type, _, _>(
            (0..3).map(|_| Some(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0)])),
        );
        // Var-length list
        let ragged_lists = ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
            Some(vec![Some(1.0_f64)]),
            Some(vec![Some(2.0_f64), Some(3.0_f64)]),
            Some(vec![Some(4.0_f64), Some(5.0_f64), Some(6.0_f64)]),
        ]);
        let fixed_lists = FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(
            vec![
                Some(vec![Some(1.0); 32]),
                Some(vec![Some(2.0); 32]),
                Some(vec![Some(3.0); 32]),
            ],
            32,
        );

        let batch = RecordBatch::try_new(
            schema.clone(),
            vec![
                Arc::new(scalars),
                Arc::new(strings),
                Arc::new(uniform_lists),
                Arc::new(ragged_lists),
                Arc::new(fixed_lists),
            ],
        )
        .unwrap();

        let lenient_reader =
            RecordBatchIterator::new(vec![batch.clone()].into_iter().map(Ok), schema.clone());
        let lenient_cols = infer_vector_columns(lenient_reader, false).unwrap();
        assert_eq!(lenient_cols, vec!["fl", "l1"]);

        let strict_reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
        let strict_cols = infer_vector_columns(strict_reader, true).unwrap();
        assert_eq!(strict_cols, vec!["fl"]);
    }
}
|
||||
284
rust/vectordb/src/data/sanitize.rs
Normal file
284
rust/vectordb/src/data/sanitize.rs
Normal file
@@ -0,0 +1,284 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::{iter::repeat_with, sync::Arc};
|
||||
|
||||
use arrow_array::{
|
||||
cast::AsArray,
|
||||
types::{Float16Type, Float32Type, Float64Type, Int32Type, Int64Type},
|
||||
Array, ArrowNumericType, FixedSizeListArray, PrimitiveArray, RecordBatch, RecordBatchIterator,
|
||||
RecordBatchReader,
|
||||
};
|
||||
use arrow_cast::{can_cast_types, cast};
|
||||
use arrow_schema::{ArrowError, DataType, Field, Schema};
|
||||
use half::f16;
|
||||
use lance::arrow::{DataTypeExt, FixedSizeListArrayExt};
|
||||
use log::warn;
|
||||
use num_traits::cast::AsPrimitive;
|
||||
|
||||
use super::inspect::infer_dimension;
|
||||
use crate::error::Result;
|
||||
|
||||
fn cast_array<I: ArrowNumericType, O: ArrowNumericType>(
|
||||
arr: &PrimitiveArray<I>,
|
||||
) -> Arc<PrimitiveArray<O>>
|
||||
where
|
||||
I::Native: AsPrimitive<O::Native>,
|
||||
{
|
||||
Arc::new(PrimitiveArray::<O>::from_iter_values(
|
||||
arr.values().iter().map(|v| (*v).as_()),
|
||||
))
|
||||
}
|
||||
|
||||
fn cast_float_array<I: ArrowNumericType>(
|
||||
arr: &PrimitiveArray<I>,
|
||||
dt: &DataType,
|
||||
) -> std::result::Result<Arc<dyn Array>, ArrowError>
|
||||
where
|
||||
I::Native: AsPrimitive<f64> + AsPrimitive<f32> + AsPrimitive<f16>,
|
||||
{
|
||||
match dt {
|
||||
DataType::Float16 => Ok(cast_array::<I, Float16Type>(arr)),
|
||||
DataType::Float32 => Ok(cast_array::<I, Float32Type>(arr)),
|
||||
DataType::Float64 => Ok(cast_array::<I, Float64Type>(arr)),
|
||||
_ => Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible change field: unable to coerce {:?} to {:?}",
|
||||
arr.data_type(),
|
||||
dt
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn coerce_array(
|
||||
array: &Arc<dyn Array>,
|
||||
field: &Field,
|
||||
) -> std::result::Result<Arc<dyn Array>, ArrowError> {
|
||||
if array.data_type() == field.data_type() {
|
||||
return Ok(array.clone());
|
||||
}
|
||||
match (array.data_type(), field.data_type()) {
|
||||
// Normal cast-able types.
|
||||
(adt, dt) if can_cast_types(adt, dt) => cast(&array, dt),
|
||||
// Casting between f16/f32/f64 can be lossy.
|
||||
(adt, dt) if (adt.is_floating() || dt.is_floating()) => {
|
||||
if adt.byte_width() > dt.byte_width() {
|
||||
warn!(
|
||||
"Coercing field {} {:?} to {:?} might lose precision",
|
||||
field.name(),
|
||||
adt,
|
||||
dt
|
||||
);
|
||||
}
|
||||
match adt {
|
||||
DataType::Float16 => cast_float_array(array.as_primitive::<Float16Type>(), dt),
|
||||
DataType::Float32 => cast_float_array(array.as_primitive::<Float32Type>(), dt),
|
||||
DataType::Float64 => cast_float_array(array.as_primitive::<Float64Type>(), dt),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
(adt, DataType::FixedSizeList(exp_field, exp_dim)) => match adt {
|
||||
// Cast a float fixed size array with same dimension to the expected type.
|
||||
DataType::FixedSizeList(_, dim) if dim == exp_dim => {
|
||||
let actual_sub = array.as_fixed_size_list();
|
||||
let values = coerce_array(actual_sub.values(), exp_field)?;
|
||||
Ok(Arc::new(FixedSizeListArray::try_new_from_values(
|
||||
values.clone(),
|
||||
*dim,
|
||||
)?) as Arc<dyn Array>)
|
||||
}
|
||||
DataType::List(_) | DataType::LargeList(_) => {
|
||||
let Some(dim) = (match adt {
|
||||
DataType::List(_) => infer_dimension::<Int32Type>(array.as_list::<i32>())
|
||||
.map_err(|e| {
|
||||
ArrowError::SchemaError(format!(
|
||||
"failed to infer dimension from list: {}",
|
||||
e
|
||||
))
|
||||
})?
|
||||
.map(|d| d as i64),
|
||||
DataType::LargeList(_) => infer_dimension::<Int64Type>(array.as_list::<i64>())
|
||||
.map_err(|e| {
|
||||
ArrowError::SchemaError(format!(
|
||||
"failed to infer dimension from large list: {}",
|
||||
e
|
||||
))
|
||||
})?,
|
||||
_ => unreachable!(),
|
||||
}) else {
|
||||
return Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible coerce fixed size list: unable to coerce {:?} from {:?}",
|
||||
field,
|
||||
array.data_type()
|
||||
)));
|
||||
};
|
||||
|
||||
if dim != *exp_dim as i64 {
|
||||
return Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible coerce fixed size list: expected dimension {} but got {}",
|
||||
exp_dim, dim
|
||||
)));
|
||||
}
|
||||
|
||||
let values = coerce_array(array, exp_field)?;
|
||||
Ok(Arc::new(FixedSizeListArray::try_new_from_values(
|
||||
values.clone(),
|
||||
*exp_dim,
|
||||
)?) as Arc<dyn Array>)
|
||||
}
|
||||
_ => Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible coerce fixed size list: unable to coerce {:?} from {:?}",
|
||||
field,
|
||||
array.data_type()
|
||||
)))?,
|
||||
},
|
||||
_ => Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible change field {}: unable to coerce {:?} to {:?}",
|
||||
field.name(),
|
||||
array.data_type(),
|
||||
field.data_type()
|
||||
)))?,
|
||||
}
|
||||
}
|
||||
|
||||
fn coerce_schema_batch(
|
||||
batch: RecordBatch,
|
||||
schema: Arc<Schema>,
|
||||
) -> std::result::Result<RecordBatch, ArrowError> {
|
||||
if batch.schema() == schema {
|
||||
return Ok(batch);
|
||||
}
|
||||
let columns = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field| {
|
||||
batch
|
||||
.column_by_name(field.name())
|
||||
.ok_or_else(|| {
|
||||
ArrowError::SchemaError(format!("Column {} not found", field.name()))
|
||||
})
|
||||
.and_then(|c| coerce_array(c, field))
|
||||
})
|
||||
.collect::<std::result::Result<Vec<_>, ArrowError>>()?;
|
||||
RecordBatch::try_new(schema, columns)
|
||||
}
|
||||
|
||||
/// Coerce the reader (input data) to match the given [Schema].
|
||||
///
|
||||
pub fn coerce_schema(
|
||||
reader: impl RecordBatchReader + Send + 'static,
|
||||
schema: Arc<Schema>,
|
||||
) -> Result<Box<dyn RecordBatchReader + Send>> {
|
||||
if reader.schema() == schema {
|
||||
return Ok(Box::new(RecordBatchIterator::new(reader, schema)));
|
||||
}
|
||||
let s = schema.clone();
|
||||
let batches = reader
|
||||
.zip(repeat_with(move || s.clone()))
|
||||
.map(|(batch, s)| coerce_schema_batch(batch?, s));
|
||||
Ok(Box::new(RecordBatchIterator::new(batches, schema)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use arrow_array::{
        FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int32Array, Int8Array,
        RecordBatch, RecordBatchIterator, StringArray,
    };
    use arrow_schema::Field;
    use half::f16;
    use lance::arrow::FixedSizeListArrayExt;

    /// The nullable 64-wide fixed size list field "fl" with the given item type.
    fn vector_field(item_type: DataType) -> Field {
        Field::new(
            "fl",
            DataType::FixedSizeList(Arc::new(Field::new("item", item_type, true)), 64),
            true,
        )
    }

    /// The string column shared by the input and the expected batch.
    fn words() -> StringArray {
        StringArray::from(vec![
            Some("hello"),
            Some("world"),
            Some("from"),
            Some("lance"),
        ])
    }

    /// Coerces f32 vectors to f16, f16 scalars to f64 and i32 to i8 in one pass.
    #[test]
    fn test_coerce_list_to_fixed_size_list() {
        let schema = Arc::new(Schema::new(vec![
            vector_field(DataType::Float32),
            Field::new("s", DataType::Utf8, true),
            Field::new("f", DataType::Float16, true),
            Field::new("i", DataType::Int32, true),
        ]));

        let f32_vectors = FixedSizeListArray::try_new_from_values(
            Float32Array::from_iter_values((0..256).map(|v| v as f32)),
            64,
        )
        .unwrap();
        let f16_scalars = Float16Array::from_iter_values((0..4).map(|v| f16::from_f32(v as f32)));
        let batch = RecordBatch::try_new(
            schema.clone(),
            vec![
                Arc::new(f32_vectors),
                Arc::new(words()),
                Arc::new(f16_scalars),
                Arc::new(Int32Array::from_iter_values(0..4)),
            ],
        )
        .unwrap();
        let reader =
            RecordBatchIterator::new(vec![batch.clone()].into_iter().map(Ok), schema.clone());

        let expected_schema = Arc::new(Schema::new(vec![
            vector_field(DataType::Float16),
            Field::new("s", DataType::Utf8, true),
            Field::new("f", DataType::Float64, true),
            Field::new("i", DataType::Int8, true),
        ]));
        let stream = coerce_schema(reader, expected_schema.clone()).unwrap();
        let coerced: Vec<_> = stream.collect();
        assert_eq!(coerced.len(), 1);
        let actual = coerced[0].as_ref().unwrap();
        assert_eq!(actual.schema(), expected_schema);

        let f16_vectors = FixedSizeListArray::try_new_from_values(
            Float16Array::from_iter_values((0..256).map(|v| f16::from_f32(v as f32))),
            64,
        )
        .unwrap();
        let expected = RecordBatch::try_new(
            expected_schema,
            vec![
                Arc::new(f16_vectors),
                Arc::new(words()),
                Arc::new(Float64Array::from_iter_values((0..4).map(|v| v as f64))),
                Arc::new(Int8Array::from_iter_values(0..4)),
            ],
        )
        .unwrap();
        assert_eq!(actual, &expected);
    }
}
|
||||
@@ -20,19 +20,32 @@ use lance::dataset::WriteParams;
|
||||
use lance::io::object_store::ObjectStore;
|
||||
use snafu::prelude::*;
|
||||
|
||||
use crate::error::{CreateDirSnafu, InvalidTableNameSnafu, Result};
|
||||
use crate::error::{CreateDirSnafu, Error, InvalidTableNameSnafu, Result};
|
||||
use crate::table::{ReadParams, Table};
|
||||
|
||||
pub const LANCE_FILE_EXTENSION: &str = "lance";
|
||||
|
||||
pub struct Database {
|
||||
object_store: ObjectStore,
|
||||
query_string: Option<String>,
|
||||
|
||||
pub(crate) uri: String,
|
||||
pub(crate) base_path: object_store::path::Path,
|
||||
}
|
||||
|
||||
const LANCE_EXTENSION: &str = "lance";
|
||||
const ENGINE: &str = "engine";
|
||||
|
||||
/// Parse a url, if it's not a valid url, assume it's a local file
|
||||
/// and try to parse with file:// appended
|
||||
fn parse_url(url: &str) -> Result<url::Url> {
|
||||
match url::Url::parse(url) {
|
||||
Ok(url) => Ok(url),
|
||||
Err(_) => url::Url::parse(format!("file://{}", url).as_str()).map_err(|e| Error::Lance {
|
||||
message: format!("Failed to parse uri: {}", e),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// A connection to LanceDB
|
||||
impl Database {
|
||||
@@ -46,12 +59,71 @@ impl Database {
|
||||
///
|
||||
/// * A [Database] object.
|
||||
pub async fn connect(uri: &str) -> Result<Database> {
|
||||
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
|
||||
// For a native (using lance directly) connection
|
||||
// The DB doesn't use any uri parameters, but lance does
|
||||
// So we need to parse the uri, extract the query string, and propagate it to lance
|
||||
let mut url = parse_url(uri)?;
|
||||
|
||||
// special handling for windows
|
||||
if url.scheme().len() == 1 && cfg!(windows) {
|
||||
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
|
||||
}
|
||||
return Ok(Database {
|
||||
uri: uri.to_string(),
|
||||
query_string: None,
|
||||
base_path,
|
||||
object_store,
|
||||
});
|
||||
}
|
||||
|
||||
// iter thru the query params and extract the commit store param
|
||||
let mut engine = None;
|
||||
let mut filtered_querys = vec![];
|
||||
|
||||
// WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
|
||||
// THE API WILL CHANGE
|
||||
for (key, value) in url.query_pairs() {
|
||||
if key == ENGINE {
|
||||
engine = Some(value.to_string());
|
||||
} else {
|
||||
// to owned so we can modify the url
|
||||
filtered_querys.push((key.to_string(), value.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
// Filter out the commit store query param -- it's a lancedb param
|
||||
url.query_pairs_mut().clear();
|
||||
url.query_pairs_mut().extend_pairs(filtered_querys);
|
||||
// Take a copy of the query string so we can propagate it to lance
|
||||
let query_string = url.query().map(|s| s.to_string());
|
||||
// clear the query string so we can use the url as the base uri
|
||||
// use .set_query(None) instead of .set_query("") because the latter
|
||||
// will add a trailing '?' to the url
|
||||
url.set_query(None);
|
||||
|
||||
let table_base_uri = if let Some(store) = engine {
|
||||
static WARN_ONCE: std::sync::Once = std::sync::Once::new();
|
||||
WARN_ONCE.call_once(|| {
|
||||
log::warn!("Specifing engine is not a publicly supported feature in lancedb yet. THE API WILL CHANGE");
|
||||
});
|
||||
let old_scheme = url.scheme().to_string();
|
||||
let new_scheme = format!("{}+{}", old_scheme, store);
|
||||
url.to_string().replacen(&old_scheme, &new_scheme, 1)
|
||||
} else {
|
||||
url.to_string()
|
||||
};
|
||||
|
||||
let plain_uri = url.to_string();
|
||||
let (object_store, base_path) = ObjectStore::from_uri(&plain_uri).await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
|
||||
}
|
||||
|
||||
Ok(Database {
|
||||
uri: uri.to_string(),
|
||||
uri: table_base_uri,
|
||||
query_string,
|
||||
base_path,
|
||||
object_store,
|
||||
})
|
||||
@@ -149,11 +221,19 @@ impl Database {
|
||||
let path = Path::new(&self.uri);
|
||||
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
|
||||
|
||||
let uri = table_uri
|
||||
let mut uri = table_uri
|
||||
.as_path()
|
||||
.to_str()
|
||||
.context(InvalidTableNameSnafu { name })?;
|
||||
Ok(uri.to_string())
|
||||
.context(InvalidTableNameSnafu { name })?
|
||||
.to_string();
|
||||
|
||||
// If there are query string set on the connection, propagate to lance
|
||||
if let Some(query) = self.query_string.as_ref() {
|
||||
uri.push('?');
|
||||
uri.push_str(query.as_str());
|
||||
}
|
||||
|
||||
Ok(uri)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -170,7 +250,15 @@ mod tests {
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let db = Database::connect(uri).await.unwrap();
|
||||
|
||||
assert_eq!(db.uri, uri);
|
||||
// file:// scheme should be automatically appended if not specified
|
||||
// windows path come with drive letter, so file:// won't be appended
|
||||
let expected = if cfg!(windows) {
|
||||
uri.to_string()
|
||||
} else {
|
||||
format!("file://{}", uri)
|
||||
};
|
||||
|
||||
assert_eq!(db.uri, expected);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use arrow_schema::ArrowError;
|
||||
use snafu::Snafu;
|
||||
|
||||
#[derive(Debug, Snafu)]
|
||||
@@ -32,10 +33,20 @@ pub enum Error {
|
||||
Store { message: String },
|
||||
#[snafu(display("LanceDBError: {message}"))]
|
||||
Lance { message: String },
|
||||
#[snafu(display("LanceDB Schema Error: {message}"))]
|
||||
Schema { message: String },
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl From<ArrowError> for Error {
|
||||
fn from(e: ArrowError) -> Self {
|
||||
Self::Lance {
|
||||
message: e.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lance::Error> for Error {
|
||||
fn from(e: lance::Error) -> Self {
|
||||
Self::Lance {
|
||||
|
||||
@@ -14,7 +14,8 @@
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::index::vector::{MetricType, VectorIndexParams};
|
||||
use lance::index::vector::VectorIndexParams;
|
||||
use lance_linalg::distance::MetricType;
|
||||
|
||||
pub trait VectorIndexBuilder {
|
||||
fn get_column(&self) -> Option<String>;
|
||||
@@ -107,9 +108,11 @@ impl VectorIndexBuilder for IvfPQIndexBuilder {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::index::vector::{MetricType, StageParams};
|
||||
use lance::index::vector::StageParams;
|
||||
|
||||
use crate::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod data;
|
||||
pub mod database;
|
||||
pub mod error;
|
||||
pub mod index;
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::sync::Arc;
|
||||
use arrow_array::Float32Array;
|
||||
use lance::dataset::scanner::{DatasetRecordBatchStream, Scanner};
|
||||
use lance::dataset::Dataset;
|
||||
use lance::index::vector::MetricType;
|
||||
use lance_linalg::distance::MetricType;
|
||||
|
||||
use crate::error::Result;
|
||||
|
||||
@@ -164,10 +164,10 @@ impl Query {
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::*;
|
||||
use arrow_array::{Float32Array, RecordBatch, RecordBatchIterator, RecordBatchReader};
|
||||
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
|
||||
use lance::dataset::Dataset;
|
||||
use lance::index::vector::MetricType;
|
||||
|
||||
use crate::query::Query;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user