Compare commits

...

9 Commits

Author SHA1 Message Date
Lance Release
661fcecf38 [python] Bump version: 0.2.2 → 0.2.3 2023-09-14 17:48:32 +00:00
Lance Release
07fe284810 Updating package-lock.json 2023-09-10 23:58:06 +00:00
Lance Release
800bb691c3 Updating package-lock.json 2023-09-09 19:45:58 +00:00
Lance Release
ec24e09add Bump version: 0.2.4 → 0.2.5 2023-09-09 19:45:43 +00:00
Rob Meng
0554db03b3 propagate uri query string to lance; add aws integration tests (#486)
# WARNING: specifying engine is NOT a publicly supported feature in lancedb yet. THE API WILL CHANGE.

This PR exposes the DynamoDB-based commit store to `vectordb` and the JS SDK (Python will follow in a separate PR, since it is on a different release track).

This PR also adds an AWS integration test using `localstack`.

## What?
This PR adds URI parameters to the DB connection string. Users may specify `engine` in the connection string to tell LanceDB to use an external commit store when reading and writing a table. Users may also pass any parameters required by the commit store in the connection string; these parameters are propagated to lance.

e.g.
```
vectordb.connect("s3://my-db-bucket?engine=ddb&ddbTableName=my-commit-table")
```
will automatically convert the table path to
```
s3+ddb://my-db-bucket/my_table.lance?&ddbTableName=my-commit-table
```
2023-09-09 13:33:16 -04:00
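As a rough illustration of the JS SDK usage this enables — a minimal sketch based on the description above and on the integration test added in this PR; the bucket and DynamoDB table names are placeholders, and the `engine` parameter is still experimental:
```typescript
import * as lancedb from 'vectordb'

async function main (): Promise<void> {
  // WARNING (per the PR): specifying `engine` is not a publicly supported
  // feature yet and the API will change.
  // `engine=ddb` selects the DynamoDB-based commit store; `ddbTableName` and
  // any other query parameters are propagated to lance.
  const db = await lancedb.connect('s3://my-db-bucket?engine=ddb&ddbTableName=my-commit-table')

  // Table paths opened through this connection resolve to an
  // `s3+ddb://my-db-bucket/<table>.lance?...` URI, as described above.
  const data = [{ vector: Array(128).fill(1.0) }]
  const table = await db.createTable('my_table', data, { writeMode: lancedb.WriteMode.Overwrite })
  await table.add([{ vector: Array(128).fill(0.5) }])
}

main().catch(console.error)
```
Aside from the query parameters on the URI, the connection behaves like any other `vectordb` connection.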
Lei Xu
b315ea3978 [Python] Pydantic vector field with default value (#474)
Rename `lance.pydantic.vector` to `Vector` and deprecate `vector(dim)`
2023-09-08 22:35:31 -07:00
Ayush Chaurasia
aa7806cf0d [Python] Fix record_batch_generator (#483)
Should fix - https://github.com/lancedb/lancedb/issues/482
2023-09-08 21:18:50 +05:30
Lei Xu
6799613109 feat: upgrade lance to 0.7.3 (#481) 2023-09-07 17:01:45 -07:00
Lei Xu
0f26915d22 [Rust] schema coerce and vector column inference (#476)
Split the rust core from #466 for easy review and less merge conflicts.
2023-09-06 10:00:46 -07:00
30 changed files with 903 additions and 133 deletions

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.4
current_version = 0.2.5
commit = True
message = Bump version: {current_version} → {new_version}
tag = True

View File

@@ -107,3 +107,56 @@ jobs:
- name: Test
run: |
npm run test
aws-integtest:
timeout-minutes: 45
runs-on: "ubuntu-22.04"
defaults:
run:
shell: bash
working-directory: node
env:
AWS_ACCESS_KEY_ID: ACCESSKEY
AWS_SECRET_ACCESS_KEY: SECRETKEY
AWS_DEFAULT_REGION: us-west-2
# this one is for s3
AWS_ENDPOINT: http://localhost:4566
# this one is for dynamodb
DYNAMODB_ENDPOINT: http://localhost:4566
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
with:
node-version: 18
cache: 'npm'
cache-dependency-path: node/package-lock.json
- name: start local stack
run: docker compose -f ../docker-compose.yml up -d
- name: create s3
run: aws s3 mb s3://lancedb-integtest --endpoint $AWS_ENDPOINT
- name: create ddb
run: |
aws dynamodb create-table \
--table-name lancedb-integtest \
--attribute-definitions '[{"AttributeName": "base_uri", "AttributeType": "S"}, {"AttributeName": "version", "AttributeType": "N"}]' \
--key-schema '[{"AttributeName": "base_uri", "KeyType": "HASH"}, {"AttributeName": "version", "KeyType": "RANGE"}]' \
--provisioned-throughput '{"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}' \
--endpoint-url $DYNAMODB_ENDPOINT
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/lancedb-vectordb-*.tgz
# Remove index.node to test with dependency installed
rm index.node
- name: Test
run: npm run integration-test

View File

@@ -1,16 +1,24 @@
[workspace]
members = [
"rust/vectordb",
"rust/ffi/node"
]
members = ["rust/ffi/node", "rust/vectordb"]
# Python package needs to be built by maturin.
exclude = ["python"]
resolver = "2"
[workspace.dependencies]
lance = "=0.6.5"
lance = { "version" = "=0.7.3", "features" = ["dynamodb"] }
# Note that this one does not include pyarrow
arrow = { version = "43.0.0", optional = false }
arrow-array = "43.0"
arrow-data = "43.0"
arrow-schema = "43.0"
arrow-ipc = "43.0"
half = { "version" = "=2.2.1", default-features = false }
arrow-ord = "43.0"
arrow-schema = "43.0"
arrow-arith = "43.0"
arrow-cast = "43.0"
half = { "version" = "=2.2.1", default-features = false, features = [
"num-traits"
] }
log = "0.4"
object_store = "0.6.1"
snafu = "0.7.4"
url = "2"

15
docker-compose.yml Normal file
View File

@@ -0,0 +1,15 @@
version: "3.9"
services:
localstack:
image: localstack/localstack:0.14
ports:
- 4566:4566
environment:
- SERVICES=s3,dynamodb
- DEBUG=1
- LS_LOG=trace
- DOCKER_HOST=unix:///var/run/docker.sock
- AWS_ACCESS_KEY_ID=ACCESSKEY
- AWS_SECRET_ACCESS_KEY=SECRETKEY
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ]

View File

@@ -49,11 +49,11 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
db.create_table("table2", data)
db["table2"].head()
db["table2"].head()
```
!!! info "Note"
Data is converted to Arrow before being written to disk. For maximum control over how data is saved, either provide the PyArrow schema to convert to or else provide a PyArrow Table directly.
```python
custom_schema = pa.schema([
pa.field("vector", pa.list_(pa.float32(), 2)),
@@ -66,7 +66,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
### From PyArrow Tables
You can also create LanceDB tables directly from pyarrow tables
```python
table = pa.Table.from_arrays(
[
@@ -87,15 +87,15 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
LanceDB supports to create Apache Arrow Schema from a Pydantic BaseModel via pydantic_to_schema() method.
```python
from lancedb.pydantic import vector, LanceModel
from lancedb.pydantic import Vector, LanceModel
class Content(LanceModel):
movie_id: int
vector: vector(128)
vector: Vector(128)
genres: str
title: str
imdb_id: int
@property
def imdb_url(self) -> str:
return f"https://www.imdb.com/title/tt{self.imdb_id}"
@@ -103,7 +103,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
import pyarrow as pa
db = lancedb.connect("~/.lancedb")
table_name = "movielens_small"
table = db.create_table(table_name, schema=Content.to_arrow_schema())
table = db.create_table(table_name, schema=Content)
```
### Using Iterators / Writing Large Datasets
@@ -113,7 +113,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
LanceDB additionally supports pyarrow's `RecordBatch` Iterators or other generators producing supported data types.
Here's an example using using `RecordBatch` iterator for creating tables.
```python
import pyarrow as pa
@@ -142,11 +142,11 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
## Creating Empty Table
You can also create empty tables in python. Initialize it with schema and later ingest data into it.
```python
import lancedb
import pyarrow as pa
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 2)),
@@ -168,8 +168,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
from lancedb.pydantic import LanceModel, vector
class Model(LanceModel):
vector: vector(2)
vector: Vector(2)
tbl = db.create_table("table5", schema=Model.to_arrow_schema())
```
@@ -249,7 +249,7 @@ After a table has been created, you can always add more data to it using
You can also add a large dataset batch in one go using Iterator of any supported data types.
### Adding to table using Iterator
```python
import pandas as pd
@@ -261,10 +261,10 @@ After a table has been created, you can always add more data to it using
"item": ["foo", "bar"],
"price": [10.0, 20.0],
})
tbl.add(make_batches())
```
The other arguments accepted:
| Name | Type | Description | Default |
@@ -274,7 +274,7 @@ After a table has been created, you can always add more data to it using
| on_bad_vectors | str | What to do if any of the vectors are not the same size or contains NaNs. One of "error", "drop", "fill". | drop |
| fill value | float | The value to use when filling vectors: Only used if on_bad_vectors="fill". | 0.0 |
=== "Javascript/Typescript"
```javascript
@@ -312,7 +312,7 @@ Use the `delete()` method on tables to delete rows from a table. To choose which
# x vector
# 0 1 [1.0, 2.0]
# 1 3 [5.0, 6.0]
```
```
### Delete from a list of values
@@ -325,7 +325,7 @@ Use the `delete()` method on tables to delete rows from a table. To choose which
# x vector
# 0 3 [5.0, 6.0]
```
=== "Javascript/Typescript"
```javascript

View File

@@ -249,11 +249,11 @@
}
],
"source": [
"from lancedb.pydantic import vector, LanceModel\n",
"from lancedb.pydantic import Vector, LanceModel\n",
"\n",
"class Content(LanceModel):\n",
" movie_id: int\n",
" vector: vector(128)\n",
" vector: Vector(128)\n",
" genres: str\n",
" title: str\n",
" imdb_id: int\n",
@@ -359,7 +359,7 @@
"import pandas as pd\n",
"\n",
"class PydanticSchema(LanceModel):\n",
" vector: vector(2)\n",
" vector: Vector(2)\n",
" item: str\n",
" price: float\n",
"\n",
@@ -394,10 +394,10 @@
"outputs": [],
"source": [
"import lancedb\n",
"from lancedb.pydantic import LanceModel, vector\n",
"from lancedb.pydantic import LanceModel, Vector\n",
"\n",
"class Model(LanceModel):\n",
" vector: vector(2)\n",
" vector: Vector(2)\n",
"\n",
"tbl = db.create_table(\"table6\", schema=Model.to_arrow_schema())"
]

View File

@@ -13,10 +13,10 @@ via [pydantic_to_schema()](python.md##lancedb.pydantic.pydantic_to_schema) metho
## Vector Field
LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a
LanceDB provides a [`Vector(dim)`](python.md#lancedb.pydantic.Vector) method to define a
vector Field in a Pydantic Model.
::: lancedb.pydantic.vector
::: lancedb.pydantic.Vector
## Type Conversion
@@ -33,4 +33,4 @@ Current supported type conversions:
| `str` | `pyarrow.utf8()` |
| `list` | `pyarrow.List` |
| `BaseModel` | `pyarrow.Struct` |
| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
| `Vector(n)` | `pyarrow.FixedSizeList(float32, n)` |

105
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.2.4",
"version": "0.2.5",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.2.4",
"version": "0.2.5",
"cpu": [
"x64",
"arm64"
@@ -31,6 +31,7 @@
"@types/node": "^18.16.2",
"@types/sinon": "^10.0.15",
"@types/temp": "^0.9.1",
"@types/uuid": "^9.0.3",
"@typescript-eslint/eslint-plugin": "^5.59.1",
"cargo-cp-artifact": "^0.1",
"chai": "^4.3.7",
@@ -48,14 +49,15 @@
"ts-node-dev": "^2.0.0",
"typedoc": "^0.24.7",
"typedoc-plugin-markdown": "^3.15.3",
"typescript": "*"
"typescript": "*",
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.2.4",
"@lancedb/vectordb-darwin-x64": "0.2.4",
"@lancedb/vectordb-linux-arm64-gnu": "0.2.4",
"@lancedb/vectordb-linux-x64-gnu": "0.2.4",
"@lancedb/vectordb-win32-x64-msvc": "0.2.4"
"@lancedb/vectordb-darwin-arm64": "0.2.5",
"@lancedb/vectordb-darwin-x64": "0.2.5",
"@lancedb/vectordb-linux-arm64-gnu": "0.2.5",
"@lancedb/vectordb-linux-x64-gnu": "0.2.5",
"@lancedb/vectordb-win32-x64-msvc": "0.2.5"
}
},
"node_modules/@apache-arrow/ts": {
@@ -315,9 +317,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.4.tgz",
"integrity": "sha512-MqiZXamHYEOfguPsHWLBQ56IabIN6Az8u2Hx8LCyXcxW9gcyJZMSAfJc+CcA4KYHKotv0KsVBhgxZ3kaZQQyiw==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.5.tgz",
"integrity": "sha512-V4206SajkMN3o+bBFBAYJq5emlrjevitP0g8RFfVlmj/LS38i8k4uvSe1bICQ2amUrYkL/Jw4ktYn19NRfTU+g==",
"cpu": [
"arm64"
],
@@ -327,9 +329,9 @@
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.4.tgz",
"integrity": "sha512-DzL+mw5WhKDwXdEFlPh8M9zSDhGnfks7NvEh6ZqKbU6znH206YB7g3OA4WfFyV579IIEQ8jd4v/XDthNzQKuSA==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.5.tgz",
"integrity": "sha512-orePizgXCbTJbDJ4bMMnYh/4OgmWDBbHShNxHKQobcX+NgWTexmR0lV1WNOG+DtczBiGH422e3gHJ+xhTO13vg==",
"cpu": [
"x64"
],
@@ -339,9 +341,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.4.tgz",
"integrity": "sha512-LP1nNfIpFxCgcCMlIQdseDX9dZU27TNhCL41xar8euqcetY5uKvi0YqhiVlpNO85Ss1FRQBgQ/GtnOM6Bo7oBQ==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.5.tgz",
"integrity": "sha512-xIMNwsFGOHeY9EUWCHhUAcA2sCHZ5Lim0sc42uuUOeWayyH+HeR6ZWReptDQRuAoJHqQeag9qcqteE0AZPDTEw==",
"cpu": [
"arm64"
],
@@ -351,9 +353,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.4.tgz",
"integrity": "sha512-m4RhOI5JJWPU9Ip2LlRIzXu4mwIv9M//OyAuTLiLKRm8726jQHhYi5VFUEtNzqY0o0p6pS0b3XbifYQ+cyJn3Q==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.5.tgz",
"integrity": "sha512-Qr8dbHavtE+Zfd45kEORJQe01kRWhMF703gk8zhtZhskDUBCfqm3ap22JIux58tASxVcBqY8EtUFojfYGnQVvA==",
"cpu": [
"x64"
],
@@ -363,9 +365,9 @@
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.4.tgz",
"integrity": "sha512-lMF/2e3YkKWnTYv0R7cUCfjMkAqepNaHSc/dvJzCNsFVEhfDsFdScQFLToARs5GGxnq4fOf+MKpaHg/W6QTxiA==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.5.tgz",
"integrity": "sha512-jTqkR9HRfbjxhUrlTfveNkJ78tlpVXeNn3BS4wBm4VIsPd75jminKBRYtrlQCWyHusqrUQedKny4hhG1CuNUkg==",
"cpu": [
"x64"
],
@@ -596,6 +598,12 @@
"@types/node": "*"
}
},
"node_modules/@types/uuid": {
"version": "9.0.3",
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.3.tgz",
"integrity": "sha512-taHQQH/3ZyI3zP8M/puluDEIEvtQHVYcC6y3N8ijFtAd28+Ey/G4sg1u2gB01S8MwybLOKAp9/yCMu/uR5l3Ug==",
"dev": true
},
"node_modules/@typescript-eslint/eslint-plugin": {
"version": "5.59.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.59.1.tgz",
@@ -4451,6 +4459,15 @@
"punycode": "^2.1.0"
}
},
"node_modules/uuid": {
"version": "9.0.0",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.0.tgz",
"integrity": "sha512-MXcSTerfPa4uqyzStbRoTgt5XIe3x5+42+q1sDuy3R5MDk66URdLMOZe5aPX/SQd+kuYAh0FdP/pO28IkQyTeg==",
"dev": true,
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/v8-compile-cache-lib": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
@@ -4852,33 +4869,33 @@
}
},
"@lancedb/vectordb-darwin-arm64": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.4.tgz",
"integrity": "sha512-MqiZXamHYEOfguPsHWLBQ56IabIN6Az8u2Hx8LCyXcxW9gcyJZMSAfJc+CcA4KYHKotv0KsVBhgxZ3kaZQQyiw==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.5.tgz",
"integrity": "sha512-V4206SajkMN3o+bBFBAYJq5emlrjevitP0g8RFfVlmj/LS38i8k4uvSe1bICQ2amUrYkL/Jw4ktYn19NRfTU+g==",
"optional": true
},
"@lancedb/vectordb-darwin-x64": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.4.tgz",
"integrity": "sha512-DzL+mw5WhKDwXdEFlPh8M9zSDhGnfks7NvEh6ZqKbU6znH206YB7g3OA4WfFyV579IIEQ8jd4v/XDthNzQKuSA==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.5.tgz",
"integrity": "sha512-orePizgXCbTJbDJ4bMMnYh/4OgmWDBbHShNxHKQobcX+NgWTexmR0lV1WNOG+DtczBiGH422e3gHJ+xhTO13vg==",
"optional": true
},
"@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.4.tgz",
"integrity": "sha512-LP1nNfIpFxCgcCMlIQdseDX9dZU27TNhCL41xar8euqcetY5uKvi0YqhiVlpNO85Ss1FRQBgQ/GtnOM6Bo7oBQ==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.5.tgz",
"integrity": "sha512-xIMNwsFGOHeY9EUWCHhUAcA2sCHZ5Lim0sc42uuUOeWayyH+HeR6ZWReptDQRuAoJHqQeag9qcqteE0AZPDTEw==",
"optional": true
},
"@lancedb/vectordb-linux-x64-gnu": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.4.tgz",
"integrity": "sha512-m4RhOI5JJWPU9Ip2LlRIzXu4mwIv9M//OyAuTLiLKRm8726jQHhYi5VFUEtNzqY0o0p6pS0b3XbifYQ+cyJn3Q==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.5.tgz",
"integrity": "sha512-Qr8dbHavtE+Zfd45kEORJQe01kRWhMF703gk8zhtZhskDUBCfqm3ap22JIux58tASxVcBqY8EtUFojfYGnQVvA==",
"optional": true
},
"@lancedb/vectordb-win32-x64-msvc": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.4.tgz",
"integrity": "sha512-lMF/2e3YkKWnTYv0R7cUCfjMkAqepNaHSc/dvJzCNsFVEhfDsFdScQFLToARs5GGxnq4fOf+MKpaHg/W6QTxiA==",
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.5.tgz",
"integrity": "sha512-jTqkR9HRfbjxhUrlTfveNkJ78tlpVXeNn3BS4wBm4VIsPd75jminKBRYtrlQCWyHusqrUQedKny4hhG1CuNUkg==",
"optional": true
},
"@neon-rs/cli": {
@@ -5093,6 +5110,12 @@
"@types/node": "*"
}
},
"@types/uuid": {
"version": "9.0.3",
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.3.tgz",
"integrity": "sha512-taHQQH/3ZyI3zP8M/puluDEIEvtQHVYcC6y3N8ijFtAd28+Ey/G4sg1u2gB01S8MwybLOKAp9/yCMu/uR5l3Ug==",
"dev": true
},
"@typescript-eslint/eslint-plugin": {
"version": "5.59.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.59.1.tgz",
@@ -7844,6 +7867,12 @@
"punycode": "^2.1.0"
}
},
"uuid": {
"version": "9.0.0",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.0.tgz",
"integrity": "sha512-MXcSTerfPa4uqyzStbRoTgt5XIe3x5+42+q1sDuy3R5MDk66URdLMOZe5aPX/SQd+kuYAh0FdP/pO28IkQyTeg==",
"dev": true
},
"v8-compile-cache-lib": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.2.4",
"version": "0.2.5",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -9,6 +9,7 @@
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release",
"test": "npm run tsc && mocha -recursive dist/test",
"integration-test": "npm run tsc && mocha -recursive dist/integration_test",
"lint": "eslint native.js src --ext .js,.ts",
"clean": "rm -rf node_modules *.node dist/",
"pack-build": "neon pack-build",
@@ -34,6 +35,7 @@
"@types/node": "^18.16.2",
"@types/sinon": "^10.0.15",
"@types/temp": "^0.9.1",
"@types/uuid": "^9.0.3",
"@typescript-eslint/eslint-plugin": "^5.59.1",
"cargo-cp-artifact": "^0.1",
"chai": "^4.3.7",
@@ -51,7 +53,8 @@
"ts-node-dev": "^2.0.0",
"typedoc": "^0.24.7",
"typedoc-plugin-markdown": "^3.15.3",
"typescript": "*"
"typescript": "*",
"uuid": "^9.0.0"
},
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
@@ -78,10 +81,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.2.4",
"@lancedb/vectordb-darwin-x64": "0.2.4",
"@lancedb/vectordb-linux-arm64-gnu": "0.2.4",
"@lancedb/vectordb-linux-x64-gnu": "0.2.4",
"@lancedb/vectordb-win32-x64-msvc": "0.2.4"
"@lancedb/vectordb-darwin-arm64": "0.2.5",
"@lancedb/vectordb-darwin-x64": "0.2.5",
"@lancedb/vectordb-linux-arm64-gnu": "0.2.5",
"@lancedb/vectordb-linux-x64-gnu": "0.2.5",
"@lancedb/vectordb-win32-x64-msvc": "0.2.5"
}
}

View File

@@ -0,0 +1,43 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { describe } from 'mocha'
import * as chai from 'chai'
import * as chaiAsPromised from 'chai-as-promised'
import { v4 as uuidv4 } from 'uuid'
import * as lancedb from '../index'
const assert = chai.assert
chai.use(chaiAsPromised)
describe('LanceDB AWS Integration test', function () {
it('s3+ddb schema is processed correctly', async function () {
this.timeout(5000)
// WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
// THE API WILL CHANGE
const conn = await lancedb.connect('s3://lancedb-integtest?engine=ddb&ddbTableName=lancedb-integtest')
const data = [{ vector: Array(128).fill(1.0) }]
const tableName = uuidv4()
let table = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })
const futs = [table.add(data), table.add(data), table.add(data), table.add(data), table.add(data)]
await Promise.allSettled(futs)
table = await conn.openTable(tableName)
assert.equal(await table.countRows(), 6)
})
})

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.2
current_version = 0.2.3
commit = True
message = [python] Bump version: {current_version} → {new_version}
tag = True

View File

@@ -46,7 +46,19 @@ class FixedSizeListMixin(ABC):
raise NotImplementedError
def vector(
def vector(dim: int, value_type: pa.DataType = pa.float32()):
# TODO: remove in future release
from warnings import warn
warn(
"lancedb.pydantic.vector() is deprecated, use lancedb.pydantic.Vector instead."
"This function will be removed in future release",
DeprecationWarning,
)
return Vector(dim, value_type)
def Vector(
dim: int, value_type: pa.DataType = pa.float32()
) -> Type[FixedSizeListMixin]:
"""Pydantic Vector Type.
@@ -65,12 +77,12 @@ def vector(
--------
>>> import pydantic
>>> from lancedb.pydantic import vector
>>> from lancedb.pydantic import Vector
...
>>> class MyModel(pydantic.BaseModel):
... id: int
... url: str
... embeddings: vector(768)
... embeddings: Vector(768)
>>> schema = pydantic_to_schema(MyModel)
>>> assert schema == pa.schema([
... pa.field("id", pa.int64(), False),
@@ -258,11 +270,11 @@ class LanceModel(pydantic.BaseModel):
Examples
--------
>>> import lancedb
>>> from lancedb.pydantic import LanceModel, vector
>>> from lancedb.pydantic import LanceModel, Vector
>>>
>>> class TestModel(LanceModel):
... name: str
... vector: vector(2)
... vector: Vector(2)
...
>>> db = lancedb.connect("/tmp")
>>> table = db.create_table("test", schema=TestModel.to_arrow_schema())

View File

@@ -102,7 +102,8 @@ def _to_record_batch_generator(
table = _sanitize_data(batch, schema, metadata, on_bad_vectors, fill_value)
for batch in table.to_batches():
yield batch
yield batch
else:
yield batch
class Table(ABC):

View File

@@ -1,8 +1,8 @@
[project]
name = "lancedb"
version = "0.2.2"
version = "0.2.3"
dependencies = [
"pylance==0.6.5",
"pylance==0.7.3",
"ratelimiter",
"retry",
"tqdm",

View File

@@ -17,7 +17,7 @@ import pyarrow as pa
import pytest
import lancedb
from lancedb.pydantic import LanceModel, vector
from lancedb.pydantic import LanceModel, Vector
def test_basic(tmp_path):
@@ -79,7 +79,7 @@ def test_ingest_pd(tmp_path):
def test_ingest_iterator(tmp_path):
class PydanticSchema(LanceModel):
vector: vector(2)
vector: Vector(2)
item: str
price: float
@@ -143,6 +143,7 @@ def test_ingest_iterator(tmp_path):
tbl_len = len(tbl)
tbl.add(make_batches())
assert tbl_len == 50
assert len(tbl) == tbl_len * 2
assert len(tbl.list_versions()) == 3
db.drop_database()

View File

@@ -19,8 +19,9 @@ from typing import List, Optional
import pyarrow as pa
import pydantic
import pytest
from pydantic import Field
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, pydantic_to_schema, vector
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, Vector, pydantic_to_schema
@pytest.mark.skipif(
@@ -107,7 +108,7 @@ def test_pydantic_to_arrow_py38():
def test_fixed_size_list_field():
class TestModel(pydantic.BaseModel):
vec: vector(16)
vec: Vector(16)
li: List[int]
data = TestModel(vec=list(range(16)), li=[1, 2, 3])
@@ -154,7 +155,7 @@ def test_fixed_size_list_field():
def test_fixed_size_list_validation():
class TestModel(pydantic.BaseModel):
vec: vector(8)
vec: Vector(8)
with pytest.raises(pydantic.ValidationError):
TestModel(vec=range(9))
@@ -167,9 +168,12 @@ def test_fixed_size_list_validation():
def test_lance_model():
class TestModel(LanceModel):
vec: vector(16)
li: List[int]
vector: Vector(16) = Field(default=[0.0] * 16)
li: List[int] = Field(default=[1, 2, 3])
schema = pydantic_to_schema(TestModel)
assert schema == TestModel.to_arrow_schema()
assert TestModel.field_names() == ["vec", "li"]
assert TestModel.field_names() == ["vector", "li"]
t = TestModel()
assert t == TestModel(vec=[0.0] * 16, li=[1, 2, 3])

View File

@@ -20,7 +20,7 @@ import pyarrow as pa
import pytest
from lancedb.db import LanceDBConnection
from lancedb.pydantic import LanceModel, vector
from lancedb.pydantic import LanceModel, Vector
from lancedb.query import LanceVectorQueryBuilder, Query
from lancedb.table import LanceTable
@@ -67,7 +67,7 @@ def table(tmp_path) -> MockTable:
def test_cast(table):
class TestModel(LanceModel):
vector: vector(2)
vector: Vector(2)
id: int
str_field: str
float_field: float

View File

@@ -24,7 +24,7 @@ import pytest
from lancedb.conftest import MockEmbeddingFunction
from lancedb.db import LanceDBConnection
from lancedb.pydantic import LanceModel, vector
from lancedb.pydantic import LanceModel, Vector
from lancedb.table import LanceTable
@@ -140,7 +140,7 @@ def test_add(db):
def test_add_pydantic_model(db):
class TestModel(LanceModel):
vector: vector(16)
vector: Vector(16)
li: List[int]
data = TestModel(vector=list(range(16)), li=[1, 2, 3])
@@ -354,7 +354,7 @@ def test_update(db):
def test_create_with_embedding_function(db):
class MyTable(LanceModel):
text: str
vector: vector(10)
vector: Vector(10)
func = MockEmbeddingFunction(source_column="text", vector_column="vector")
texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
@@ -379,7 +379,7 @@ def test_create_with_embedding_function(db):
def test_add_with_embedding_function(db):
class MyTable(LanceModel):
text: str
vector: vector(10)
vector: Vector(10)
func = MockEmbeddingFunction(source_column="text", vector_column="vector")
table = LanceTable.create(
@@ -407,8 +407,8 @@ def test_add_with_embedding_function(db):
def test_multiple_vector_columns(db):
class MyTable(LanceModel):
text: str
vector1: vector(10)
vector2: vector(10)
vector1: Vector(10)
vector2: Vector(10)
table = LanceTable.create(
db,

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb-node"
version = "0.2.4"
version = "0.2.5"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
edition = "2018"

View File

@@ -28,7 +28,9 @@ fn validate_vector_column(record_batch: &RecordBatch) -> Result<()> {
record_batch
.column_by_name(VECTOR_COLUMN_NAME)
.map(|_| ())
.context(MissingColumnSnafu { name: VECTOR_COLUMN_NAME })
.context(MissingColumnSnafu {
name: VECTOR_COLUMN_NAME,
})
}
pub(crate) fn arrow_buffer_to_record_batch(slice: &[u8]) -> Result<(Vec<RecordBatch>, SchemaRef)> {

View File

@@ -183,11 +183,9 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
let aws_region = get_aws_region(&mut cx, 4)?;
let params = ReadParams {
store_options: Some(ObjectStoreParams {
aws_credentials: aws_creds,
aws_region,
..ObjectStoreParams::default()
}),
store_options: Some(ObjectStoreParams::with_aws_credentials(
aws_creds, aws_region,
)),
..ReadParams::default()
};

View File

@@ -43,7 +43,8 @@ impl JsTable {
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
let buffer = cx.argument::<JsBuffer>(1)?;
let (batches, schema) = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
let (batches, schema) =
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
// Write mode
let mode = match cx.argument::<JsString>(2)?.value(&mut cx).as_str() {
@@ -65,11 +66,9 @@ impl JsTable {
let aws_region = get_aws_region(&mut cx, 6)?;
let params = WriteParams {
store_params: Some(ObjectStoreParams {
aws_credentials: aws_creds,
aws_region,
..ObjectStoreParams::default()
}),
store_params: Some(ObjectStoreParams::with_aws_credentials(
aws_creds, aws_region,
)),
mode: mode,
..WriteParams::default()
};
@@ -92,7 +91,8 @@ impl JsTable {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let buffer = cx.argument::<JsBuffer>(0)?;
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
let (batches, schema) = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
let (batches, schema) =
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let mut table = js_table.table.clone();
@@ -108,11 +108,9 @@ impl JsTable {
let aws_region = get_aws_region(&mut cx, 5)?;
let params = WriteParams {
store_params: Some(ObjectStoreParams {
aws_credentials: aws_creds,
aws_region,
..ObjectStoreParams::default()
}),
store_params: Some(ObjectStoreParams::with_aws_credentials(
aws_creds, aws_region,
)),
mode: write_mode,
..WriteParams::default()
};

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb"
version = "0.2.4"
version = "0.2.5"
edition = "2021"
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
@@ -10,14 +10,20 @@ categories = ["database-implementations"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
arrow = { workspace = true }
arrow-array = { workspace = true }
arrow-data = { workspace = true }
arrow-schema = { workspace = true }
arrow-ord = { workspace = true }
arrow-cast = { workspace = true }
object_store = { workspace = true }
snafu = { workspace = true }
half = { workspace = true }
lance = { workspace = true }
tokio = { version = "1.23", features = ["rt-multi-thread"] }
log = { workspace = true }
num-traits = "0"
url = { workspace = true }
[dev-dependencies]
tempfile = "3.5.0"

View File

@@ -0,0 +1,15 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub use lance::arrow::*;

18
rust/vectordb/src/data.rs Normal file
View File

@@ -0,0 +1,18 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Data types, schema coercion, and data cleaning and etc.
pub mod inspect;
pub mod sanitize;

View File

@@ -0,0 +1,180 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use arrow::compute::kernels::{aggregate::bool_and, length::length};
use arrow_array::{
cast::AsArray,
types::{ArrowPrimitiveType, Int32Type, Int64Type},
Array, GenericListArray, OffsetSizeTrait, RecordBatchReader,
};
use arrow_ord::comparison::eq_dyn_scalar;
use arrow_schema::DataType;
use num_traits::{ToPrimitive, Zero};
use crate::error::{Error, Result};
pub(crate) fn infer_dimension<T: ArrowPrimitiveType>(
list_arr: &GenericListArray<T::Native>,
) -> Result<Option<T::Native>>
where
T::Native: OffsetSizeTrait + ToPrimitive,
{
let len_arr = length(list_arr)?;
if len_arr.is_empty() {
return Ok(Some(Zero::zero()));
}
let dim = len_arr.as_primitive::<T>().value(0);
if bool_and(&eq_dyn_scalar(len_arr.as_primitive::<T>(), dim)?) != Some(true) {
Ok(None)
} else {
Ok(Some(dim))
}
}
/// Infer the vector columns from a dataset.
///
/// Parameters
/// ----------
/// - reader: RecordBatchReader
/// - strict: if set true, only fixed_size_list<float> is considered as vector column. If set to false,
/// a list<float> column with same length is also considered as vector column.
pub fn infer_vector_columns(
reader: impl RecordBatchReader + Send,
strict: bool,
) -> Result<Vec<String>> {
let mut columns = vec![];
let mut columns_to_infer: HashMap<String, Option<i64>> = HashMap::new();
for field in reader.schema().fields() {
match field.data_type() {
DataType::FixedSizeList(sub_field, _) if sub_field.data_type().is_floating() => {
columns.push(field.name().to_string());
}
DataType::List(sub_field) if sub_field.data_type().is_floating() && !strict => {
columns_to_infer.insert(field.name().to_string(), None);
}
DataType::LargeList(sub_field) if sub_field.data_type().is_floating() && !strict => {
columns_to_infer.insert(field.name().to_string(), None);
}
_ => {}
}
}
for batch in reader {
let batch = batch?;
let col_names = columns_to_infer.keys().cloned().collect::<Vec<_>>();
for col_name in col_names {
let col = batch.column_by_name(&col_name).ok_or(Error::Schema {
message: format!("Column {} not found", col_name),
})?;
if let Some(dim) = match *col.data_type() {
DataType::List(_) => {
infer_dimension::<Int32Type>(col.as_list::<i32>())?.map(|d| d as i64)
}
DataType::LargeList(_) => infer_dimension::<Int64Type>(col.as_list::<i64>())?,
_ => {
return Err(Error::Schema {
message: format!("Column {} is not a list", col_name),
})
}
} {
if let Some(Some(prev_dim)) = columns_to_infer.get(&col_name) {
if prev_dim != &dim {
columns_to_infer.remove(&col_name);
}
} else {
columns_to_infer.insert(col_name, Some(dim));
}
} else {
columns_to_infer.remove(&col_name);
}
}
}
columns.extend(columns_to_infer.keys().cloned());
Ok(columns)
}
#[cfg(test)]
mod tests {
use super::*;
use arrow_array::{
types::{Float32Type, Float64Type},
FixedSizeListArray, Float32Array, ListArray, RecordBatch, RecordBatchIterator, StringArray,
};
use arrow_schema::{DataType, Field, Schema};
use std::{sync::Arc, vec};
#[test]
fn test_infer_vector_columns() {
let schema = Arc::new(Schema::new(vec![
Field::new("f", DataType::Float32, false),
Field::new("s", DataType::Utf8, false),
Field::new(
"l1",
DataType::List(Arc::new(Field::new("item", DataType::Float32, true))),
false,
),
Field::new(
"l2",
DataType::List(Arc::new(Field::new("item", DataType::Float64, true))),
false,
),
Field::new(
"fl",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 32),
true,
),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Float32Array::from(vec![1.0, 2.0, 3.0])),
Arc::new(StringArray::from(vec!["a", "b", "c"])),
Arc::new(ListArray::from_iter_primitive::<Float32Type, _, _>(
(0..3).map(|_| Some(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0)])),
)),
// Var-length list
Arc::new(ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
Some(vec![Some(1.0_f64)]),
Some(vec![Some(2.0_f64), Some(3.0_f64)]),
Some(vec![Some(4.0_f64), Some(5.0_f64), Some(6.0_f64)]),
])),
Arc::new(
FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(
vec![
Some(vec![Some(1.0); 32]),
Some(vec![Some(2.0); 32]),
Some(vec![Some(3.0); 32]),
],
32,
),
),
],
)
.unwrap();
let reader =
RecordBatchIterator::new(vec![batch.clone()].into_iter().map(Ok), schema.clone());
let cols = infer_vector_columns(reader, false).unwrap();
assert_eq!(cols, vec!["fl", "l1"]);
let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
let cols = infer_vector_columns(reader, true).unwrap();
assert_eq!(cols, vec!["fl"]);
}
}

View File

@@ -0,0 +1,284 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::{iter::repeat_with, sync::Arc};
use arrow_array::{
cast::AsArray,
types::{Float16Type, Float32Type, Float64Type, Int32Type, Int64Type},
Array, ArrowNumericType, FixedSizeListArray, PrimitiveArray, RecordBatch, RecordBatchIterator,
RecordBatchReader,
};
use arrow_cast::{can_cast_types, cast};
use arrow_schema::{ArrowError, DataType, Field, Schema};
use half::f16;
use lance::arrow::{DataTypeExt, FixedSizeListArrayExt};
use log::warn;
use num_traits::cast::AsPrimitive;
use super::inspect::infer_dimension;
use crate::error::Result;
fn cast_array<I: ArrowNumericType, O: ArrowNumericType>(
arr: &PrimitiveArray<I>,
) -> Arc<PrimitiveArray<O>>
where
I::Native: AsPrimitive<O::Native>,
{
Arc::new(PrimitiveArray::<O>::from_iter_values(
arr.values().iter().map(|v| (*v).as_()),
))
}
fn cast_float_array<I: ArrowNumericType>(
arr: &PrimitiveArray<I>,
dt: &DataType,
) -> std::result::Result<Arc<dyn Array>, ArrowError>
where
I::Native: AsPrimitive<f64> + AsPrimitive<f32> + AsPrimitive<f16>,
{
match dt {
DataType::Float16 => Ok(cast_array::<I, Float16Type>(arr)),
DataType::Float32 => Ok(cast_array::<I, Float32Type>(arr)),
DataType::Float64 => Ok(cast_array::<I, Float64Type>(arr)),
_ => Err(ArrowError::SchemaError(format!(
"Incompatible change field: unable to coerce {:?} to {:?}",
arr.data_type(),
dt
))),
}
}
fn coerce_array(
array: &Arc<dyn Array>,
field: &Field,
) -> std::result::Result<Arc<dyn Array>, ArrowError> {
if array.data_type() == field.data_type() {
return Ok(array.clone());
}
match (array.data_type(), field.data_type()) {
// Normal cast-able types.
(adt, dt) if can_cast_types(adt, dt) => cast(&array, dt),
// Casting between f16/f32/f64 can be lossy.
(adt, dt) if (adt.is_floating() || dt.is_floating()) => {
if adt.byte_width() > dt.byte_width() {
warn!(
"Coercing field {} {:?} to {:?} might lose precision",
field.name(),
adt,
dt
);
}
match adt {
DataType::Float16 => cast_float_array(array.as_primitive::<Float16Type>(), dt),
DataType::Float32 => cast_float_array(array.as_primitive::<Float32Type>(), dt),
DataType::Float64 => cast_float_array(array.as_primitive::<Float64Type>(), dt),
_ => unreachable!(),
}
}
(adt, DataType::FixedSizeList(exp_field, exp_dim)) => match adt {
// Cast a float fixed size array with same dimension to the expected type.
DataType::FixedSizeList(_, dim) if dim == exp_dim => {
let actual_sub = array.as_fixed_size_list();
let values = coerce_array(actual_sub.values(), exp_field)?;
Ok(Arc::new(FixedSizeListArray::try_new_from_values(
values.clone(),
*dim,
)?) as Arc<dyn Array>)
}
DataType::List(_) | DataType::LargeList(_) => {
let Some(dim) = (match adt {
DataType::List(_) => infer_dimension::<Int32Type>(array.as_list::<i32>())
.map_err(|e| {
ArrowError::SchemaError(format!(
"failed to infer dimension from list: {}",
e
))
})?
.map(|d| d as i64),
DataType::LargeList(_) => infer_dimension::<Int64Type>(array.as_list::<i64>())
.map_err(|e| {
ArrowError::SchemaError(format!(
"failed to infer dimension from large list: {}",
e
))
})?,
_ => unreachable!(),
}) else {
return Err(ArrowError::SchemaError(format!(
"Incompatible coerce fixed size list: unable to coerce {:?} from {:?}",
field,
array.data_type()
)));
};
if dim != *exp_dim as i64 {
return Err(ArrowError::SchemaError(format!(
"Incompatible coerce fixed size list: expected dimension {} but got {}",
exp_dim, dim
)));
}
let values = coerce_array(array, exp_field)?;
Ok(Arc::new(FixedSizeListArray::try_new_from_values(
values.clone(),
*exp_dim,
)?) as Arc<dyn Array>)
}
_ => Err(ArrowError::SchemaError(format!(
"Incompatible coerce fixed size list: unable to coerce {:?} from {:?}",
field,
array.data_type()
)))?,
},
_ => Err(ArrowError::SchemaError(format!(
"Incompatible change field {}: unable to coerce {:?} to {:?}",
field.name(),
array.data_type(),
field.data_type()
)))?,
}
}
fn coerce_schema_batch(
batch: RecordBatch,
schema: Arc<Schema>,
) -> std::result::Result<RecordBatch, ArrowError> {
if batch.schema() == schema {
return Ok(batch);
}
let columns = schema
.fields()
.iter()
.map(|field| {
batch
.column_by_name(field.name())
.ok_or_else(|| {
ArrowError::SchemaError(format!("Column {} not found", field.name()))
})
.and_then(|c| coerce_array(c, field))
})
.collect::<std::result::Result<Vec<_>, ArrowError>>()?;
RecordBatch::try_new(schema, columns)
}
/// Coerce the reader (input data) to match the given [Schema].
///
pub fn coerce_schema(
reader: impl RecordBatchReader + Send + 'static,
schema: Arc<Schema>,
) -> Result<Box<dyn RecordBatchReader + Send>> {
if reader.schema() == schema {
return Ok(Box::new(RecordBatchIterator::new(reader, schema)));
}
let s = schema.clone();
let batches = reader
.zip(repeat_with(move || s.clone()))
.map(|(batch, s)| coerce_schema_batch(batch?, s));
Ok(Box::new(RecordBatchIterator::new(batches, schema)))
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Arc;
use arrow_array::{
FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int32Array, Int8Array,
RecordBatch, RecordBatchIterator, StringArray,
};
use arrow_schema::Field;
use half::f16;
use lance::arrow::FixedSizeListArrayExt;
#[test]
fn test_coerce_list_to_fixed_size_list() {
let schema = Arc::new(Schema::new(vec![
Field::new(
"fl",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 64),
true,
),
Field::new("s", DataType::Utf8, true),
Field::new("f", DataType::Float16, true),
Field::new("i", DataType::Int32, true),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(
FixedSizeListArray::try_new_from_values(
Float32Array::from_iter_values((0..256).map(|v| v as f32)),
64,
)
.unwrap(),
),
Arc::new(StringArray::from(vec![
Some("hello"),
Some("world"),
Some("from"),
Some("lance"),
])),
Arc::new(Float16Array::from_iter_values(
(0..4).map(|v| f16::from_f32(v as f32)),
)),
Arc::new(Int32Array::from_iter_values(0..4)),
],
)
.unwrap();
let reader =
RecordBatchIterator::new(vec![batch.clone()].into_iter().map(Ok), schema.clone());
let expected_schema = Arc::new(Schema::new(vec![
Field::new(
"fl",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float16, true)), 64),
true,
),
Field::new("s", DataType::Utf8, true),
Field::new("f", DataType::Float64, true),
Field::new("i", DataType::Int8, true),
]));
let stream = coerce_schema(reader, expected_schema.clone()).unwrap();
let batches = stream.collect::<Vec<_>>();
assert_eq!(batches.len(), 1);
let batch = batches[0].as_ref().unwrap();
assert_eq!(batch.schema(), expected_schema);
let expected = RecordBatch::try_new(
expected_schema,
vec![
Arc::new(
FixedSizeListArray::try_new_from_values(
Float16Array::from_iter_values((0..256).map(|v| f16::from_f32(v as f32))),
64,
)
.unwrap(),
),
Arc::new(StringArray::from(vec![
Some("hello"),
Some("world"),
Some("from"),
Some("lance"),
])),
Arc::new(Float64Array::from_iter_values((0..4).map(|v| v as f64))),
Arc::new(Int8Array::from_iter_values(0..4)),
],
)
.unwrap();
assert_eq!(batch, &expected);
}
}

View File

@@ -20,19 +20,32 @@ use lance::dataset::WriteParams;
use lance::io::object_store::ObjectStore;
use snafu::prelude::*;
use crate::error::{CreateDirSnafu, InvalidTableNameSnafu, Result};
use crate::error::{CreateDirSnafu, Error, InvalidTableNameSnafu, Result};
use crate::table::{ReadParams, Table};
pub const LANCE_FILE_EXTENSION: &str = "lance";
pub struct Database {
object_store: ObjectStore,
query_string: Option<String>,
pub(crate) uri: String,
pub(crate) base_path: object_store::path::Path,
}
const LANCE_EXTENSION: &str = "lance";
const ENGINE: &str = "engine";
/// Parse a url, if it's not a valid url, assume it's a local file
/// and try to parse with file:// appended
fn parse_url(url: &str) -> Result<url::Url> {
match url::Url::parse(url) {
Ok(url) => Ok(url),
Err(_) => url::Url::parse(format!("file://{}", url).as_str()).map_err(|e| Error::Lance {
message: format!("Failed to parse uri: {}", e),
}),
}
}
/// A connection to LanceDB
impl Database {
@@ -46,12 +59,71 @@ impl Database {
///
/// * A [Database] object.
pub async fn connect(uri: &str) -> Result<Database> {
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
if object_store.is_local() {
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
// For a native (using lance directly) connection
// The DB doesn't use any uri parameters, but lance does
// So we need to parse the uri, extract the query string, and progate it to lance
let mut url = parse_url(uri)?;
// special handling for windows
if url.scheme().len() == 1 && cfg!(windows) {
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
if object_store.is_local() {
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
}
return Ok(Database {
uri: uri.to_string(),
query_string: None,
base_path,
object_store,
});
}
// iter thru the query params and extract the commit store param
let mut engine = None;
let mut filtered_querys = vec![];
// WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
// THE API WILL CHANGE
for (key, value) in url.query_pairs() {
if key == ENGINE {
engine = Some(value.to_string());
} else {
// to owned so we can modify the url
filtered_querys.push((key.to_string(), value.to_string()));
}
}
// Filter out the commit store query param -- it's a lancedb param
url.query_pairs_mut().clear();
url.query_pairs_mut().extend_pairs(filtered_querys);
// Take a copy of the query string so we can propagate it to lance
let query_string = url.query().map(|s| s.to_string());
// clear the query string so we can use the url as the base uri
// use .set_query(None) instead of .set_query("") because the latter
// will add a trailing '?' to the url
url.set_query(None);
let table_base_uri = if let Some(store) = engine {
static WARN_ONCE: std::sync::Once = std::sync::Once::new();
WARN_ONCE.call_once(|| {
log::warn!("Specifing engine is not a publicly supported feature in lancedb yet. THE API WILL CHANGE");
});
let old_scheme = url.scheme().to_string();
let new_scheme = format!("{}+{}", old_scheme, store);
url.to_string().replacen(&old_scheme, &new_scheme, 1)
} else {
url.to_string()
};
let plain_uri = url.to_string();
let (object_store, base_path) = ObjectStore::from_uri(&plain_uri).await?;
if object_store.is_local() {
Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
}
Ok(Database {
uri: uri.to_string(),
uri: table_base_uri,
query_string,
base_path,
object_store,
})
@@ -149,11 +221,19 @@ impl Database {
let path = Path::new(&self.uri);
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let uri = table_uri
let mut uri = table_uri
.as_path()
.to_str()
.context(InvalidTableNameSnafu { name })?;
Ok(uri.to_string())
.context(InvalidTableNameSnafu { name })?
.to_string();
// If there are query string set on the connection, propagate to lance
if let Some(query) = self.query_string.as_ref() {
uri.push('?');
uri.push_str(query.as_str());
}
Ok(uri)
}
}
@@ -170,7 +250,15 @@ mod tests {
let uri = tmp_dir.path().to_str().unwrap();
let db = Database::connect(uri).await.unwrap();
assert_eq!(db.uri, uri);
// file:// scheme should be automatically appended if not specified
// windows path come with drive letter, so file:// won't be appended
let expected = if cfg!(windows) {
uri.to_string()
} else {
format!("file://{}", uri)
};
assert_eq!(db.uri, expected);
}
#[tokio::test]

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow_schema::ArrowError;
use snafu::Snafu;
#[derive(Debug, Snafu)]
@@ -32,10 +33,20 @@ pub enum Error {
Store { message: String },
#[snafu(display("LanceDBError: {message}"))]
Lance { message: String },
#[snafu(display("LanceDB Schema Error: {message}"))]
Schema { message: String },
}
pub type Result<T> = std::result::Result<T, Error>;
impl From<ArrowError> for Error {
fn from(e: ArrowError) -> Self {
Self::Lance {
message: e.to_string(),
}
}
}
impl From<lance::Error> for Error {
fn from(e: lance::Error) -> Self {
Self::Lance {

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod data;
pub mod database;
pub mod error;
pub mod index;