mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 13:59:58 +00:00
Compare commits
1 Commits
python-v0.
...
reproducib
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9441fde2bb |
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.2.5
|
||||
current_version = 0.2.4
|
||||
commit = True
|
||||
message = Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
53
.github/workflows/node.yml
vendored
53
.github/workflows/node.yml
vendored
@@ -107,56 +107,3 @@ jobs:
|
||||
- name: Test
|
||||
run: |
|
||||
npm run test
|
||||
aws-integtest:
|
||||
timeout-minutes: 45
|
||||
runs-on: "ubuntu-22.04"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
working-directory: node
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ACCESSKEY
|
||||
AWS_SECRET_ACCESS_KEY: SECRETKEY
|
||||
AWS_DEFAULT_REGION: us-west-2
|
||||
# this one is for s3
|
||||
AWS_ENDPOINT: http://localhost:4566
|
||||
# this one is for dynamodb
|
||||
DYNAMODB_ENDPOINT: http://localhost:4566
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: 18
|
||||
cache: 'npm'
|
||||
cache-dependency-path: node/package-lock.json
|
||||
- name: start local stack
|
||||
run: docker compose -f ../docker-compose.yml up -d
|
||||
- name: create s3
|
||||
run: aws s3 mb s3://lancedb-integtest --endpoint $AWS_ENDPOINT
|
||||
- name: create ddb
|
||||
run: |
|
||||
aws dynamodb create-table \
|
||||
--table-name lancedb-integtest \
|
||||
--attribute-definitions '[{"AttributeName": "base_uri", "AttributeType": "S"}, {"AttributeName": "version", "AttributeType": "N"}]' \
|
||||
--key-schema '[{"AttributeName": "base_uri", "KeyType": "HASH"}, {"AttributeName": "version", "KeyType": "RANGE"}]' \
|
||||
--provisioned-throughput '{"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}' \
|
||||
--endpoint-url $DYNAMODB_ENDPOINT
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
- name: Build
|
||||
run: |
|
||||
npm ci
|
||||
npm run tsc
|
||||
npm run build
|
||||
npm run pack-build
|
||||
npm install --no-save ./dist/lancedb-vectordb-*.tgz
|
||||
# Remove index.node to test with dependency installed
|
||||
rm index.node
|
||||
- name: Test
|
||||
run: npm run integration-test
|
||||
|
||||
23
Cargo.toml
23
Cargo.toml
@@ -1,25 +1,16 @@
|
||||
[workspace]
|
||||
members = ["rust/ffi/node", "rust/vectordb"]
|
||||
# Python package needs to be built by maturin.
|
||||
exclude = ["python"]
|
||||
members = [
|
||||
"rust/vectordb",
|
||||
"rust/ffi/node"
|
||||
]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.7.4", "features" = ["dynamodb"] }
|
||||
lance-linalg = { "version" = "=0.7.4" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "43.0.0", optional = false }
|
||||
lance = "=0.6.5"
|
||||
arrow-array = "43.0"
|
||||
arrow-data = "43.0"
|
||||
arrow-ipc = "43.0"
|
||||
arrow-ord = "43.0"
|
||||
arrow-schema = "43.0"
|
||||
arrow-arith = "43.0"
|
||||
arrow-cast = "43.0"
|
||||
half = { "version" = "=2.2.1", default-features = false, features = [
|
||||
"num-traits"
|
||||
] }
|
||||
log = "0.4"
|
||||
arrow-ipc = "43.0"
|
||||
half = { "version" = "=2.2.1", default-features = false }
|
||||
object_store = "0.6.1"
|
||||
snafu = "0.7.4"
|
||||
url = "2"
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
version: "3.9"
|
||||
services:
|
||||
localstack:
|
||||
image: localstack/localstack:0.14
|
||||
ports:
|
||||
- 4566:4566
|
||||
environment:
|
||||
- SERVICES=s3,dynamodb
|
||||
- DEBUG=1
|
||||
- LS_LOG=trace
|
||||
- DOCKER_HOST=unix:///var/run/docker.sock
|
||||
- AWS_ACCESS_KEY_ID=ACCESSKEY
|
||||
- AWS_SECRET_ACCESS_KEY=SECRETKEY
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ]
|
||||
@@ -49,11 +49,11 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
db.create_table("table2", data)
|
||||
|
||||
db["table2"].head()
|
||||
db["table2"].head()
|
||||
```
|
||||
!!! info "Note"
|
||||
Data is converted to Arrow before being written to disk. For maximum control over how data is saved, either provide the PyArrow schema to convert to or else provide a PyArrow Table directly.
|
||||
|
||||
|
||||
```python
|
||||
custom_schema = pa.schema([
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
@@ -66,7 +66,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
### From PyArrow Tables
|
||||
You can also create LanceDB tables directly from pyarrow tables
|
||||
|
||||
|
||||
```python
|
||||
table = pa.Table.from_arrays(
|
||||
[
|
||||
@@ -87,15 +87,15 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
LanceDB supports creating an Apache Arrow schema from a Pydantic BaseModel via the pydantic_to_schema() method.
|
||||
|
||||
```python
|
||||
from lancedb.pydantic import Vector, LanceModel
|
||||
from lancedb.pydantic import vector, LanceModel
|
||||
|
||||
class Content(LanceModel):
|
||||
movie_id: int
|
||||
vector: Vector(128)
|
||||
vector: vector(128)
|
||||
genres: str
|
||||
title: str
|
||||
imdb_id: int
|
||||
|
||||
|
||||
@property
|
||||
def imdb_url(self) -> str:
|
||||
return f"https://www.imdb.com/title/tt{self.imdb_id}"
|
||||
@@ -103,7 +103,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
import pyarrow as pa
|
||||
db = lancedb.connect("~/.lancedb")
|
||||
table_name = "movielens_small"
|
||||
table = db.create_table(table_name, schema=Content)
|
||||
table = db.create_table(table_name, schema=Content.to_arrow_schema())
|
||||
```
|
||||
|
||||
### Using Iterators / Writing Large Datasets
|
||||
@@ -113,7 +113,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
LanceDB additionally supports pyarrow's `RecordBatch` Iterators or other generators producing supported data types.
|
||||
|
||||
Here's an example using a `RecordBatch` iterator for creating tables.
|
||||
|
||||
|
||||
```python
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -142,11 +142,11 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
## Creating Empty Table
|
||||
You can also create empty tables in python. Initialize it with schema and later ingest data into it.
|
||||
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
import pyarrow as pa
|
||||
|
||||
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
@@ -168,8 +168,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
|
||||
class Model(LanceModel):
|
||||
vector: Vector(2)
|
||||
|
||||
vector: vector(2)
|
||||
|
||||
tbl = db.create_table("table5", schema=Model.to_arrow_schema())
|
||||
```
|
||||
|
||||
@@ -249,7 +249,7 @@ After a table has been created, you can always add more data to it using
|
||||
You can also add a large dataset batch in one go using an Iterator of any supported data type.
|
||||
|
||||
### Adding to table using Iterator
|
||||
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
@@ -261,10 +261,10 @@ After a table has been created, you can always add more data to it using
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0],
|
||||
})
|
||||
|
||||
|
||||
tbl.add(make_batches())
|
||||
```
|
||||
|
||||
|
||||
The other arguments accepted:
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
@@ -274,7 +274,7 @@ After a table has been created, you can always add more data to it using
|
||||
| on_bad_vectors | str | What to do if any of the vectors are not the same size or contain NaNs. One of "error", "drop", "fill". | drop |
|
||||
| fill value | float | The value to use when filling vectors: Only used if on_bad_vectors="fill". | 0.0 |
|
||||
|
||||
|
||||
|
||||
=== "Javascript/Typescript"
|
||||
|
||||
```javascript
|
||||
@@ -312,7 +312,7 @@ Use the `delete()` method on tables to delete rows from a table. To choose which
|
||||
# x vector
|
||||
# 0 1 [1.0, 2.0]
|
||||
# 1 3 [5.0, 6.0]
|
||||
```
|
||||
```
|
||||
|
||||
### Delete from a list of values
|
||||
|
||||
@@ -325,7 +325,7 @@ Use the `delete()` method on tables to delete rows from a table. To choose which
|
||||
# x vector
|
||||
# 0 3 [5.0, 6.0]
|
||||
```
|
||||
|
||||
|
||||
=== "Javascript/Typescript"
|
||||
|
||||
```javascript
|
||||
|
||||
1167
docs/src/notebooks/reproducibility.ipynb
Normal file
1167
docs/src/notebooks/reproducibility.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
62
docs/src/notebooks/rick_and_morty_quotes.csv
Normal file
62
docs/src/notebooks/rick_and_morty_quotes.csv
Normal file
@@ -0,0 +1,62 @@
|
||||
id,quote,author
|
||||
1,"Nobody exists on purpose. Nobody belongs anywhere.",Morty
|
||||
2,"We're all going to die. Come watch TV.",Morty
|
||||
3,"Losers look stuff up while the rest of us are carpin' all them diems.",Summer
|
||||
4,"He's not a hot girl. He can't just bail on his life and set up shop in someone else's.",Beth
|
||||
5,"When you are an a—hole, it doesn't matter how right you are. Nobody wants to give you the satisfaction.",Morty
|
||||
6,"God's turning people into insect monsters, Beth. I'm the one beating them to death. Thank me.",Jerry
|
||||
7,"Camping is just being homeless without the change.",Summer
|
||||
8,"This seems like a good time for a drink and a cold, calculated speech with sinister overtones. A speech about politics, about order, brotherhood, power ... but speeches are for campaigning. Now is the time for action.",Morty
|
||||
9,"Having a family doesn't mean that you stop being an individual. You know the best thing you can do for the people that depend on you? Be honest with them, even if it means setting them free.",Mr. Meeseeks
|
||||
10,"If I've learned one thing, it's that before you get anywhere in life, you gotta stop listening to yourself.",Jerry
|
||||
11,"I just want to go back to Hell, where everyone thinks I'm smart and funny.",Mr. Needful
|
||||
12,"Hi Mr. Jellybean, I'm Morty. I’m on an adventure with my grandpa.",Morty
|
||||
13,"You're not the cause of your parents' misery. You're just a symptom of it.",Summer
|
||||
14,"Don't deify the people who leave you.",Beth
|
||||
15,"Well, then get your s—t together, get it all together, and put it in a backpack, all your s—t, so it's together. And if you gotta take it somewhere, take it somewhere, you know, take it to the s—t store and sell it, or put it in the s—t museum. I don't care what you do, you just gotta get it together. Get your s—t together.",Morty
|
||||
16,"At least the devil has a job!",Summer
|
||||
17,"Life is effort and I'll stop when I die!",Jerry
|
||||
18,"I just killed my family! I don't care what they were!",Morty
|
||||
19,"It's funny to say they are small. It's funny to say they are big.",Shrimply Pibbles
|
||||
20,"You're holding me verbally hostage.",Summer
|
||||
21,"Honey, stop raising your father's cholesterol so you can take a hot funeral selfie.",Beth
|
||||
22,"Rick, when you say you made an exact replica of the house, did you mean, like, an exact replica?",Morty
|
||||
23,"Give a gun to the lady who got pregnant with me too early and constantly makes it our problem.",Summer
|
||||
24,"Say goodbye to your precious dry land! For soon it will be wet!",Mr. Nimbus
|
||||
25,"Nobody's smarter than Rick, but nobody else is my dad. You're a genius at that.",Morty
|
||||
26,"B—h, my generation gets traumatized for breakfast.",Summer
|
||||
27,"Inception made sense!",Morty
|
||||
28,"I realize now I'm attracted to you for the same reason I can't be with you: You can't change. And I have no problem with that, but it clearly means I have a problem with myself.",Unity
|
||||
29,"Mr. President, if I've learned one thing today, it's that sometimes you have to not give a f—k!",Morty
|
||||
30,"I didn't know freedom meant people doing stuff that sucks.",Summer
|
||||
31,"How many of these are just horrible mistakes I made? I mean, maybe I'd stop making so many if I let myself learn from them.",Morty
|
||||
32,"I'm a scientist because I invent, transform, create, and destroy for a living. And when I don't like something about the world, I change it.",Rick
|
||||
33,"Wubba lubba dub dub!",Rick
|
||||
34,"I turned myself into a pickle, Morty! I'm Pickle Rick!",Rick
|
||||
35,"I know about the Yosemite T-shirt, Morty.",Rick
|
||||
36,"The universe is basically an animal. It grazes on the ordinary. It creates infinite idiots just to eat them.",Rick
|
||||
37,"If I die in a cage, I lose a bet.",Rick
|
||||
38,"Sometimes science is more art than science.",Rick
|
||||
39,"To live is to risk it all—otherwise, you're just an inert chunk of randomly assembled molecules drifting wherever the universe blows you.",Rick
|
||||
40,"Welcome to the club, pal.",Rick
|
||||
41,"So I have an emo streak. It's part of what makes me so rad.",Rick
|
||||
42,"Listen, I'm not the nicest guy in the universe, because I'm the smartest, and being nice is something stupid people do to hedge their bets.",Rick
|
||||
43,"Wait a minute! Is that Mountain Dew in my quantum-transport-solution?",Rick
|
||||
44,"Listen, Morty, I hate to break it to you, but what people call 'love' is just a chemical reaction that compels animals to breed.",Rick
|
||||
45,"Break the cycle, Morty. Rise above. Focus on science.",Rick
|
||||
46,"Don't get drawn into the culture, Morty. Stealing stuff is about the stuff, not the stealing.",Rick
|
||||
47,"I'm sorry, but your opinion means very little to me.",Rick
|
||||
48,"You don't get to tell anyone what's sad. You’re like a one-man Mount Sadmore. So I guess like a Lincoln Sadmorial.",Rick
|
||||
49,"This pickle doesn't care about your children. I'm not gonna take their dreams. I'm gonna take their parents.",Rick
|
||||
50,"I programmed you to believe that.",Rick
|
||||
51,"Have fun with empowerment. It seems to make everyone that gets it really happy.",Rick
|
||||
52,"Thanks, Mr. Poopybutthole. I always could count on you.",Rick
|
||||
53,"Weddings are basically funerals with a cake.",Rick
|
||||
54,"I mean, if you spend all day shuffling words around, you can make anything sound bad, Morty.",Rick
|
||||
55,"It's your choice to take this personally.",Rick
|
||||
56,"Excuse me, coming through. What are you here for? Just kidding, I don't care.",Rick
|
||||
57,"If I let you make me nervous, then we can't get schwifty.",Rick
|
||||
58,"Oh, boy, so you actually learned something today? What is this, Full House?",Rick
|
||||
59,"I can't abide bureaucracy. I don't like being told where to go and what to do. I consider it a violation. Did you get those seeds all the way up your butt?",Rick
|
||||
60,"I think you have to think ahead and live in the moment.",Rick
|
||||
61,"I know that new situations can be intimidating. You're lookin' around and it's all scary and different, but you know, meeting them head-on, charging into 'em like a bull—that's how we grow as people.",Rick
|
||||
|
@@ -249,11 +249,11 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from lancedb.pydantic import Vector, LanceModel\n",
|
||||
"from lancedb.pydantic import vector, LanceModel\n",
|
||||
"\n",
|
||||
"class Content(LanceModel):\n",
|
||||
" movie_id: int\n",
|
||||
" vector: Vector(128)\n",
|
||||
" vector: vector(128)\n",
|
||||
" genres: str\n",
|
||||
" title: str\n",
|
||||
" imdb_id: int\n",
|
||||
@@ -359,7 +359,7 @@
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"class PydanticSchema(LanceModel):\n",
|
||||
" vector: Vector(2)\n",
|
||||
" vector: vector(2)\n",
|
||||
" item: str\n",
|
||||
" price: float\n",
|
||||
"\n",
|
||||
@@ -394,10 +394,10 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import lancedb\n",
|
||||
"from lancedb.pydantic import LanceModel, Vector\n",
|
||||
"from lancedb.pydantic import LanceModel, vector\n",
|
||||
"\n",
|
||||
"class Model(LanceModel):\n",
|
||||
" vector: Vector(2)\n",
|
||||
" vector: vector(2)\n",
|
||||
"\n",
|
||||
"tbl = db.create_table(\"table6\", schema=Model.to_arrow_schema())"
|
||||
]
|
||||
|
||||
@@ -13,10 +13,10 @@ via [pydantic_to_schema()](python.md##lancedb.pydantic.pydantic_to_schema) metho
|
||||
|
||||
## Vector Field
|
||||
|
||||
LanceDB provides a [`Vector(dim)`](python.md#lancedb.pydantic.Vector) method to define a
|
||||
LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a
|
||||
vector Field in a Pydantic Model.
|
||||
|
||||
::: lancedb.pydantic.Vector
|
||||
::: lancedb.pydantic.vector
|
||||
|
||||
## Type Conversion
|
||||
|
||||
@@ -33,4 +33,4 @@ Current supported type conversions:
|
||||
| `str` | `pyarrow.utf8()` |
|
||||
| `list` | `pyarrow.List` |
|
||||
| `BaseModel` | `pyarrow.Struct` |
|
||||
| `Vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
|
||||
| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
|
||||
|
||||
105
node/package-lock.json
generated
105
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.2.5",
|
||||
"version": "0.2.4",
|
||||
"lockfileVersion": 2,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.2.5",
|
||||
"version": "0.2.4",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -31,7 +31,6 @@
|
||||
"@types/node": "^18.16.2",
|
||||
"@types/sinon": "^10.0.15",
|
||||
"@types/temp": "^0.9.1",
|
||||
"@types/uuid": "^9.0.3",
|
||||
"@typescript-eslint/eslint-plugin": "^5.59.1",
|
||||
"cargo-cp-artifact": "^0.1",
|
||||
"chai": "^4.3.7",
|
||||
@@ -49,15 +48,14 @@
|
||||
"ts-node-dev": "^2.0.0",
|
||||
"typedoc": "^0.24.7",
|
||||
"typedoc-plugin-markdown": "^3.15.3",
|
||||
"typescript": "*",
|
||||
"uuid": "^9.0.0"
|
||||
"typescript": "*"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.5",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.5",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.5"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.4",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.4",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.4"
|
||||
}
|
||||
},
|
||||
"node_modules/@apache-arrow/ts": {
|
||||
@@ -317,9 +315,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.5.tgz",
|
||||
"integrity": "sha512-V4206SajkMN3o+bBFBAYJq5emlrjevitP0g8RFfVlmj/LS38i8k4uvSe1bICQ2amUrYkL/Jw4ktYn19NRfTU+g==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.4.tgz",
|
||||
"integrity": "sha512-MqiZXamHYEOfguPsHWLBQ56IabIN6Az8u2Hx8LCyXcxW9gcyJZMSAfJc+CcA4KYHKotv0KsVBhgxZ3kaZQQyiw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -329,9 +327,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.5.tgz",
|
||||
"integrity": "sha512-orePizgXCbTJbDJ4bMMnYh/4OgmWDBbHShNxHKQobcX+NgWTexmR0lV1WNOG+DtczBiGH422e3gHJ+xhTO13vg==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.4.tgz",
|
||||
"integrity": "sha512-DzL+mw5WhKDwXdEFlPh8M9zSDhGnfks7NvEh6ZqKbU6znH206YB7g3OA4WfFyV579IIEQ8jd4v/XDthNzQKuSA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -341,9 +339,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-xIMNwsFGOHeY9EUWCHhUAcA2sCHZ5Lim0sc42uuUOeWayyH+HeR6ZWReptDQRuAoJHqQeag9qcqteE0AZPDTEw==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-LP1nNfIpFxCgcCMlIQdseDX9dZU27TNhCL41xar8euqcetY5uKvi0YqhiVlpNO85Ss1FRQBgQ/GtnOM6Bo7oBQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -353,9 +351,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-Qr8dbHavtE+Zfd45kEORJQe01kRWhMF703gk8zhtZhskDUBCfqm3ap22JIux58tASxVcBqY8EtUFojfYGnQVvA==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-m4RhOI5JJWPU9Ip2LlRIzXu4mwIv9M//OyAuTLiLKRm8726jQHhYi5VFUEtNzqY0o0p6pS0b3XbifYQ+cyJn3Q==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -365,9 +363,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.5.tgz",
|
||||
"integrity": "sha512-jTqkR9HRfbjxhUrlTfveNkJ78tlpVXeNn3BS4wBm4VIsPd75jminKBRYtrlQCWyHusqrUQedKny4hhG1CuNUkg==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.4.tgz",
|
||||
"integrity": "sha512-lMF/2e3YkKWnTYv0R7cUCfjMkAqepNaHSc/dvJzCNsFVEhfDsFdScQFLToARs5GGxnq4fOf+MKpaHg/W6QTxiA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -598,12 +596,6 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/uuid": {
|
||||
"version": "9.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.3.tgz",
|
||||
"integrity": "sha512-taHQQH/3ZyI3zP8M/puluDEIEvtQHVYcC6y3N8ijFtAd28+Ey/G4sg1u2gB01S8MwybLOKAp9/yCMu/uR5l3Ug==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@typescript-eslint/eslint-plugin": {
|
||||
"version": "5.59.1",
|
||||
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.59.1.tgz",
|
||||
@@ -4459,15 +4451,6 @@
|
||||
"punycode": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/uuid": {
|
||||
"version": "9.0.0",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.0.tgz",
|
||||
"integrity": "sha512-MXcSTerfPa4uqyzStbRoTgt5XIe3x5+42+q1sDuy3R5MDk66URdLMOZe5aPX/SQd+kuYAh0FdP/pO28IkQyTeg==",
|
||||
"dev": true,
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/v8-compile-cache-lib": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
||||
@@ -4869,33 +4852,33 @@
|
||||
}
|
||||
},
|
||||
"@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.5.tgz",
|
||||
"integrity": "sha512-V4206SajkMN3o+bBFBAYJq5emlrjevitP0g8RFfVlmj/LS38i8k4uvSe1bICQ2amUrYkL/Jw4ktYn19NRfTU+g==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.2.4.tgz",
|
||||
"integrity": "sha512-MqiZXamHYEOfguPsHWLBQ56IabIN6Az8u2Hx8LCyXcxW9gcyJZMSAfJc+CcA4KYHKotv0KsVBhgxZ3kaZQQyiw==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.5.tgz",
|
||||
"integrity": "sha512-orePizgXCbTJbDJ4bMMnYh/4OgmWDBbHShNxHKQobcX+NgWTexmR0lV1WNOG+DtczBiGH422e3gHJ+xhTO13vg==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.2.4.tgz",
|
||||
"integrity": "sha512-DzL+mw5WhKDwXdEFlPh8M9zSDhGnfks7NvEh6ZqKbU6znH206YB7g3OA4WfFyV579IIEQ8jd4v/XDthNzQKuSA==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-xIMNwsFGOHeY9EUWCHhUAcA2sCHZ5Lim0sc42uuUOeWayyH+HeR6ZWReptDQRuAoJHqQeag9qcqteE0AZPDTEw==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-LP1nNfIpFxCgcCMlIQdseDX9dZU27TNhCL41xar8euqcetY5uKvi0YqhiVlpNO85Ss1FRQBgQ/GtnOM6Bo7oBQ==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.5.tgz",
|
||||
"integrity": "sha512-Qr8dbHavtE+Zfd45kEORJQe01kRWhMF703gk8zhtZhskDUBCfqm3ap22JIux58tASxVcBqY8EtUFojfYGnQVvA==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.2.4.tgz",
|
||||
"integrity": "sha512-m4RhOI5JJWPU9Ip2LlRIzXu4mwIv9M//OyAuTLiLKRm8726jQHhYi5VFUEtNzqY0o0p6pS0b3XbifYQ+cyJn3Q==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.5.tgz",
|
||||
"integrity": "sha512-jTqkR9HRfbjxhUrlTfveNkJ78tlpVXeNn3BS4wBm4VIsPd75jminKBRYtrlQCWyHusqrUQedKny4hhG1CuNUkg==",
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.2.4.tgz",
|
||||
"integrity": "sha512-lMF/2e3YkKWnTYv0R7cUCfjMkAqepNaHSc/dvJzCNsFVEhfDsFdScQFLToARs5GGxnq4fOf+MKpaHg/W6QTxiA==",
|
||||
"optional": true
|
||||
},
|
||||
"@neon-rs/cli": {
|
||||
@@ -5110,12 +5093,6 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"@types/uuid": {
|
||||
"version": "9.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.3.tgz",
|
||||
"integrity": "sha512-taHQQH/3ZyI3zP8M/puluDEIEvtQHVYcC6y3N8ijFtAd28+Ey/G4sg1u2gB01S8MwybLOKAp9/yCMu/uR5l3Ug==",
|
||||
"dev": true
|
||||
},
|
||||
"@typescript-eslint/eslint-plugin": {
|
||||
"version": "5.59.1",
|
||||
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.59.1.tgz",
|
||||
@@ -7867,12 +7844,6 @@
|
||||
"punycode": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"uuid": {
|
||||
"version": "9.0.0",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.0.tgz",
|
||||
"integrity": "sha512-MXcSTerfPa4uqyzStbRoTgt5XIe3x5+42+q1sDuy3R5MDk66URdLMOZe5aPX/SQd+kuYAh0FdP/pO28IkQyTeg==",
|
||||
"dev": true
|
||||
},
|
||||
"v8-compile-cache-lib": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.2.5",
|
||||
"version": "0.2.4",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
@@ -9,7 +9,6 @@
|
||||
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
|
||||
"build-release": "npm run build -- --release",
|
||||
"test": "npm run tsc && mocha -recursive dist/test",
|
||||
"integration-test": "npm run tsc && mocha -recursive dist/integration_test",
|
||||
"lint": "eslint native.js src --ext .js,.ts",
|
||||
"clean": "rm -rf node_modules *.node dist/",
|
||||
"pack-build": "neon pack-build",
|
||||
@@ -35,7 +34,6 @@
|
||||
"@types/node": "^18.16.2",
|
||||
"@types/sinon": "^10.0.15",
|
||||
"@types/temp": "^0.9.1",
|
||||
"@types/uuid": "^9.0.3",
|
||||
"@typescript-eslint/eslint-plugin": "^5.59.1",
|
||||
"cargo-cp-artifact": "^0.1",
|
||||
"chai": "^4.3.7",
|
||||
@@ -53,8 +51,7 @@
|
||||
"ts-node-dev": "^2.0.0",
|
||||
"typedoc": "^0.24.7",
|
||||
"typedoc-plugin-markdown": "^3.15.3",
|
||||
"typescript": "*",
|
||||
"uuid": "^9.0.0"
|
||||
"typescript": "*"
|
||||
},
|
||||
"dependencies": {
|
||||
"@apache-arrow/ts": "^12.0.0",
|
||||
@@ -81,10 +78,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.5",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.5",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.5",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.5"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.2.4",
|
||||
"@lancedb/vectordb-darwin-x64": "0.2.4",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.2.4",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.2.4"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
// Copyright 2023 LanceDB Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import { describe } from 'mocha'
|
||||
import * as chai from 'chai'
|
||||
import * as chaiAsPromised from 'chai-as-promised'
|
||||
import { v4 as uuidv4 } from 'uuid'
|
||||
|
||||
import * as lancedb from '../index'
|
||||
|
||||
const assert = chai.assert
|
||||
chai.use(chaiAsPromised)
|
||||
|
||||
describe('LanceDB AWS Integration test', function () {
|
||||
it('s3+ddb schema is processed correctly', async function () {
|
||||
this.timeout(5000)
|
||||
|
||||
// WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
|
||||
// THE API WILL CHANGE
|
||||
const conn = await lancedb.connect('s3://lancedb-integtest?engine=ddb&ddbTableName=lancedb-integtest')
|
||||
const data = [{ vector: Array(128).fill(1.0) }]
|
||||
|
||||
const tableName = uuidv4()
|
||||
let table = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })
|
||||
|
||||
const futs = [table.add(data), table.add(data), table.add(data), table.add(data), table.add(data)]
|
||||
await Promise.allSettled(futs)
|
||||
|
||||
table = await conn.openTable(tableName)
|
||||
assert.equal(await table.countRows(), 6)
|
||||
})
|
||||
})
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.2.4
|
||||
current_version = 0.2.2
|
||||
commit = True
|
||||
message = [python] Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
@@ -46,19 +46,7 @@ class FixedSizeListMixin(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def vector(dim: int, value_type: pa.DataType = pa.float32()):
|
||||
# TODO: remove in future release
|
||||
from warnings import warn
|
||||
|
||||
warn(
|
||||
"lancedb.pydantic.vector() is deprecated, use lancedb.pydantic.Vector instead."
|
||||
"This function will be removed in future release",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return Vector(dim, value_type)
|
||||
|
||||
|
||||
def Vector(
|
||||
def vector(
|
||||
dim: int, value_type: pa.DataType = pa.float32()
|
||||
) -> Type[FixedSizeListMixin]:
|
||||
"""Pydantic Vector Type.
|
||||
@@ -77,12 +65,12 @@ def Vector(
|
||||
--------
|
||||
|
||||
>>> import pydantic
|
||||
>>> from lancedb.pydantic import Vector
|
||||
>>> from lancedb.pydantic import vector
|
||||
...
|
||||
>>> class MyModel(pydantic.BaseModel):
|
||||
... id: int
|
||||
... url: str
|
||||
... embeddings: Vector(768)
|
||||
... embeddings: vector(768)
|
||||
>>> schema = pydantic_to_schema(MyModel)
|
||||
>>> assert schema == pa.schema([
|
||||
... pa.field("id", pa.int64(), False),
|
||||
@@ -270,11 +258,11 @@ class LanceModel(pydantic.BaseModel):
|
||||
Examples
|
||||
--------
|
||||
>>> import lancedb
|
||||
>>> from lancedb.pydantic import LanceModel, Vector
|
||||
>>> from lancedb.pydantic import LanceModel, vector
|
||||
>>>
|
||||
>>> class TestModel(LanceModel):
|
||||
... name: str
|
||||
... vector: Vector(2)
|
||||
... vector: vector(2)
|
||||
...
|
||||
>>> db = lancedb.connect("/tmp")
|
||||
>>> table = db.create_table("test", schema=TestModel.to_arrow_schema())
|
||||
|
||||
@@ -102,8 +102,7 @@ def _to_record_batch_generator(
|
||||
table = _sanitize_data(batch, schema, metadata, on_bad_vectors, fill_value)
|
||||
for batch in table.to_batches():
|
||||
yield batch
|
||||
else:
|
||||
yield batch
|
||||
yield batch
|
||||
|
||||
|
||||
class Table(ABC):
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
[project]
|
||||
name = "lancedb"
|
||||
version = "0.2.4"
|
||||
version = "0.2.2"
|
||||
dependencies = [
|
||||
"pylance==0.7.4",
|
||||
"pylance==0.6.5",
|
||||
"ratelimiter",
|
||||
"retry",
|
||||
"tqdm",
|
||||
|
||||
@@ -17,7 +17,7 @@ import pyarrow as pa
|
||||
import pytest
|
||||
|
||||
import lancedb
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
|
||||
|
||||
def test_basic(tmp_path):
|
||||
@@ -79,7 +79,7 @@ def test_ingest_pd(tmp_path):
|
||||
|
||||
def test_ingest_iterator(tmp_path):
|
||||
class PydanticSchema(LanceModel):
|
||||
vector: Vector(2)
|
||||
vector: vector(2)
|
||||
item: str
|
||||
price: float
|
||||
|
||||
@@ -143,7 +143,6 @@ def test_ingest_iterator(tmp_path):
|
||||
|
||||
tbl_len = len(tbl)
|
||||
tbl.add(make_batches())
|
||||
assert tbl_len == 50
|
||||
assert len(tbl) == tbl_len * 2
|
||||
assert len(tbl.list_versions()) == 3
|
||||
db.drop_database()
|
||||
|
||||
@@ -19,9 +19,8 @@ from typing import List, Optional
|
||||
import pyarrow as pa
|
||||
import pydantic
|
||||
import pytest
|
||||
from pydantic import Field
|
||||
|
||||
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, Vector, pydantic_to_schema
|
||||
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, pydantic_to_schema, vector
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -108,7 +107,7 @@ def test_pydantic_to_arrow_py38():
|
||||
|
||||
def test_fixed_size_list_field():
|
||||
class TestModel(pydantic.BaseModel):
|
||||
vec: Vector(16)
|
||||
vec: vector(16)
|
||||
li: List[int]
|
||||
|
||||
data = TestModel(vec=list(range(16)), li=[1, 2, 3])
|
||||
@@ -155,7 +154,7 @@ def test_fixed_size_list_field():
|
||||
|
||||
def test_fixed_size_list_validation():
|
||||
class TestModel(pydantic.BaseModel):
|
||||
vec: Vector(8)
|
||||
vec: vector(8)
|
||||
|
||||
with pytest.raises(pydantic.ValidationError):
|
||||
TestModel(vec=range(9))
|
||||
@@ -168,12 +167,9 @@ def test_fixed_size_list_validation():
|
||||
|
||||
def test_lance_model():
|
||||
class TestModel(LanceModel):
|
||||
vector: Vector(16) = Field(default=[0.0] * 16)
|
||||
li: List[int] = Field(default=[1, 2, 3])
|
||||
vec: vector(16)
|
||||
li: List[int]
|
||||
|
||||
schema = pydantic_to_schema(TestModel)
|
||||
assert schema == TestModel.to_arrow_schema()
|
||||
assert TestModel.field_names() == ["vector", "li"]
|
||||
|
||||
t = TestModel()
|
||||
assert t == TestModel(vec=[0.0] * 16, li=[1, 2, 3])
|
||||
assert TestModel.field_names() == ["vec", "li"]
|
||||
|
||||
@@ -20,7 +20,7 @@ import pyarrow as pa
|
||||
import pytest
|
||||
|
||||
from lancedb.db import LanceDBConnection
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
from lancedb.query import LanceVectorQueryBuilder, Query
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
@@ -67,7 +67,7 @@ def table(tmp_path) -> MockTable:
|
||||
|
||||
def test_cast(table):
|
||||
class TestModel(LanceModel):
|
||||
vector: Vector(2)
|
||||
vector: vector(2)
|
||||
id: int
|
||||
str_field: str
|
||||
float_field: float
|
||||
|
||||
@@ -24,7 +24,7 @@ import pytest
|
||||
|
||||
from lancedb.conftest import MockEmbeddingFunction
|
||||
from lancedb.db import LanceDBConnection
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ def test_add(db):
|
||||
|
||||
def test_add_pydantic_model(db):
|
||||
class TestModel(LanceModel):
|
||||
vector: Vector(16)
|
||||
vector: vector(16)
|
||||
li: List[int]
|
||||
|
||||
data = TestModel(vector=list(range(16)), li=[1, 2, 3])
|
||||
@@ -354,7 +354,7 @@ def test_update(db):
|
||||
def test_create_with_embedding_function(db):
|
||||
class MyTable(LanceModel):
|
||||
text: str
|
||||
vector: Vector(10)
|
||||
vector: vector(10)
|
||||
|
||||
func = MockEmbeddingFunction(source_column="text", vector_column="vector")
|
||||
texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
|
||||
@@ -379,7 +379,7 @@ def test_create_with_embedding_function(db):
|
||||
def test_add_with_embedding_function(db):
|
||||
class MyTable(LanceModel):
|
||||
text: str
|
||||
vector: Vector(10)
|
||||
vector: vector(10)
|
||||
|
||||
func = MockEmbeddingFunction(source_column="text", vector_column="vector")
|
||||
table = LanceTable.create(
|
||||
@@ -407,8 +407,8 @@ def test_add_with_embedding_function(db):
|
||||
def test_multiple_vector_columns(db):
|
||||
class MyTable(LanceModel):
|
||||
text: str
|
||||
vector1: Vector(10)
|
||||
vector2: Vector(10)
|
||||
vector1: vector(10)
|
||||
vector2: vector(10)
|
||||
|
||||
table = LanceTable.create(
|
||||
db,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "vectordb-node"
|
||||
version = "0.2.5"
|
||||
version = "0.2.4"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license = "Apache-2.0"
|
||||
edition = "2018"
|
||||
@@ -18,7 +18,6 @@ once_cell = "1"
|
||||
futures = "0.3"
|
||||
half = { workspace = true }
|
||||
lance = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
vectordb = { path = "../../vectordb" }
|
||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||
neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }
|
||||
|
||||
@@ -28,9 +28,7 @@ fn validate_vector_column(record_batch: &RecordBatch) -> Result<()> {
|
||||
record_batch
|
||||
.column_by_name(VECTOR_COLUMN_NAME)
|
||||
.map(|_| ())
|
||||
.context(MissingColumnSnafu {
|
||||
name: VECTOR_COLUMN_NAME,
|
||||
})
|
||||
.context(MissingColumnSnafu { name: VECTOR_COLUMN_NAME })
|
||||
}
|
||||
|
||||
pub(crate) fn arrow_buffer_to_record_batch(slice: &[u8]) -> Result<(Vec<RecordBatch>, SchemaRef)> {
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance_linalg::distance::MetricType;
|
||||
use lance::index::vector::MetricType;
|
||||
use neon::context::FunctionContext;
|
||||
use neon::prelude::*;
|
||||
use std::convert::TryFrom;
|
||||
|
||||
@@ -183,9 +183,11 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||
let aws_region = get_aws_region(&mut cx, 4)?;
|
||||
|
||||
let params = ReadParams {
|
||||
store_options: Some(ObjectStoreParams::with_aws_credentials(
|
||||
aws_creds, aws_region,
|
||||
)),
|
||||
store_options: Some(ObjectStoreParams {
|
||||
aws_credentials: aws_creds,
|
||||
aws_region,
|
||||
..ObjectStoreParams::default()
|
||||
}),
|
||||
..ReadParams::default()
|
||||
};
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::ops::Deref;
|
||||
|
||||
use arrow_array::Float32Array;
|
||||
use futures::{TryFutureExt, TryStreamExt};
|
||||
use lance_linalg::distance::MetricType;
|
||||
use lance::index::vector::MetricType;
|
||||
use neon::context::FunctionContext;
|
||||
use neon::handle::Handle;
|
||||
use neon::prelude::*;
|
||||
|
||||
@@ -43,8 +43,7 @@ impl JsTable {
|
||||
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
|
||||
let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||
let buffer = cx.argument::<JsBuffer>(1)?;
|
||||
let (batches, schema) =
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
let (batches, schema) = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
|
||||
// Write mode
|
||||
let mode = match cx.argument::<JsString>(2)?.value(&mut cx).as_str() {
|
||||
@@ -66,9 +65,11 @@ impl JsTable {
|
||||
let aws_region = get_aws_region(&mut cx, 6)?;
|
||||
|
||||
let params = WriteParams {
|
||||
store_params: Some(ObjectStoreParams::with_aws_credentials(
|
||||
aws_creds, aws_region,
|
||||
)),
|
||||
store_params: Some(ObjectStoreParams {
|
||||
aws_credentials: aws_creds,
|
||||
aws_region,
|
||||
..ObjectStoreParams::default()
|
||||
}),
|
||||
mode: mode,
|
||||
..WriteParams::default()
|
||||
};
|
||||
@@ -91,8 +92,7 @@ impl JsTable {
|
||||
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||
let buffer = cx.argument::<JsBuffer>(0)?;
|
||||
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
|
||||
let (batches, schema) =
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
let (batches, schema) = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
let rt = runtime(&mut cx)?;
|
||||
let channel = cx.channel();
|
||||
let mut table = js_table.table.clone();
|
||||
@@ -108,9 +108,11 @@ impl JsTable {
|
||||
let aws_region = get_aws_region(&mut cx, 5)?;
|
||||
|
||||
let params = WriteParams {
|
||||
store_params: Some(ObjectStoreParams::with_aws_credentials(
|
||||
aws_creds, aws_region,
|
||||
)),
|
||||
store_params: Some(ObjectStoreParams {
|
||||
aws_credentials: aws_creds,
|
||||
aws_region,
|
||||
..ObjectStoreParams::default()
|
||||
}),
|
||||
mode: write_mode,
|
||||
..WriteParams::default()
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "vectordb"
|
||||
version = "0.2.5"
|
||||
version = "0.2.4"
|
||||
edition = "2021"
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license = "Apache-2.0"
|
||||
@@ -10,21 +10,14 @@ categories = ["database-implementations"]
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
[dependencies]
|
||||
arrow = { workspace = true }
|
||||
arrow-array = { workspace = true }
|
||||
arrow-data = { workspace = true }
|
||||
arrow-schema = { workspace = true }
|
||||
arrow-ord = { workspace = true }
|
||||
arrow-cast = { workspace = true }
|
||||
object_store = { workspace = true }
|
||||
snafu = { workspace = true }
|
||||
half = { workspace = true }
|
||||
lance = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||
log = { workspace = true }
|
||||
num-traits = "0"
|
||||
url = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.5.0"
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub use lance::arrow::*;
|
||||
@@ -1,18 +0,0 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Data types, schema coercion, and data cleaning and etc.
|
||||
|
||||
pub mod inspect;
|
||||
pub mod sanitize;
|
||||
@@ -1,180 +0,0 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use arrow::compute::kernels::{aggregate::bool_and, length::length};
|
||||
use arrow_array::{
|
||||
cast::AsArray,
|
||||
types::{ArrowPrimitiveType, Int32Type, Int64Type},
|
||||
Array, GenericListArray, OffsetSizeTrait, RecordBatchReader,
|
||||
};
|
||||
use arrow_ord::comparison::eq_dyn_scalar;
|
||||
use arrow_schema::DataType;
|
||||
use num_traits::{ToPrimitive, Zero};
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
|
||||
pub(crate) fn infer_dimension<T: ArrowPrimitiveType>(
|
||||
list_arr: &GenericListArray<T::Native>,
|
||||
) -> Result<Option<T::Native>>
|
||||
where
|
||||
T::Native: OffsetSizeTrait + ToPrimitive,
|
||||
{
|
||||
let len_arr = length(list_arr)?;
|
||||
if len_arr.is_empty() {
|
||||
return Ok(Some(Zero::zero()));
|
||||
}
|
||||
|
||||
let dim = len_arr.as_primitive::<T>().value(0);
|
||||
if bool_and(&eq_dyn_scalar(len_arr.as_primitive::<T>(), dim)?) != Some(true) {
|
||||
Ok(None)
|
||||
} else {
|
||||
Ok(Some(dim))
|
||||
}
|
||||
}
|
||||
|
||||
/// Infer the vector columns from a dataset.
|
||||
///
|
||||
/// Parameters
|
||||
/// ----------
|
||||
/// - reader: RecordBatchReader
|
||||
/// - strict: if set true, only fixed_size_list<float> is considered as vector column. If set to false,
|
||||
/// a list<float> column with same length is also considered as vector column.
|
||||
pub fn infer_vector_columns(
|
||||
reader: impl RecordBatchReader + Send,
|
||||
strict: bool,
|
||||
) -> Result<Vec<String>> {
|
||||
let mut columns = vec![];
|
||||
|
||||
let mut columns_to_infer: HashMap<String, Option<i64>> = HashMap::new();
|
||||
for field in reader.schema().fields() {
|
||||
match field.data_type() {
|
||||
DataType::FixedSizeList(sub_field, _) if sub_field.data_type().is_floating() => {
|
||||
columns.push(field.name().to_string());
|
||||
}
|
||||
DataType::List(sub_field) if sub_field.data_type().is_floating() && !strict => {
|
||||
columns_to_infer.insert(field.name().to_string(), None);
|
||||
}
|
||||
DataType::LargeList(sub_field) if sub_field.data_type().is_floating() && !strict => {
|
||||
columns_to_infer.insert(field.name().to_string(), None);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
for batch in reader {
|
||||
let batch = batch?;
|
||||
let col_names = columns_to_infer.keys().cloned().collect::<Vec<_>>();
|
||||
for col_name in col_names {
|
||||
let col = batch.column_by_name(&col_name).ok_or(Error::Schema {
|
||||
message: format!("Column {} not found", col_name),
|
||||
})?;
|
||||
if let Some(dim) = match *col.data_type() {
|
||||
DataType::List(_) => {
|
||||
infer_dimension::<Int32Type>(col.as_list::<i32>())?.map(|d| d as i64)
|
||||
}
|
||||
DataType::LargeList(_) => infer_dimension::<Int64Type>(col.as_list::<i64>())?,
|
||||
_ => {
|
||||
return Err(Error::Schema {
|
||||
message: format!("Column {} is not a list", col_name),
|
||||
})
|
||||
}
|
||||
} {
|
||||
if let Some(Some(prev_dim)) = columns_to_infer.get(&col_name) {
|
||||
if prev_dim != &dim {
|
||||
columns_to_infer.remove(&col_name);
|
||||
}
|
||||
} else {
|
||||
columns_to_infer.insert(col_name, Some(dim));
|
||||
}
|
||||
} else {
|
||||
columns_to_infer.remove(&col_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
columns.extend(columns_to_infer.keys().cloned());
|
||||
Ok(columns)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use arrow_array::{
        types::{Float32Type, Float64Type},
        FixedSizeListArray, Float32Array, ListArray, RecordBatch, RecordBatchIterator, StringArray,
    };
    use arrow_schema::{DataType, Field, Schema};
    use std::{sync::Arc, vec};

    /// Checks which columns are reported with and without strict mode.
    #[test]
    fn test_infer_vector_columns() {
        let f32_item = Arc::new(Field::new("item", DataType::Float32, true));
        let f64_item = Arc::new(Field::new("item", DataType::Float64, true));
        let schema = Arc::new(Schema::new(vec![
            Field::new("f", DataType::Float32, false),
            Field::new("s", DataType::Utf8, false),
            Field::new("l1", DataType::List(f32_item.clone()), false),
            Field::new("l2", DataType::List(f64_item), false),
            Field::new("fl", DataType::FixedSizeList(f32_item, 32), true),
        ]));

        // Every list has four elements.
        let uniform_lists = ListArray::from_iter_primitive::<Float32Type, _, _>(
            (0..3).map(|_| Some(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0)])),
        );
        // Var-length list
        let ragged_lists = ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
            Some(vec![Some(1.0_f64)]),
            Some(vec![Some(2.0_f64), Some(3.0_f64)]),
            Some(vec![Some(4.0_f64), Some(5.0_f64), Some(6.0_f64)]),
        ]);
        let fixed_lists = FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(
            vec![
                Some(vec![Some(1.0); 32]),
                Some(vec![Some(2.0); 32]),
                Some(vec![Some(3.0); 32]),
            ],
            32,
        );

        let batch = RecordBatch::try_new(
            schema.clone(),
            vec![
                Arc::new(Float32Array::from(vec![1.0, 2.0, 3.0])),
                Arc::new(StringArray::from(vec!["a", "b", "c"])),
                Arc::new(uniform_lists),
                Arc::new(ragged_lists),
                Arc::new(fixed_lists),
            ],
        )
        .unwrap();

        // Non-strict: the uniform list column is reported after the fixed-size one.
        let lenient_reader =
            RecordBatchIterator::new(vec![batch.clone()].into_iter().map(Ok), schema.clone());
        let lenient = infer_vector_columns(lenient_reader, false).unwrap();
        assert_eq!(lenient, vec!["fl", "l1"]);

        // Strict: only the fixed-size-list column is reported.
        let strict_reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
        let strict = infer_vector_columns(strict_reader, true).unwrap();
        assert_eq!(strict, vec!["fl"]);
    }
}
|
||||
@@ -1,284 +0,0 @@
|
||||
// Copyright 2023 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::{iter::repeat_with, sync::Arc};
|
||||
|
||||
use arrow_array::{
|
||||
cast::AsArray,
|
||||
types::{Float16Type, Float32Type, Float64Type, Int32Type, Int64Type},
|
||||
Array, ArrowNumericType, FixedSizeListArray, PrimitiveArray, RecordBatch, RecordBatchIterator,
|
||||
RecordBatchReader,
|
||||
};
|
||||
use arrow_cast::{can_cast_types, cast};
|
||||
use arrow_schema::{ArrowError, DataType, Field, Schema};
|
||||
use half::f16;
|
||||
use lance::arrow::{DataTypeExt, FixedSizeListArrayExt};
|
||||
use log::warn;
|
||||
use num_traits::cast::AsPrimitive;
|
||||
|
||||
use super::inspect::infer_dimension;
|
||||
use crate::error::Result;
|
||||
|
||||
fn cast_array<I: ArrowNumericType, O: ArrowNumericType>(
|
||||
arr: &PrimitiveArray<I>,
|
||||
) -> Arc<PrimitiveArray<O>>
|
||||
where
|
||||
I::Native: AsPrimitive<O::Native>,
|
||||
{
|
||||
Arc::new(PrimitiveArray::<O>::from_iter_values(
|
||||
arr.values().iter().map(|v| (*v).as_()),
|
||||
))
|
||||
}
|
||||
|
||||
fn cast_float_array<I: ArrowNumericType>(
|
||||
arr: &PrimitiveArray<I>,
|
||||
dt: &DataType,
|
||||
) -> std::result::Result<Arc<dyn Array>, ArrowError>
|
||||
where
|
||||
I::Native: AsPrimitive<f64> + AsPrimitive<f32> + AsPrimitive<f16>,
|
||||
{
|
||||
match dt {
|
||||
DataType::Float16 => Ok(cast_array::<I, Float16Type>(arr)),
|
||||
DataType::Float32 => Ok(cast_array::<I, Float32Type>(arr)),
|
||||
DataType::Float64 => Ok(cast_array::<I, Float64Type>(arr)),
|
||||
_ => Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible change field: unable to coerce {:?} to {:?}",
|
||||
arr.data_type(),
|
||||
dt
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn coerce_array(
|
||||
array: &Arc<dyn Array>,
|
||||
field: &Field,
|
||||
) -> std::result::Result<Arc<dyn Array>, ArrowError> {
|
||||
if array.data_type() == field.data_type() {
|
||||
return Ok(array.clone());
|
||||
}
|
||||
match (array.data_type(), field.data_type()) {
|
||||
// Normal cast-able types.
|
||||
(adt, dt) if can_cast_types(adt, dt) => cast(&array, dt),
|
||||
// Casting between f16/f32/f64 can be lossy.
|
||||
(adt, dt) if (adt.is_floating() || dt.is_floating()) => {
|
||||
if adt.byte_width() > dt.byte_width() {
|
||||
warn!(
|
||||
"Coercing field {} {:?} to {:?} might lose precision",
|
||||
field.name(),
|
||||
adt,
|
||||
dt
|
||||
);
|
||||
}
|
||||
match adt {
|
||||
DataType::Float16 => cast_float_array(array.as_primitive::<Float16Type>(), dt),
|
||||
DataType::Float32 => cast_float_array(array.as_primitive::<Float32Type>(), dt),
|
||||
DataType::Float64 => cast_float_array(array.as_primitive::<Float64Type>(), dt),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
(adt, DataType::FixedSizeList(exp_field, exp_dim)) => match adt {
|
||||
// Cast a float fixed size array with same dimension to the expected type.
|
||||
DataType::FixedSizeList(_, dim) if dim == exp_dim => {
|
||||
let actual_sub = array.as_fixed_size_list();
|
||||
let values = coerce_array(actual_sub.values(), exp_field)?;
|
||||
Ok(Arc::new(FixedSizeListArray::try_new_from_values(
|
||||
values.clone(),
|
||||
*dim,
|
||||
)?) as Arc<dyn Array>)
|
||||
}
|
||||
DataType::List(_) | DataType::LargeList(_) => {
|
||||
let Some(dim) = (match adt {
|
||||
DataType::List(_) => infer_dimension::<Int32Type>(array.as_list::<i32>())
|
||||
.map_err(|e| {
|
||||
ArrowError::SchemaError(format!(
|
||||
"failed to infer dimension from list: {}",
|
||||
e
|
||||
))
|
||||
})?
|
||||
.map(|d| d as i64),
|
||||
DataType::LargeList(_) => infer_dimension::<Int64Type>(array.as_list::<i64>())
|
||||
.map_err(|e| {
|
||||
ArrowError::SchemaError(format!(
|
||||
"failed to infer dimension from large list: {}",
|
||||
e
|
||||
))
|
||||
})?,
|
||||
_ => unreachable!(),
|
||||
}) else {
|
||||
return Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible coerce fixed size list: unable to coerce {:?} from {:?}",
|
||||
field,
|
||||
array.data_type()
|
||||
)));
|
||||
};
|
||||
|
||||
if dim != *exp_dim as i64 {
|
||||
return Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible coerce fixed size list: expected dimension {} but got {}",
|
||||
exp_dim, dim
|
||||
)));
|
||||
}
|
||||
|
||||
let values = coerce_array(array, exp_field)?;
|
||||
Ok(Arc::new(FixedSizeListArray::try_new_from_values(
|
||||
values.clone(),
|
||||
*exp_dim,
|
||||
)?) as Arc<dyn Array>)
|
||||
}
|
||||
_ => Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible coerce fixed size list: unable to coerce {:?} from {:?}",
|
||||
field,
|
||||
array.data_type()
|
||||
)))?,
|
||||
},
|
||||
_ => Err(ArrowError::SchemaError(format!(
|
||||
"Incompatible change field {}: unable to coerce {:?} to {:?}",
|
||||
field.name(),
|
||||
array.data_type(),
|
||||
field.data_type()
|
||||
)))?,
|
||||
}
|
||||
}
|
||||
|
||||
fn coerce_schema_batch(
|
||||
batch: RecordBatch,
|
||||
schema: Arc<Schema>,
|
||||
) -> std::result::Result<RecordBatch, ArrowError> {
|
||||
if batch.schema() == schema {
|
||||
return Ok(batch);
|
||||
}
|
||||
let columns = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field| {
|
||||
batch
|
||||
.column_by_name(field.name())
|
||||
.ok_or_else(|| {
|
||||
ArrowError::SchemaError(format!("Column {} not found", field.name()))
|
||||
})
|
||||
.and_then(|c| coerce_array(c, field))
|
||||
})
|
||||
.collect::<std::result::Result<Vec<_>, ArrowError>>()?;
|
||||
RecordBatch::try_new(schema, columns)
|
||||
}
|
||||
|
||||
/// Coerce the reader (input data) to match the given [Schema].
|
||||
///
|
||||
pub fn coerce_schema(
|
||||
reader: impl RecordBatchReader + Send + 'static,
|
||||
schema: Arc<Schema>,
|
||||
) -> Result<Box<dyn RecordBatchReader + Send>> {
|
||||
if reader.schema() == schema {
|
||||
return Ok(Box::new(RecordBatchIterator::new(reader, schema)));
|
||||
}
|
||||
let s = schema.clone();
|
||||
let batches = reader
|
||||
.zip(repeat_with(move || s.clone()))
|
||||
.map(|(batch, s)| coerce_schema_batch(batch?, s));
|
||||
Ok(Box::new(RecordBatchIterator::new(batches, schema)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use arrow_array::{
        FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int32Array, Int8Array,
        RecordBatch, RecordBatchIterator, StringArray,
    };
    use arrow_schema::Field;
    use half::f16;
    use lance::arrow::FixedSizeListArrayExt;

    /// Coerces an f32 vector / f16 / i32 batch into f16 vector / f64 / i8.
    #[test]
    fn test_coerce_list_to_fixed_size_list() {
        // 64-wide fixed size list of the given element type.
        let vector_type = |item: DataType| {
            DataType::FixedSizeList(Arc::new(Field::new("item", item, true)), 64)
        };
        // The string column is identical in the input and the expectation.
        let words = || {
            StringArray::from(vec![
                Some("hello"),
                Some("world"),
                Some("from"),
                Some("lance"),
            ])
        };

        let input_schema = Arc::new(Schema::new(vec![
            Field::new("fl", vector_type(DataType::Float32), true),
            Field::new("s", DataType::Utf8, true),
            Field::new("f", DataType::Float16, true),
            Field::new("i", DataType::Int32, true),
        ]));
        let input_vectors = FixedSizeListArray::try_new_from_values(
            Float32Array::from_iter_values((0..256).map(|v| v as f32)),
            64,
        )
        .unwrap();
        let input = RecordBatch::try_new(
            input_schema.clone(),
            vec![
                Arc::new(input_vectors),
                Arc::new(words()),
                Arc::new(Float16Array::from_iter_values(
                    (0..4).map(|v| f16::from_f32(v as f32)),
                )),
                Arc::new(Int32Array::from_iter_values(0..4)),
            ],
        )
        .unwrap();
        let reader = RecordBatchIterator::new(vec![input].into_iter().map(Ok), input_schema);

        let expected_schema = Arc::new(Schema::new(vec![
            Field::new("fl", vector_type(DataType::Float16), true),
            Field::new("s", DataType::Utf8, true),
            Field::new("f", DataType::Float64, true),
            Field::new("i", DataType::Int8, true),
        ]));
        let coerced = coerce_schema(reader, expected_schema.clone())
            .unwrap()
            .collect::<Vec<_>>();
        assert_eq!(coerced.len(), 1);
        let actual = coerced[0].as_ref().unwrap();
        assert_eq!(actual.schema(), expected_schema);

        let expected_vectors = FixedSizeListArray::try_new_from_values(
            Float16Array::from_iter_values((0..256).map(|v| f16::from_f32(v as f32))),
            64,
        )
        .unwrap();
        let expected = RecordBatch::try_new(
            expected_schema,
            vec![
                Arc::new(expected_vectors),
                Arc::new(words()),
                Arc::new(Float64Array::from_iter_values((0..4).map(|v| v as f64))),
                Arc::new(Int8Array::from_iter_values(0..4)),
            ],
        )
        .unwrap();
        assert_eq!(actual, &expected);
    }
}
|
||||
@@ -20,32 +20,19 @@ use lance::dataset::WriteParams;
|
||||
use lance::io::object_store::ObjectStore;
|
||||
use snafu::prelude::*;
|
||||
|
||||
use crate::error::{CreateDirSnafu, Error, InvalidTableNameSnafu, Result};
|
||||
use crate::error::{CreateDirSnafu, InvalidTableNameSnafu, Result};
|
||||
use crate::table::{ReadParams, Table};
|
||||
|
||||
pub const LANCE_FILE_EXTENSION: &str = "lance";
|
||||
|
||||
pub struct Database {
|
||||
object_store: ObjectStore,
|
||||
query_string: Option<String>,
|
||||
|
||||
pub(crate) uri: String,
|
||||
pub(crate) base_path: object_store::path::Path,
|
||||
}
|
||||
|
||||
const LANCE_EXTENSION: &str = "lance";
|
||||
const ENGINE: &str = "engine";
|
||||
|
||||
/// Parse a url, if it's not a valid url, assume it's a local file
|
||||
/// and try to parse with file:// appended
|
||||
fn parse_url(url: &str) -> Result<url::Url> {
|
||||
match url::Url::parse(url) {
|
||||
Ok(url) => Ok(url),
|
||||
Err(_) => url::Url::parse(format!("file://{}", url).as_str()).map_err(|e| Error::Lance {
|
||||
message: format!("Failed to parse uri: {}", e),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// A connection to LanceDB
|
||||
impl Database {
|
||||
@@ -59,71 +46,12 @@ impl Database {
|
||||
///
|
||||
/// * A [Database] object.
|
||||
pub async fn connect(uri: &str) -> Result<Database> {
|
||||
// For a native (using lance directly) connection
|
||||
// The DB doesn't use any uri parameters, but lance does
|
||||
// So we need to parse the uri, extract the query string, and propagate it to lance
|
||||
let mut url = parse_url(uri)?;
|
||||
|
||||
// special handling for windows
|
||||
if url.scheme().len() == 1 && cfg!(windows) {
|
||||
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
|
||||
}
|
||||
return Ok(Database {
|
||||
uri: uri.to_string(),
|
||||
query_string: None,
|
||||
base_path,
|
||||
object_store,
|
||||
});
|
||||
}
|
||||
|
||||
// iter thru the query params and extract the commit store param
|
||||
let mut engine = None;
|
||||
let mut filtered_querys = vec![];
|
||||
|
||||
// WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
|
||||
// THE API WILL CHANGE
|
||||
for (key, value) in url.query_pairs() {
|
||||
if key == ENGINE {
|
||||
engine = Some(value.to_string());
|
||||
} else {
|
||||
// to owned so we can modify the url
|
||||
filtered_querys.push((key.to_string(), value.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
// Filter out the commit store query param -- it's a lancedb param
|
||||
url.query_pairs_mut().clear();
|
||||
url.query_pairs_mut().extend_pairs(filtered_querys);
|
||||
// Take a copy of the query string so we can propagate it to lance
|
||||
let query_string = url.query().map(|s| s.to_string());
|
||||
// clear the query string so we can use the url as the base uri
|
||||
// use .set_query(None) instead of .set_query("") because the latter
|
||||
// will add a trailing '?' to the url
|
||||
url.set_query(None);
|
||||
|
||||
let table_base_uri = if let Some(store) = engine {
|
||||
static WARN_ONCE: std::sync::Once = std::sync::Once::new();
|
||||
WARN_ONCE.call_once(|| {
|
||||
log::warn!("Specifing engine is not a publicly supported feature in lancedb yet. THE API WILL CHANGE");
|
||||
});
|
||||
let old_scheme = url.scheme().to_string();
|
||||
let new_scheme = format!("{}+{}", old_scheme, store);
|
||||
url.to_string().replacen(&old_scheme, &new_scheme, 1)
|
||||
} else {
|
||||
url.to_string()
|
||||
};
|
||||
|
||||
let plain_uri = url.to_string();
|
||||
let (object_store, base_path) = ObjectStore::from_uri(&plain_uri).await?;
|
||||
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
|
||||
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
|
||||
}
|
||||
|
||||
Ok(Database {
|
||||
uri: table_base_uri,
|
||||
query_string,
|
||||
uri: uri.to_string(),
|
||||
base_path,
|
||||
object_store,
|
||||
})
|
||||
@@ -221,19 +149,11 @@ impl Database {
|
||||
let path = Path::new(&self.uri);
|
||||
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
|
||||
|
||||
let mut uri = table_uri
|
||||
let uri = table_uri
|
||||
.as_path()
|
||||
.to_str()
|
||||
.context(InvalidTableNameSnafu { name })?
|
||||
.to_string();
|
||||
|
||||
// If there are query string set on the connection, propagate to lance
|
||||
if let Some(query) = self.query_string.as_ref() {
|
||||
uri.push('?');
|
||||
uri.push_str(query.as_str());
|
||||
}
|
||||
|
||||
Ok(uri)
|
||||
.context(InvalidTableNameSnafu { name })?;
|
||||
Ok(uri.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,15 +170,7 @@ mod tests {
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let db = Database::connect(uri).await.unwrap();
|
||||
|
||||
// file:// scheme should be automatically prepended if not specified
|
||||
// windows paths come with a drive letter, so file:// won't be prepended
|
||||
let expected = if cfg!(windows) {
|
||||
uri.to_string()
|
||||
} else {
|
||||
format!("file://{}", uri)
|
||||
};
|
||||
|
||||
assert_eq!(db.uri, expected);
|
||||
assert_eq!(db.uri, uri);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use arrow_schema::ArrowError;
|
||||
use snafu::Snafu;
|
||||
|
||||
#[derive(Debug, Snafu)]
|
||||
@@ -33,20 +32,10 @@ pub enum Error {
|
||||
Store { message: String },
|
||||
#[snafu(display("LanceDBError: {message}"))]
|
||||
Lance { message: String },
|
||||
#[snafu(display("LanceDB Schema Error: {message}"))]
|
||||
Schema { message: String },
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Conversion from `ArrowError` into this crate's `Error`, so that `?` can be
/// used on Arrow results in functions returning this crate's `Result`.
impl From<ArrowError> for Error {
|
||||
/// Wraps the Arrow error as `Error::Lance`, keeping only its `to_string()`
/// text as the message.
fn from(e: ArrowError) -> Self {
|
||||
Self::Lance {
|
||||
message: e.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lance::Error> for Error {
|
||||
fn from(e: lance::Error) -> Self {
|
||||
Self::Lance {
|
||||
|
||||
@@ -14,8 +14,7 @@
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::index::vector::VectorIndexParams;
|
||||
use lance_linalg::distance::MetricType;
|
||||
use lance::index::vector::{MetricType, VectorIndexParams};
|
||||
|
||||
pub trait VectorIndexBuilder {
|
||||
fn get_column(&self) -> Option<String>;
|
||||
@@ -108,11 +107,9 @@ impl VectorIndexBuilder for IvfPQIndexBuilder {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::index::vector::StageParams;
|
||||
use lance::index::vector::{MetricType, StageParams};
|
||||
|
||||
use crate::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod data;
|
||||
pub mod database;
|
||||
pub mod error;
|
||||
pub mod index;
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::sync::Arc;
|
||||
use arrow_array::Float32Array;
|
||||
use lance::dataset::scanner::{DatasetRecordBatchStream, Scanner};
|
||||
use lance::dataset::Dataset;
|
||||
use lance_linalg::distance::MetricType;
|
||||
use lance::index::vector::MetricType;
|
||||
|
||||
use crate::error::Result;
|
||||
|
||||
@@ -164,10 +164,10 @@ impl Query {
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::*;
|
||||
use arrow_array::{Float32Array, RecordBatch, RecordBatchIterator, RecordBatchReader};
|
||||
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
|
||||
use lance::dataset::Dataset;
|
||||
use lance::index::vector::MetricType;
|
||||
|
||||
use crate::query::Query;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user