Compare commits


17 Commits

Author SHA1 Message Date
Lei Xu       2704a4522c  Bump to 0.1.11                                                         2023-07-17 12:45:17 -07:00
Lei Xu       030f07e7f0  Bump minimal lance version to 0.5.8 (#318)                             2023-07-17 12:41:29 -07:00
gsilvestrin  72afa06b7a  feat(node): Add Windows support (#294)                                 2023-07-17 08:48:24 -07:00
Lei Xu       088e745e1d  [Python] Create table with Iterator[RecordBatch] and add docs (#316)   2023-07-16 21:45:55 -07:00
Lei Xu       7a57cddb2c  [Python] Add records to remote (#315)                                  2023-07-16 13:24:38 -07:00
Lei Xu       8ff5f88916  [Python] Bug fixes in remote API (#314)                                2023-07-16 11:09:19 -07:00
Lei Xu       028a6e433d  [Python] Get table schema (#313)                                       2023-07-15 17:39:37 -07:00
Lei Xu       04c6814fb1  [Rust] Expose Table schema and version in Rust (#312)                  2023-07-14 22:01:23 -07:00
Lei Xu       c62e4ca1eb  Bump lance version to 0.5.7 (#311)                                     2023-07-14 17:17:31 -07:00
gsilvestrin  aecc5fc42b  feat(node): Fix npm publish task (#298)                                2023-07-14 13:39:15 -07:00
Chang She    2fdcb307eb  [python] Fix a few minor bugs (#304)                                   2023-07-15 03:47:42 +08:00
Tevin Wang   ad18826579  [Documentation Code Testing] build node sdk in release (#307)          2023-07-14 12:46:48 -07:00
Leon Yee     a8a50591d7  [docs] small fixes (#308) (closes #288 and #287)                       2023-07-14 12:46:31 -07:00
gsilvestrin  6dfe7fabc2  pin half (#310)                                                        2023-07-14 12:45:05 -07:00
gsilvestrin  2b108e1c80  Updating package-lock.json file (#301)                                 2023-07-13 17:50:01 -07:00
Lei Xu       8c9edafccc  [Doc] Add more Python integrations documents (#299)                    2023-07-13 17:09:39 -07:00
Leon Yee     0590413b96  Added transformersJS example to docs and node/examples (#297)          2023-07-13 17:01:36 -07:00
33 changed files with 719 additions and 528 deletions

View File

@@ -81,7 +81,7 @@ jobs:
         run: |
           cd docs/test/node_modules/vectordb
           npm ci
-          npm run build
+          npm run build-release
           npm run tsc
       - name: Create test files
         run: |

View File

@@ -116,6 +116,39 @@ jobs:
           path: |
             node/dist/vectordb-linux*.tgz
+  node-windows:
+    runs-on: windows-2022
+    # Only runs on tags that matches the make-release action
+    if: startsWith(github.ref, 'refs/tags/v')
+    strategy:
+      fail-fast: false
+      matrix:
+        target: [x86_64-pc-windows-msvc]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Install Protoc v21.12
+        working-directory: C:\
+        run: |
+          New-Item -Path 'C:\protoc' -ItemType Directory
+          Set-Location C:\protoc
+          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
+          7z x protoc.zip
+          Add-Content $env:GITHUB_PATH "C:\protoc\bin"
+        shell: powershell
+      - name: Install npm dependencies
+        run: |
+          cd node
+          npm ci
+      - name: Build Windows native node modules
+        run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
+      - name: Upload Windows Artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: windows-native
+          path: |
+            node/dist/vectordb-win32*.tgz
   release:
     needs: [node, node-macos, node-linux]
     runs-on: ubuntu-latest
@@ -132,6 +165,7 @@ jobs:
         env:
           NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
         run: |
-          for filename in */*.tgz; do
+          mv */*.tgz .
+          for filename in *.tgz; do
             npm publish $filename
           done

View File

@@ -66,3 +66,24 @@ jobs:
         run: cargo build --all-features
       - name: Run tests
         run: cargo test --all-features
+  windows:
+    runs-on: windows-2022
+    steps:
+      - uses: actions/checkout@v3
+      - uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: rust
+      - name: Install Protoc v21.12
+        working-directory: C:\
+        run: |
+          New-Item -Path 'C:\protoc' -ItemType Directory
+          Set-Location C:\protoc
+          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
+          7z x protoc.zip
+          Add-Content $env:GITHUB_PATH "C:\protoc\bin"
+        shell: powershell
+      - name: Run tests
+        run: |
+          $env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT
+          cargo build
+          cargo test

View File

@@ -6,9 +6,11 @@ members = [
resolver = "2" resolver = "2"
[workspace.dependencies] [workspace.dependencies]
lance = "=0.5.5" lance = "=0.5.8"
arrow-array = "42.0" arrow-array = "42.0"
arrow-data = "42.0" arrow-data = "42.0"
arrow-schema = "42.0" arrow-schema = "42.0"
arrow-ipc = "42.0" arrow-ipc = "42.0"
half = { "version" = "2.2.1", default-features = false }
object_store = "0.6.1" object_store = "0.6.1"

View File

@@ -0,0 +1,41 @@
# Builds the Windows artifacts (node binaries).
# Usage: .\ci\build_windows_artifacts.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
function Prebuild-Rust {
    param (
        [string]$target
    )
    # Building here for the sake of easier debugging.
    Push-Location -Path "rust/ffi/node"
    Write-Host "Building rust library for $target"
    $env:RUST_BACKTRACE=1
    cargo build --release --target $target
    Pop-Location
}

function Build-NodeBinaries {
    param (
        [string]$target
    )
    Push-Location -Path "node"
    Write-Host "Building node library for $target"
    npm run build-release -- --target $target
    npm run pack-build -- --target $target
    Pop-Location
}

$targets = $args[0]
if (-not $targets) {
    $targets = "x86_64-pc-windows-msvc"
}

Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
    Prebuild-Rust $target
    Build-NodeBinaries $target
}

View File

@@ -60,6 +60,9 @@ nav:
   - Python integrations:
     - Pandas and PyArrow: python/arrow.md
     - DuckDB: python/duckdb.md
+    - LangChain 🦜️🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
+    - LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
+    - Pydantic: python/pydantic.md
   - Python examples:
     - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
     - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
@@ -68,6 +71,7 @@ nav:
     - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
   - Javascript examples:
     - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
+    - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
   - References:
     - Vector Search: search.md
     - SQL filters: sql.md

View File

@@ -46,7 +46,7 @@ You can also use an external API like OpenAI to generate embeddings
     def embed_func(c):
         rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
         return [record["embedding"] for record in rs["data"]]
     ```
 
 === "Javascript"
@@ -126,7 +126,7 @@ belong in the same latent space and your results will be nonsensical.
 === "Javascript"
 
     ```javascript
     const results = await table
-        .search('What's the best pizza topping?')
+        .search("What's the best pizza topping?")
         .limit(10)
         .execute()
     ```

View File

@@ -0,0 +1,121 @@
# Vector embedding search using TransformersJS
## Embed and query data from LanceDB using TransformersJS
<img id="splash" width="400" alt="transformersjs" src="https://github.com/lancedb/lancedb/assets/43097991/88a31e30-3d6f-4eef-9216-4b7c688f1b4f">
This example shows how to use the [transformers.js](https://github.com/xenova/transformers.js) library to perform vector embedding search using LanceDB's Javascript API.
### Setting up
First, install the dependencies:
```bash
npm install vectordb
npm i @xenova/transformers
```
We will also be using the [all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) model, which is compatible with Transformers.js.
Within our `index.js` file we will import the necessary libraries and define our model and database:
```javascript
const lancedb = require('vectordb')
const { pipeline } = await import('@xenova/transformers')
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
```
### Creating the embedding function
Next, we will create an embedding function. Given a batch of strings, it returns a vector embedding for each one, using the `pipe` function we defined earlier.
```javascript
// Define the function. `sourceColumn` is required for LanceDB to know
// which column to use as input.
const embed_fun = {}
embed_fun.sourceColumn = 'text'
embed_fun.embed = async function (batch) {
let result = []
// Given a batch of strings, we will use the `pipe` function to get
// the vector embedding of each string.
for (let text of batch) {
// 'mean' pooling and normalizing allows the embeddings to share the
// same length.
const res = await pipe(text, { pooling: 'mean', normalize: true })
result.push(Array.from(res['data']))
}
return (result)
}
```
### Creating the database
Now, we will create the LanceDB database and add the embedding function we defined earlier.
```javascript
// Link a folder and create a table with data
const db = await lancedb.connect('data/sample-lancedb')
// You can also import any other data, but make sure that you have a column
// for the embedding function to use.
const data = [
{ id: 1, text: 'Cherry', type: 'fruit' },
{ id: 2, text: 'Carrot', type: 'vegetable' },
{ id: 3, text: 'Potato', type: 'vegetable' },
{ id: 4, text: 'Apple', type: 'fruit' },
{ id: 5, text: 'Banana', type: 'fruit' }
]
// Create the table with the embedding function
const table = await db.createTable('food_table', data, "create", embed_fun)
```
### Performing the search
Now, we can perform the search using the `search` function. LanceDB automatically uses the embedding function we defined earlier to get the vector embedding of the query string.
```javascript
// Query the table
const results = await table
.search("a sweet fruit to eat")
.metricType("cosine")
.limit(2)
.execute()
console.log(results.map(r => r.text))
```
```bash
[ 'Banana', 'Cherry' ]
```
Output of `results`:
```bash
[
{
vector: Float32Array(384) [
-0.057455405592918396,
0.03617725893855095,
-0.0367760956287384,
... 381 more items
],
id: 5,
text: 'Banana',
type: 'fruit',
score: 0.4919965863227844
},
{
vector: Float32Array(384) [
0.0009714411571621895,
0.008223623037338257,
0.009571489877998829,
... 381 more items
],
id: 1,
text: 'Cherry',
type: 'fruit',
score: 0.5540297031402588
}
]
```
### Wrapping it up
In this example, we showed how to use the `transformers.js` library to perform vector embedding search using LanceDB's Javascript API. You can find the full code for this example on [Github](https://github.com/lancedb/lancedb/blob/main/node/examples/js-transformers/index.js)!

View File

@@ -5,6 +5,8 @@ Built on top of [Apache Arrow](https://arrow.apache.org/),
 `LanceDB` is easy to integrate with the Python ecosystem, including [Pandas](https://pandas.pydata.org/)
 and PyArrow.
 
+## Create dataset
+
 First, we need to connect to a `LanceDB` database.
 
 ```py
@@ -27,10 +29,42 @@ data = pd.DataFrame({
 table = db.create_table("pd_table", data=data)
 ```
 
-You will find detailed instructions of creating dataset and index in
-[Basic Operations](basic.md) and [Indexing](ann_indexes.md)
+Similar to [`pyarrow.write_dataset()`](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html),
+[db.create_table()](../python/#lancedb.db.DBConnection.create_table) accepts a wide range of data forms.
+For example, if you have a dataset that is larger than memory, you can create the table with an
+`Iterator[pyarrow.RecordBatch]` to lazily generate data:
+
+```py
+from typing import Iterable
+
+import pyarrow as pa
+import lancedb
+
+def make_batches() -> Iterable[pa.RecordBatch]:
+    for i in range(5):
+        yield pa.RecordBatch.from_arrays(
+            [
+                pa.array([[3.1, 4.1], [5.9, 26.5]]),
+                pa.array(["foo", "bar"]),
+                pa.array([10.0, 20.0]),
+            ],
+            ["vector", "item", "price"])
+
+schema = pa.schema([
+    pa.field("vector", pa.list_(pa.float32())),
+    pa.field("item", pa.utf8()),
+    pa.field("price", pa.float32()),
+])
+
+table = db.create_table("iterable_table", data=make_batches(), schema=schema)
+```
+
+You will find detailed instructions on creating datasets in the
+[Basic Operations](../basic.md) and [API](../python/#lancedb.db.DBConnection.create_table)
 sections.
 
+## Vector Search
+
 We can now perform similarity search via `LanceDB` Python API.
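
For reference, here is a minimal similarity-search sketch against the `iterable_table` created above. This is an illustrative addition rather than part of the diff, and it assumes the standard `search().limit().to_df()` query chain of the LanceDB Python API:

```py
# Query the table created above with a 2-d vector; to_df() collects
# the nearest rows into a pandas DataFrame.
results = table.search([3.0, 4.0]).limit(2).to_df()
print(results)
```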

View File

@@ -0,0 +1,35 @@
# Pydantic
[Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python.
## Schema
LanceDB supports creating an Apache Arrow Schema from a
[Pydantic BaseModel](https://docs.pydantic.dev/latest/api/main/#pydantic.main.BaseModel)
via the [pydantic_to_schema()](python.md#lancedb.pydantic.pydantic_to_schema) method.
::: lancedb.pydantic.pydantic_to_schema
## Vector Field
LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a
vector field in a Pydantic model.
::: lancedb.pydantic.vector
## Type Conversion
LanceDB automatically converts Pydantic fields to an
[Apache Arrow DataType](https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html#pyarrow.DataType).
Currently supported type conversions:
| Pydantic Field Type | PyArrow Data Type |
| ------------------- | ----------------- |
| `int` | `pyarrow.int64` |
| `float` | `pyarrow.float64` |
| `bool` | `pyarrow.bool_` |
| `str` | `pyarrow.utf8()` |
| `list` | `pyarrow.List` |
| `BaseModel` | `pyarrow.Struct` |
| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
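
A short usage sketch tying `vector()` and `pydantic_to_schema()` together; it mirrors the docstring examples added to `lancedb/pydantic.py` later in this diff:

```py
import pydantic
import pyarrow as pa
from lancedb.pydantic import pydantic_to_schema, vector

class MyModel(pydantic.BaseModel):
    id: int
    url: str
    embeddings: vector(768)  # becomes a fixed-size list of 768 float32 values

schema = pydantic_to_schema(MyModel)
assert schema == pa.schema([
    pa.field("id", pa.int64(), False),
    pa.field("url", pa.utf8(), False),
    pa.field("embeddings", pa.list_(pa.float32(), 768), False),
])
```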

View File

@@ -46,10 +46,6 @@ pip install lancedb
 ## Utilities
 
-::: lancedb.schema.schema_to_dict
-
-::: lancedb.schema.dict_to_schema
-
 ::: lancedb.vector
 
 ## Integrations

View File

@@ -7,6 +7,7 @@ const excludedFiles = [
"../src/embedding.md", "../src/embedding.md",
"../src/examples/serverless_lancedb_with_s3_and_lambda.md", "../src/examples/serverless_lancedb_with_s3_and_lambda.md",
"../src/examples/serverless_qa_bot_with_modal_and_langchain.md", "../src/examples/serverless_qa_bot_with_modal_and_langchain.md",
"../src/examples/transformerjs_embedding_search_nodejs.md",
"../src/examples/youtube_transcript_bot_with_nodejs.md", "../src/examples/youtube_transcript_bot_with_nodejs.md",
]; ];
const nodePrefix = "javascript"; const nodePrefix = "javascript";

View File

@@ -0,0 +1,66 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'

async function example() {
  const lancedb = require('vectordb')

  // Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2)
  const { pipeline } = await import('@xenova/transformers')
  const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');

  // Create embedding function from pipeline which returns a list of vectors from batch.
  // sourceColumn is the name of the column in the data to be embedded.
  //
  // Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector.
  const embed_fun = {}
  embed_fun.sourceColumn = 'text'
  embed_fun.embed = async function (batch) {
    let result = []
    for (let text of batch) {
      const res = await pipe(text, { pooling: 'mean', normalize: true })
      result.push(Array.from(res['data']))
    }
    return (result)
  }

  // Link a folder and create a table with data
  const db = await lancedb.connect('data/sample-lancedb')
  const data = [
    { id: 1, text: 'Cherry', type: 'fruit' },
    { id: 2, text: 'Carrot', type: 'vegetable' },
    { id: 3, text: 'Potato', type: 'vegetable' },
    { id: 4, text: 'Apple', type: 'fruit' },
    { id: 5, text: 'Banana', type: 'fruit' }
  ]
  const table = await db.createTable('food_table', data, "create", embed_fun)

  // Query the table
  const results = await table
    .search("a sweet fruit to eat")
    .metricType("cosine")
    .limit(2)
    .execute()
  console.log(results.map(r => r.text))
}

example().then(_ => { console.log("Done!") })

View File

@@ -0,0 +1,16 @@
{
  "name": "vectordb-example-js-transformers",
  "version": "1.0.0",
  "description": "Example for using transformers.js with lancedb",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "@xenova/transformers": "^2.4.1",
    "vectordb": "^0.1.12"
  }
}

node/package-lock.json (generated; 70 changed lines)
View File

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.1.12",
+  "version": "0.1.13",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.1.12",
+      "version": "0.1.13",
       "cpu": [
         "x64",
         "arm64"
@@ -14,7 +14,8 @@
       "license": "Apache-2.0",
       "os": [
         "darwin",
-        "linux"
+        "linux",
+        "win32"
       ],
       "dependencies": {
         "@apache-arrow/ts": "^12.0.0",
@@ -49,10 +50,11 @@
         "typescript": "*"
       },
       "optionalDependencies": {
-        "vectordb-darwin-arm64": "0.1.12",
-        "vectordb-darwin-x64": "0.1.12",
-        "vectordb-linux-arm64-gnu": "0.1.12",
-        "vectordb-linux-x64-gnu": "0.1.12"
+        "vectordb-darwin-arm64": "0.1.13",
+        "vectordb-darwin-x64": "0.1.13",
+        "vectordb-linux-arm64-gnu": "0.1.13",
+        "vectordb-linux-x64-gnu": "0.1.13",
+        "vectordb-win32-x64-msvc": "0.1.13"
       }
     },
     "node_modules/@apache-arrow/ts": {
@@ -4286,6 +4288,42 @@
       "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
       "dev": true
     },
+    "node_modules/vectordb-darwin-arm64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
+      "integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/vectordb-darwin-x64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
+      "integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/vectordb-linux-x64-gnu": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
+      "integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
     "node_modules/vscode-oniguruma": {
       "version": "1.7.0",
       "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",
@@ -7581,6 +7619,24 @@
       "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
       "dev": true
     },
+    "vectordb-darwin-arm64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
+      "integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
+      "optional": true
+    },
+    "vectordb-darwin-x64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
+      "integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
+      "optional": true
+    },
+    "vectordb-linux-x64-gnu": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
+      "integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
+      "optional": true
+    },
     "vscode-oniguruma": {
       "version": "1.7.0",
       "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",

View File

@@ -8,7 +8,7 @@
"tsc": "tsc -b", "tsc": "tsc -b",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json", "build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release", "build-release": "npm run build -- --release",
"test": "npm run tsc; mocha -recursive dist/test", "test": "npm run tsc && mocha -recursive dist/test",
"lint": "eslint src --ext .js,.ts", "lint": "eslint src --ext .js,.ts",
"clean": "rm -rf node_modules *.node dist/", "clean": "rm -rf node_modules *.node dist/",
"pack-build": "neon pack-build", "pack-build": "neon pack-build",
@@ -60,7 +60,8 @@
}, },
"os": [ "os": [
"darwin", "darwin",
"linux" "linux",
"win32"
], ],
"cpu": [ "cpu": [
"x64", "x64",
@@ -71,13 +72,15 @@
"x86_64-apple-darwin": "vectordb-darwin-x64", "x86_64-apple-darwin": "vectordb-darwin-x64",
"aarch64-apple-darwin": "vectordb-darwin-arm64", "aarch64-apple-darwin": "vectordb-darwin-arm64",
"x86_64-unknown-linux-gnu": "vectordb-linux-x64-gnu", "x86_64-unknown-linux-gnu": "vectordb-linux-x64-gnu",
"aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu" "aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu",
"x86_64-pc-windows-msvc": "vectordb-win32-x64-msvc"
} }
}, },
"optionalDependencies": { "optionalDependencies": {
"vectordb-darwin-arm64": "0.1.13", "vectordb-darwin-arm64": "0.1.13",
"vectordb-darwin-x64": "0.1.13", "vectordb-darwin-x64": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13",
"vectordb-linux-x64-gnu": "0.1.13", "vectordb-linux-x64-gnu": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13" "vectordb-win32-x64-msvc": "0.1.13"
} }
} }

View File

@@ -13,11 +13,12 @@
 from __future__ import annotations
 
+import functools
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple, Union
 
+import pandas as pd
 import pyarrow as pa
 from pyarrow import fs
@@ -38,8 +39,10 @@ class DBConnection(ABC):
     def create_table(
         self,
         name: str,
-        data: DATA = None,
-        schema: pa.Schema = None,
+        data: Optional[
+            Union[List[dict], dict, pd.DataFrame, pa.Table, Iterable[pa.RecordBatch]],
+        ] = None,
+        schema: Optional[pa.Schema] = None,
         mode: str = "create",
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
@@ -51,7 +54,7 @@ class DBConnection(ABC):
         name: str
             The name of the table.
         data: list, tuple, dict, pd.DataFrame; optional
-            The data to insert into the table.
+            The data to initialize the table. User must provide at least one of `data` or `schema`.
         schema: pyarrow.Schema; optional
             The schema of the table.
         mode: str; default "create"
@@ -64,16 +67,16 @@ class DBConnection(ABC):
         fill_value: float
             The value to use when filling vectors. Only used if on_bad_vectors="fill".
 
-        Note
-        ----
-        The vector index won't be created by default.
-        To create the index, call the `create_index` method on the table.
-
         Returns
         -------
         LanceTable
             A reference to the newly created table.
 
+        !!! note
+            The vector index won't be created by default.
+            To create the index, call the `create_index` method on the table.
+
         Examples
         --------
@@ -119,7 +122,7 @@ class DBConnection(ABC):
         Data is converted to Arrow before being written to disk. For maximum
         control over how data is saved, either provide the PyArrow schema to
-        convert to or else provide a PyArrow table directly.
+        convert to or else provide a [PyArrow Table](pyarrow.Table) directly.
 
         >>> custom_schema = pa.schema([
         ...     pa.field("vector", pa.list_(pa.float32(), 2)),
@@ -138,6 +141,30 @@ class DBConnection(ABC):
         vector: [[[1.1,1.2],[0.2,1.8]]]
         lat: [[45.5,40.1]]
         long: [[-122.7,-74.1]]
+
+        It is also possible to create a table from `Iterable[pa.RecordBatch]`:
+
+        >>> import pyarrow as pa
+        >>> def make_batches():
+        ...     for i in range(5):
+        ...         yield pa.RecordBatch.from_arrays(
+        ...             [
+        ...                 pa.array([[3.1, 4.1], [5.9, 26.5]]),
+        ...                 pa.array(["foo", "bar"]),
+        ...                 pa.array([10.0, 20.0]),
+        ...             ],
+        ...             ["vector", "item", "price"],
+        ...         )
+        >>> schema = pa.schema([
+        ...     pa.field("vector", pa.list_(pa.float32())),
+        ...     pa.field("item", pa.utf8()),
+        ...     pa.field("price", pa.float32()),
+        ... ])
+        >>> db.create_table("table4", make_batches(), schema=schema)
+        LanceTable(table4)
         """
         raise NotImplementedError
@@ -252,7 +279,7 @@ class LanceDBConnection(DBConnection):
     def create_table(
         self,
         name: str,
-        data: DATA = None,
+        data: Optional[Union[List[dict], dict, pd.DataFrame]] = None,
         schema: pa.Schema = None,
         mode: str = "create",
         on_bad_vectors: str = "error",
@@ -260,114 +287,22 @@ class LanceDBConnection(DBConnection):
     ) -> LanceTable:
         """Create a table in the database.
 
-        Parameters
-        ----------
-        name: str
-            The name of the table.
-        data: list, tuple, dict, pd.DataFrame; optional
-            The data to insert into the table.
-        schema: pyarrow.Schema; optional
-            The schema of the table.
-        mode: str; default "create"
-            The mode to use when creating the table. Can be either "create" or "overwrite".
-            By default, if the table already exists, an exception is raised.
-            If you want to overwrite the table, use mode="overwrite".
-        on_bad_vectors: str, default "error"
-            What to do if any of the vectors are not the same size or contains NaNs.
-            One of "error", "drop", "fill".
-        fill_value: float
-            The value to use when filling vectors. Only used if on_bad_vectors="fill".
-
-        Note
-        ----
-        The vector index won't be created by default.
-        To create the index, call the `create_index` method on the table.
-
-        Returns
-        -------
-        LanceTable
-            A reference to the newly created table.
-
-        Examples
-        --------
-
-        Can create with list of tuples or dictionaries:
-
-        >>> import lancedb
-        >>> db = lancedb.connect("./.lancedb")
-        >>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
-        ...         {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
-        >>> db.create_table("my_table", data)
-        LanceTable(my_table)
-        >>> db["my_table"].head()
-        pyarrow.Table
-        vector: fixed_size_list<item: float>[2]
-          child 0, item: float
-        lat: double
-        long: double
-        ----
-        vector: [[[1.1,1.2],[0.2,1.8]]]
-        lat: [[45.5,40.1]]
-        long: [[-122.7,-74.1]]
-
-        You can also pass a pandas DataFrame:
-
-        >>> import pandas as pd
-        >>> data = pd.DataFrame({
-        ...     "vector": [[1.1, 1.2], [0.2, 1.8]],
-        ...     "lat": [45.5, 40.1],
-        ...     "long": [-122.7, -74.1]
-        ... })
-        >>> db.create_table("table2", data)
-        LanceTable(table2)
-        >>> db["table2"].head()
-        pyarrow.Table
-        vector: fixed_size_list<item: float>[2]
-          child 0, item: float
-        lat: double
-        long: double
-        ----
-        vector: [[[1.1,1.2],[0.2,1.8]]]
-        lat: [[45.5,40.1]]
-        long: [[-122.7,-74.1]]
-
-        Data is converted to Arrow before being written to disk. For maximum
-        control over how data is saved, either provide the PyArrow schema to
-        convert to or else provide a PyArrow table directly.
-
-        >>> custom_schema = pa.schema([
-        ...     pa.field("vector", pa.list_(pa.float32(), 2)),
-        ...     pa.field("lat", pa.float32()),
-        ...     pa.field("long", pa.float32())
-        ... ])
-        >>> db.create_table("table3", data, schema = custom_schema)
-        LanceTable(table3)
-        >>> db["table3"].head()
-        pyarrow.Table
-        vector: fixed_size_list<item: float>[2]
-          child 0, item: float
-        lat: float
-        long: float
-        ----
-        vector: [[[1.1,1.2],[0.2,1.8]]]
-        lat: [[45.5,40.1]]
-        long: [[-122.7,-74.1]]
+        See
+        ---
+        DBConnection.create_table
         """
         if mode.lower() not in ["create", "overwrite"]:
             raise ValueError("mode must be either 'create' or 'overwrite'")
-        if data is not None:
-            tbl = LanceTable.create(
-                self,
-                name,
-                data,
-                schema,
-                mode=mode,
-                on_bad_vectors=on_bad_vectors,
-                fill_value=fill_value,
-            )
-        else:
-            tbl = LanceTable.open(self, name)
+        tbl = LanceTable.create(
+            self,
+            name,
+            data,
+            schema,
+            mode=mode,
+            on_bad_vectors=on_bad_vectors,
+            fill_value=fill_value,
+        )
         return tbl
 
     def open_table(self, name: str) -> LanceTable:
def open_table(self, name: str) -> LanceTable: def open_table(self, name: str) -> LanceTable:

View File

@@ -18,7 +18,7 @@ from __future__ import annotations
 import inspect
 import sys
 import types
-from abc import ABC, abstractstaticmethod
+from abc import ABC, abstractmethod
 from typing import Any, List, Type, Union, _GenericAlias
 
 import pyarrow as pa
@@ -27,11 +27,13 @@ from pydantic_core import CoreSchema, core_schema
 
 class FixedSizeListMixin(ABC):
-    @abstractstaticmethod
+    @staticmethod
+    @abstractmethod
     def dim() -> int:
         raise NotImplementedError
 
-    @abstractstaticmethod
+    @staticmethod
+    @abstractmethod
     def value_arrow_type() -> pa.DataType:
         raise NotImplementedError
@@ -41,9 +43,15 @@ def vector(
 ) -> Type[FixedSizeListMixin]:
     """Pydantic Vector Type.
 
-    Note
-    ----
-    Experimental feature.
+    !!! warning
+        Experimental feature.
+
+    Parameters
+    ----------
+    dim : int
+        The dimension of the vector.
+    value_type : pyarrow.DataType, optional
+        The value type of the vector, by default pa.float32()
 
     Examples
     --------
@@ -52,9 +60,15 @@ def vector(
     >>> from lancedb.pydantic import vector
     ...
     >>> class MyModel(pydantic.BaseModel):
-    ...     vector: vector(756)
     ...     id: int
-    ...     description: str
+    ...     url: str
+    ...     embeddings: vector(768)
+    >>> schema = pydantic_to_schema(MyModel)
+    >>> assert schema == pa.schema([
+    ...     pa.field("id", pa.int64(), False),
+    ...     pa.field("url", pa.utf8(), False),
+    ...     pa.field("embeddings", pa.list_(pa.float32(), 768), False)
+    ... ])
     """
     # TODO: make a public parameterized type.
@@ -163,7 +177,36 @@ def pydantic_to_schema(model: Type[pydantic.BaseModel]) -> pa.Schema:
     Returns
     -------
-    A PyArrow Schema.
+    pyarrow.Schema
+
+    Examples
+    --------
+    >>> from typing import List, Optional
+    >>> import pydantic
+    >>> from lancedb.pydantic import pydantic_to_schema
+    ...
+    >>> class InnerModel(pydantic.BaseModel):
+    ...     a: str
+    ...     b: Optional[float]
+    >>>
+    >>> class FooModel(pydantic.BaseModel):
+    ...     id: int
+    ...     s: Optional[str] = None
+    ...     vec: List[float]
+    ...     li: List[int]
+    ...     inner: InnerModel
+    >>> schema = pydantic_to_schema(FooModel)
+    >>> assert schema == pa.schema([
+    ...     pa.field("id", pa.int64(), False),
+    ...     pa.field("s", pa.utf8(), True),
+    ...     pa.field("vec", pa.list_(pa.float64()), False),
+    ...     pa.field("li", pa.list_(pa.int64()), False),
+    ...     pa.field("inner", pa.struct([
+    ...         pa.field("a", pa.utf8(), False),
+    ...         pa.field("b", pa.float64(), True),
+    ...     ]), False),
+    ... ])
     """
     fields = _pydantic_model_to_fields(model)
     return pa.schema(fields)

View File

@@ -226,6 +226,7 @@ class LanceQueryBuilder:
             columns=self._columns,
             nprobes=self._nprobes,
             refine_factor=self._refine_factor,
+            vector_column=self._vector_column,
         )
         return self._table._execute_query(query)

View File

@@ -0,0 +1,22 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pyarrow as pa


def to_ipc_binary(table: pa.Table) -> bytes:
    """Serialize a PyArrow Table to IPC binary."""
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue().to_pybytes()
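
As a companion sketch (an editorial addition, not part of the diff), the payload produced by `to_ipc_binary` can be read back with PyArrow's IPC stream reader:

```py
import pyarrow as pa

def from_ipc_binary(data: bytes) -> pa.Table:
    """Deserialize IPC stream bytes back into a PyArrow Table."""
    reader = pa.ipc.open_stream(data)
    return reader.read_all()
```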

View File

@@ -13,7 +13,7 @@
 import functools
-from typing import Any, Callable, Dict, Union
+from typing import Any, Callable, Dict, Optional, Union
 
 import aiohttp
 import attr
@@ -24,6 +24,8 @@ from lancedb.common import Credential
 from lancedb.remote import VectorQuery, VectorQueryResult
 from lancedb.remote.errors import LanceDBClientError
 
+ARROW_STREAM_CONTENT_TYPE = "application/vnd.apache.arrow.stream"
+
 
 def _check_not_closed(f):
     @functools.wraps(f)
@@ -59,9 +61,12 @@ class RestfulLanceDBClient:
     @functools.cached_property
     def headers(self) -> Dict[str, str]:
-        return {
+        headers = {
             "x-api-key": self.api_key,
         }
+        if self.region == "local":  # Local test mode
+            headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
+        return headers
 
     @staticmethod
     async def _check_status(resp: aiohttp.ClientResponse):
@@ -93,7 +98,9 @@ class RestfulLanceDBClient:
     async def post(
         self,
         uri: str,
-        data: Union[Dict[str, Any], BaseModel],
+        data: Union[Dict[str, Any], BaseModel, bytes],
+        params: Optional[Dict[str, Any]] = None,
+        content_type: Optional[str] = None,
         deserialize: Callable = lambda resp: resp.json(),
     ) -> Dict[str, Any]:
         """Send a POST request and returns the deserialized response payload.
@@ -107,10 +114,19 @@ class RestfulLanceDBClient:
         """
         if isinstance(data, BaseModel):
             data: Dict[str, Any] = data.dict(exclude_none=True)
+        if isinstance(data, bytes):
+            req_kwargs = {"data": data}
+        else:
+            req_kwargs = {"json": data}
+
+        headers = self.headers.copy()
+        if content_type is not None:
+            headers["content-type"] = content_type
         async with self.session.post(
             uri,
-            json=data,
-            headers=self.headers,
+            headers=headers,
+            params=params,
+            **req_kwargs,
         ) as resp:
             resp: aiohttp.ClientResponse = resp
             await self._check_status(resp)
@@ -119,11 +135,11 @@ class RestfulLanceDBClient:
     @_check_not_closed
     async def list_tables(self):
         """List all tables in the database."""
-        json = await self.get("/1/table/", {})
+        json = await self.get("/v1/table/", {})
         return json["tables"]
 
     @_check_not_closed
     async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
         """Query a table."""
-        tbl = await self.post(f"/1/table/{table_name}/", query, deserialize=_read_ipc)
+        tbl = await self.post(f"/v1/table/{table_name}/", query, deserialize=_read_ipc)
         return VectorQueryResult(tbl)

View File

@@ -12,6 +12,7 @@
 # limitations under the License.
 
 import asyncio
+import uuid
 from typing import List
 from urllib.parse import urlparse
@@ -19,9 +20,11 @@ import pyarrow as pa
 from lancedb.common import DATA
 from lancedb.db import DBConnection
-from lancedb.table import Table
+from lancedb.schema import schema_to_json
+from lancedb.table import Table, _sanitize_data
 
-from .client import RestfulLanceDBClient
+from .arrow import to_ipc_binary
+from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
 
 
 class RemoteDBConnection(DBConnection):
@@ -71,8 +74,31 @@ class RemoteDBConnection(DBConnection):
         name: str,
         data: DATA = None,
         schema: pa.Schema = None,
+        mode: str = "create",
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
     ) -> Table:
-        raise NotImplementedError
+        if data is None and schema is None:
+            raise ValueError("Either data or schema must be provided.")
+        if data is not None:
+            data = _sanitize_data(
+                data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
+            )
+        else:
+            if schema is None:
+                raise ValueError("Either data or schema must be provided")
+            data = pa.Table.from_pylist([], schema=schema)
+
+        from .table import RemoteTable
+
+        data = to_ipc_binary(data)
+        request_id = uuid.uuid4().hex
+
+        self._loop.run_until_complete(
+            self._client.post(
+                f"/v1/table/{name}/create",
+                data=data,
+                params={"request_id": request_id},
+                content_type=ARROW_STREAM_CONTENT_TYPE,
+            )
+        )
+        return RemoteTable(self, name)

View File

@@ -11,6 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import uuid
+from functools import cached_property
 from typing import Union
 
 import pyarrow as pa
@@ -18,7 +20,10 @@ import pyarrow as pa
 from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
 
 from ..query import LanceQueryBuilder, Query
-from ..table import Query, Table
+from ..schema import json_to_schema
+from ..table import Query, Table, _sanitize_data
+from .arrow import to_ipc_binary
+from .client import ARROW_STREAM_CONTENT_TYPE
 from .db import RemoteDBConnection
@@ -30,8 +35,14 @@ class RemoteTable(Table):
     def __repr__(self) -> str:
         return f"RemoteTable({self._conn.db_name}.{self.name})"
 
+    @cached_property
     def schema(self) -> pa.Schema:
-        raise NotImplementedError
+        """Return the schema of the table."""
+        resp = self._conn._loop.run_until_complete(
+            self._conn._client.get(f"/v1/table/{self._name}/describe")
+        )
+        schema = json_to_schema(resp["schema"])
+        return schema
 
     def to_arrow(self) -> pa.Table:
         raise NotImplementedError
@@ -53,7 +64,22 @@ class RemoteTable(Table):
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
     ) -> int:
-        raise NotImplementedError
+        data = _sanitize_data(
+            data, self.schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
+        )
+        payload = to_ipc_binary(data)
+
+        request_id = uuid.uuid4().hex
+
+        self._conn._loop.run_until_complete(
+            self._conn._client.post(
+                f"/v1/table/{self._name}/insert",
+                data=payload,
+                params={"request_id": request_id, "mode": mode},
+                content_type=ARROW_STREAM_CONTENT_TYPE,
+            )
+        )
+        return len(data)
 
     def search(
         self, query: Union[VEC, str], vector_column: str = VECTOR_COLUMN_NAME

View File

@@ -13,10 +13,10 @@
"""Schema related utilities.""" """Schema related utilities."""
import json
from typing import Any, Dict, Type from typing import Any, Dict, Type
import pyarrow as pa import pyarrow as pa
from lance import json_to_schema, schema_to_json
def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType: def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType:
@@ -43,247 +43,3 @@ def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataTyp
... ]) ... ])
""" """
return pa.list_(value_type, dimension) return pa.list_(value_type, dimension)
def _type_to_dict(dt: pa.DataType) -> Dict[str, Any]:
if pa.types.is_boolean(dt):
return {"type": "boolean"}
elif pa.types.is_int8(dt):
return {"type": "int8"}
elif pa.types.is_int16(dt):
return {"type": "int16"}
elif pa.types.is_int32(dt):
return {"type": "int32"}
elif pa.types.is_int64(dt):
return {"type": "int64"}
elif pa.types.is_uint8(dt):
return {"type": "uint8"}
elif pa.types.is_uint16(dt):
return {"type": "uint16"}
elif pa.types.is_uint32(dt):
return {"type": "uint32"}
elif pa.types.is_uint64(dt):
return {"type": "uint64"}
elif pa.types.is_float16(dt):
return {"type": "float16"}
elif pa.types.is_float32(dt):
return {"type": "float32"}
elif pa.types.is_float64(dt):
return {"type": "float64"}
elif pa.types.is_date32(dt):
return {"type": f"date32"}
elif pa.types.is_date64(dt):
return {"type": f"date64"}
elif pa.types.is_time32(dt):
return {"type": f"time32:{dt.unit}"}
elif pa.types.is_time64(dt):
return {"type": f"time64:{dt.unit}"}
elif pa.types.is_timestamp(dt):
return {"type": f"timestamp:{dt.unit}:{dt.tz if dt.tz is not None else ''}"}
elif pa.types.is_string(dt):
return {"type": "string"}
elif pa.types.is_binary(dt):
return {"type": "binary"}
elif pa.types.is_large_string(dt):
return {"type": "large_string"}
elif pa.types.is_large_binary(dt):
return {"type": "large_binary"}
elif pa.types.is_fixed_size_binary(dt):
return {"type": "fixed_size_binary", "width": dt.byte_width}
elif pa.types.is_fixed_size_list(dt):
return {
"type": "fixed_size_list",
"width": dt.list_size,
"value_type": _type_to_dict(dt.value_type),
}
elif pa.types.is_list(dt):
return {
"type": "list",
"value_type": _type_to_dict(dt.value_type),
}
elif pa.types.is_struct(dt):
return {
"type": "struct",
"fields": [_field_to_dict(dt.field(i)) for i in range(dt.num_fields)],
}
elif pa.types.is_dictionary(dt):
return {
"type": "dictionary",
"index_type": _type_to_dict(dt.index_type),
"value_type": _type_to_dict(dt.value_type),
}
# TODO: support extension types
raise TypeError(f"Unsupported type: {dt}")
def _field_to_dict(field: pa.field) -> Dict[str, Any]:
ret = {
"name": field.name,
"type": _type_to_dict(field.type),
"nullable": field.nullable,
}
if field.metadata is not None:
ret["metadata"] = field.metadata
return ret
def schema_to_dict(schema: pa.Schema) -> Dict[str, Any]:
"""Convert a PyArrow [Schema](pyarrow.Schema) to a dictionary.
Parameters
----------
schema : pa.Schema
The PyArrow Schema to convert
Returns
-------
A dict of the data type.
Examples
--------
>>> import pyarrow as pa
>>> import lancedb
>>> schema = pa.schema(
... [
... pa.field("id", pa.int64()),
... pa.field("vector", lancedb.vector(512), nullable=False),
... pa.field(
... "struct",
... pa.struct(
... [
... pa.field("a", pa.utf8()),
... pa.field("b", pa.float32()),
... ]
... ),
... True,
... ),
... ],
... metadata={"key": "value"},
... )
>>> json_schema = schema_to_dict(schema)
>>> assert json_schema == {
... "fields": [
... {"name": "id", "type": {"type": "int64"}, "nullable": True},
... {
... "name": "vector",
... "type": {
... "type": "fixed_size_list",
... "value_type": {"type": "float32"},
... "width": 512,
... },
... "nullable": False,
... },
... {
... "name": "struct",
... "type": {
... "type": "struct",
... "fields": [
... {"name": "a", "type": {"type": "string"}, "nullable": True},
... {"name": "b", "type": {"type": "float32"}, "nullable": True},
... ],
... },
... "nullable": True,
... },
... ],
... "metadata": {"key": "value"},
... }
"""
fields = []
for name in schema.names:
field = schema.field(name)
fields.append(_field_to_dict(field))
json_schema = {
"fields": fields,
"metadata": {
k.decode("utf-8"): v.decode("utf-8") for (k, v) in schema.metadata.items()
}
if schema.metadata is not None
else {},
}
return json_schema
def _dict_to_type(dt: Dict[str, Any]) -> pa.DataType:
type_name = dt["type"]
try:
return {
"boolean": pa.bool_(),
"int8": pa.int8(),
"int16": pa.int16(),
"int32": pa.int32(),
"int64": pa.int64(),
"uint8": pa.uint8(),
"uint16": pa.uint16(),
"uint32": pa.uint32(),
"uint64": pa.uint64(),
"float16": pa.float16(),
"float32": pa.float32(),
"float64": pa.float64(),
"string": pa.string(),
"binary": pa.binary(),
"large_string": pa.large_string(),
"large_binary": pa.large_binary(),
"date32": pa.date32(),
"date64": pa.date64(),
}[type_name]
except KeyError:
pass
if type_name == "fixed_size_binary":
return pa.binary(dt["width"])
elif type_name == "fixed_size_list":
return pa.list_(_dict_to_type(dt["value_type"]), dt["width"])
elif type_name == "list":
return pa.list_(_dict_to_type(dt["value_type"]))
elif type_name == "struct":
fields = []
for field in dt["fields"]:
fields.append(_dict_to_field(field))
return pa.struct(fields)
elif type_name == "dictionary":
return pa.dictionary(
_dict_to_type(dt["index_type"]), _dict_to_type(dt["value_type"])
)
elif type_name.startswith("time32:"):
return pa.time32(type_name.split(":")[1])
elif type_name.startswith("time64:"):
return pa.time64(type_name.split(":")[1])
elif type_name.startswith("timestamp:"):
fields = type_name.split(":")
unit = fields[1]
tz = fields[2] if len(fields) > 2 else None
return pa.timestamp(unit, tz)
raise TypeError(f"Unsupported type: {dt}")
def _dict_to_field(field: Dict[str, Any]) -> pa.Field:
name = field["name"]
nullable = field["nullable"] if "nullable" in field else True
dt = _dict_to_type(field["type"])
metadata = field.get("metadata", None)
return pa.field(name, dt, nullable, metadata)
def dict_to_schema(json: Dict[str, Any]) -> pa.Schema:
"""Reconstruct a PyArrow Schema from a JSON dict.
Parameters
----------
json : Dict[str, Any]
The JSON dict to reconstruct Schema from.
Returns
-------
A PyArrow Schema.
"""
fields = []
for field in json["fields"]:
fields.append(_dict_to_field(field))
metadata = {
k.encode("utf-8"): v.encode("utf-8")
for (k, v) in json.get("metadata", {}).items()
}
return pa.schema(fields, metadata)
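
The hand-rolled converters removed above are superseded by `schema_to_json` and `json_to_schema` from the `lance` package, imported at the top of this file. A small round-trip sketch under that assumption, with signatures inferred from their usage in this diff:

```py
import pyarrow as pa
from lance import json_to_schema, schema_to_json

schema = pa.schema([
    pa.field("id", pa.int64()),
    pa.field("vector", pa.list_(pa.float32(), 512), nullable=False),
])

# Serialize to a JSON-compatible dict and reconstruct; the round trip
# should preserve the schema.
assert json_to_schema(schema_to_json(schema)) == schema
```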

View File

@@ -16,7 +16,7 @@ from __future__ import annotations
 import os
 from abc import ABC, abstractmethod
 from functools import cached_property
-from typing import List, Union
+from typing import Iterable, List, Union
 
 import lance
 import numpy as np
@@ -44,7 +44,7 @@ def _sanitize_data(data, schema, on_bad_vectors, fill_value):
         data = _sanitize_schema(
             data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
         )
-    if not isinstance(data, pa.Table):
+    if not isinstance(data, (pa.Table, Iterable)):
         raise TypeError(f"Unsupported data type: {type(data)}")
     return data
@@ -483,7 +483,7 @@ class LanceTable(Table):
         if schema is None:
             raise ValueError("Either data or schema must be provided")
         data = pa.Table.from_pylist([], schema=schema)
-        lance.write_dataset(data, tbl._dataset_uri, mode=mode)
+        lance.write_dataset(data, tbl._dataset_uri, schema=schema, mode=mode)
         return LanceTable(db, name)
 
     @classmethod
View File

@@ -1,7 +1,7 @@
 [project]
 name = "lancedb"
-version = "0.1.10"
-dependencies = ["pylance~=0.5.0", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic>=2", "attr"]
+version = "0.1.11"
+dependencies = ["pylance~=0.5.8", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic>=2", "attr"]
 description = "lancedb"
 authors = [
     { name = "LanceDB Devs", email = "dev@lancedb.com" },

View File

@@ -13,6 +13,7 @@
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import lancedb
@@ -75,6 +76,32 @@ def test_ingest_pd(tmp_path):
     assert db.open_table("test").name == db["test"].name
 
 
+def test_ingest_record_batch_iterator(tmp_path):
+    def batch_reader():
+        for i in range(5):
+            yield pa.RecordBatch.from_arrays(
+                [
+                    pa.array([[3.1, 4.1], [5.9, 26.5]]),
+                    pa.array(["foo", "bar"]),
+                    pa.array([10.0, 20.0]),
+                ],
+                ["vector", "item", "price"],
+            )
+
+    db = lancedb.connect(tmp_path)
+    tbl = db.create_table(
+        "test",
+        batch_reader(),
+        schema=pa.schema(
+            [
+                pa.field("vector", pa.list_(pa.float32())),
+                pa.field("item", pa.utf8()),
+                pa.field("price", pa.float32()),
+            ]
+        ),
+    )
+
+
 def test_create_mode(tmp_path):
     db = lancedb.connect(tmp_path)
     data = pd.DataFrame(
@@ -131,6 +158,9 @@ def test_empty_or_nonexistent_table(tmp_path):
     with pytest.raises(Exception):
         db.open_table("does_not_exist")
 
+    schema = pa.schema([pa.field("a", pa.int32())])
+    db.create_table("test", schema=schema)
+
 
 def test_replace_index(tmp_path):
     db = lancedb.connect(uri=tmp_path)

View File

@@ -119,6 +119,7 @@ def test_query_builder_with_different_vector_column():
columns=["b"], columns=["b"],
nprobes=20, nprobes=20,
refine_factor=None, refine_factor=None,
vector_column="foo_vector",
) )
) )

View File

@@ -1,109 +0,0 @@
-# Copyright 2023 LanceDB Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pyarrow as pa
-
-import lancedb
-from lancedb.schema import dict_to_schema, schema_to_dict
-
-
-def test_schema_to_dict():
-    schema = pa.schema(
-        [
-            pa.field("id", pa.int64()),
-            pa.field("vector", lancedb.vector(512), nullable=False),
-            pa.field(
-                "struct",
-                pa.struct(
-                    [
-                        pa.field("a", pa.utf8()),
-                        pa.field("b", pa.float32()),
-                    ]
-                ),
-                True,
-            ),
-            pa.field("d", pa.dictionary(pa.int64(), pa.utf8()), False),
-        ],
-        metadata={"key": "value"},
-    )
-    json_schema = schema_to_dict(schema)
-    assert json_schema == {
-        "fields": [
-            {"name": "id", "type": {"type": "int64"}, "nullable": True},
-            {
-                "name": "vector",
-                "type": {
-                    "type": "fixed_size_list",
-                    "value_type": {"type": "float32"},
-                    "width": 512,
-                },
-                "nullable": False,
-            },
-            {
-                "name": "struct",
-                "type": {
-                    "type": "struct",
-                    "fields": [
-                        {"name": "a", "type": {"type": "string"}, "nullable": True},
-                        {"name": "b", "type": {"type": "float32"}, "nullable": True},
-                    ],
-                },
-                "nullable": True,
-            },
-            {
-                "name": "d",
-                "type": {
-                    "type": "dictionary",
-                    "index_type": {"type": "int64"},
-                    "value_type": {"type": "string"},
-                },
-                "nullable": False,
-            },
-        ],
-        "metadata": {"key": "value"},
-    }
-
-    actual_schema = dict_to_schema(json_schema)
-    assert actual_schema == schema
-
-
-def test_temporal_types():
-    schema = pa.schema(
-        [
-            pa.field("t32", pa.time32("s")),
-            pa.field("t32ms", pa.time32("ms")),
-            pa.field("t64", pa.time64("ns")),
-            pa.field("ts", pa.timestamp("s")),
-            pa.field("ts_us_tz", pa.timestamp("us", tz="America/New_York")),
-        ],
-    )
-    json_schema = schema_to_dict(schema)
-    assert json_schema == {
-        "fields": [
-            {"name": "t32", "type": {"type": "time32:s"}, "nullable": True},
-            {"name": "t32ms", "type": {"type": "time32:ms"}, "nullable": True},
-            {"name": "t64", "type": {"type": "time64:ns"}, "nullable": True},
-            {"name": "ts", "type": {"type": "timestamp:s:"}, "nullable": True},
-            {
-                "name": "ts_us_tz",
-                "type": {"type": "timestamp:us:America/New_York"},
-                "nullable": True,
-            },
-        ],
-        "metadata": {},
-    }
-
-    actual_schema = dict_to_schema(json_schema)
-    assert actual_schema == schema

View File

@@ -15,6 +15,7 @@ arrow-ipc = { workspace = true }
 arrow-schema = { workspace = true }
 once_cell = "1"
 futures = "0.3"
+half = { workspace = true }
 lance = { workspace = true }
 vectordb = { path = "../../vectordb" }
 tokio = { version = "1.23", features = ["rt-multi-thread"] }

View File

@@ -13,6 +13,7 @@ arrow-data = { workspace = true }
 arrow-schema = { workspace = true }
 object_store = { workspace = true }
 snafu = "0.7.4"
+half = { workspace = true }
 lance = { workspace = true }
 tokio = { version = "1.23", features = ["rt-multi-thread"] }

View File

@@ -27,6 +27,7 @@ pub struct Database {
     object_store: ObjectStore,
 
     pub(crate) uri: String,
+    pub(crate) base_path: object_store::path::Path,
 }
 
 const LANCE_EXTENSION: &str = "lance";
@@ -43,12 +44,13 @@ impl Database {
     ///
     /// * A [Database] object.
     pub async fn connect(uri: &str) -> Result<Database> {
-        let (object_store, _) = ObjectStore::from_uri(uri).await?;
+        let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
         if object_store.is_local() {
             Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
         }
         Ok(Database {
             uri: uri.to_string(),
+            base_path,
             object_store,
         })
     }
@@ -70,7 +72,7 @@ impl Database {
     pub async fn table_names(&self) -> Result<Vec<String>> {
         let f = self
             .object_store
-            .read_dir(self.uri.as_str())
+            .read_dir(self.base_path.clone())
             .await?
             .iter()
             .map(|fname| Path::new(fname))
@@ -141,8 +143,9 @@ impl Database {
     /// # Arguments
     /// * `name` - The name of the table.
     pub async fn drop_table(&self, name: &str) -> Result<()> {
-        let dir_name = format!("{}/{}.{}", self.uri, name, LANCE_EXTENSION);
-        self.object_store.remove_dir_all(dir_name).await?;
+        let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
+        let full_path = self.base_path.child(dir_name.clone());
+        self.object_store.remove_dir_all(full_path).await?;
         Ok(())
     }
 }

View File

@@ -16,6 +16,7 @@ use std::path::Path;
 use std::sync::Arc;
 
 use arrow_array::{Float32Array, RecordBatchReader};
+use arrow_schema::SchemaRef;
 use lance::dataset::{Dataset, ReadParams, WriteParams};
 use lance::index::IndexType;
 use snafu::prelude::*;
@@ -144,6 +145,16 @@ impl Table {
         })
     }
 
+    /// Schema of this Table.
+    pub fn schema(&self) -> SchemaRef {
+        Arc::new(self.dataset.schema().into())
+    }
+
+    /// Version of this Table
+    pub fn version(&self) -> u64 {
+        self.dataset.version().version
+    }
+
     /// Create index on the table.
     pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
         use lance::index::DatasetIndexExt;
@@ -274,6 +285,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(not(windows))]
     fn test_object_store_path() {
         use std::path::Path as StdPath;
         let p = StdPath::new("s3://bucket/path/to/file");
@@ -350,10 +362,7 @@ mod tests {
             ..Default::default()
         };
-        table
-            .add(new_batches, Some(param))
-            .await
-            .unwrap();
+        table.add(new_batches, Some(param)).await.unwrap();
         assert_eq!(table.count_rows().await.unwrap(), 10);
         assert_eq!(table.name, "test");
     }