Compare commits

...

17 Commits

| Author | SHA1 | Message | Date |
| ------ | ---- | ------- | ---- |
| Lei Xu | 2704a4522c | Bump to 0.1.11 | 2023-07-17 12:45:17 -07:00 |
| Lei Xu | 030f07e7f0 | Bump minimal lance version to 0.5.8 (#318) | 2023-07-17 12:41:29 -07:00 |
| gsilvestrin | 72afa06b7a | feat(node): Add Windows support (#294) | 2023-07-17 08:48:24 -07:00 |
| Lei Xu | 088e745e1d | [Python] Create table with Iterator[RecordBatch] and add docs (#316) | 2023-07-16 21:45:55 -07:00 |
| Lei Xu | 7a57cddb2c | [Python] Add records to remote (#315) | 2023-07-16 13:24:38 -07:00 |
| Lei Xu | 8ff5f88916 | [Python] Bug fixes in remote API (#314) | 2023-07-16 11:09:19 -07:00 |
| Lei Xu | 028a6e433d | [Python] Get table schema (#313) | 2023-07-15 17:39:37 -07:00 |
| Lei Xu | 04c6814fb1 | [Rust] Expose Table schema and version in Rust (#312) | 2023-07-14 22:01:23 -07:00 |
| Lei Xu | c62e4ca1eb | Bump lance version to 0.5.7 (#311) | 2023-07-14 17:17:31 -07:00 |
| gsilvestrin | aecc5fc42b | feat(node): Fix npm publish task (#298) | 2023-07-14 13:39:15 -07:00 |
| Chang She | 2fdcb307eb | [python] Fix a few minor bugs (#304) | 2023-07-15 03:47:42 +08:00 |
| Tevin Wang | ad18826579 | [Documentation Code Testing] build node sdk in release (#307) | 2023-07-14 12:46:48 -07:00 |
| Leon Yee | a8a50591d7 | [docs] small fixes (#308) (Closes #288 and #287) | 2023-07-14 12:46:31 -07:00 |
| gsilvestrin | 6dfe7fabc2 | pin half (#310) | 2023-07-14 12:45:05 -07:00 |
| gsilvestrin | 2b108e1c80 | Updating package-lock.json file (#301) | 2023-07-13 17:50:01 -07:00 |
| Lei Xu | 8c9edafccc | [Doc] Add more Python integrations documents (#299) | 2023-07-13 17:09:39 -07:00 |
| Leon Yee | 0590413b96 | Added transformersJS example to docs and node/examples (#297) | 2023-07-13 17:01:36 -07:00 |
33 changed files with 719 additions and 528 deletions

View File

@@ -81,7 +81,7 @@ jobs:
run: |
cd docs/test/node_modules/vectordb
npm ci
npm run build
npm run build-release
npm run tsc
- name: Create test files
run: |

View File

@@ -116,6 +116,39 @@ jobs:
path: |
node/dist/vectordb-linux*.tgz
node-windows:
runs-on: windows-2022
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
target: [x86_64-pc-windows-msvc]
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Install Protoc v21.12
working-directory: C:\
run: |
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
7z x protoc.zip
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Install npm dependencies
run: |
cd node
npm ci
- name: Build Windows native node modules
run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
- name: Upload Windows Artifacts
uses: actions/upload-artifact@v3
with:
name: windows-native
path: |
node/dist/vectordb-win32*.tgz
release:
needs: [node, node-macos, node-linux]
runs-on: ubuntu-latest
@@ -132,6 +165,7 @@ jobs:
env:
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
run: |
for filename in */*.tgz; do
mv */*.tgz .
for filename in *.tgz; do
npm publish $filename
done

View File

@@ -66,3 +66,24 @@ jobs:
run: cargo build --all-features
- name: Run tests
run: cargo test --all-features
windows:
runs-on: windows-2022
steps:
- uses: actions/checkout@v3
- uses: Swatinem/rust-cache@v2
with:
workspaces: rust
- name: Install Protoc v21.12
working-directory: C:\
run: |
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
7z x protoc.zip
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Run tests
run: |
$env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT
cargo build
cargo test

View File

@@ -6,9 +6,11 @@ members = [
resolver = "2"
[workspace.dependencies]
lance = "=0.5.5"
lance = "=0.5.8"
arrow-array = "42.0"
arrow-data = "42.0"
arrow-schema = "42.0"
arrow-ipc = "42.0"
half = { "version" = "2.2.1", default-features = false }
object_store = "0.6.1"

View File

@@ -0,0 +1,41 @@
# Builds the Windows artifacts (node binaries).
# Usage: .\ci\build_windows_artifacts.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
function Prebuild-Rust {
param (
[string]$target
)
# Building here for the sake of easier debugging.
Push-Location -Path "rust/ffi/node"
Write-Host "Building rust library for $target"
$env:RUST_BACKTRACE=1
cargo build --release --target $target
Pop-Location
}
function Build-NodeBinaries {
param (
[string]$target
)
Push-Location -Path "node"
Write-Host "Building node library for $target"
npm run build-release -- --target $target
npm run pack-build -- --target $target
Pop-Location
}
$targets = $args[0]
if (-not $targets) {
$targets = "x86_64-pc-windows-msvc"
}
Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
Prebuild-Rust $target
Build-NodeBinaries $target
}

View File

@@ -60,6 +60,9 @@ nav:
- Python integrations:
- Pandas and PyArrow: python/arrow.md
- DuckDB: python/duckdb.md
- LangChain 🦜️🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
- LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
- Pydantic: python/pydantic.md
- Python examples:
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
@@ -68,6 +71,7 @@ nav:
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- Javascript examples:
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- References:
- Vector Search: search.md
- SQL filters: sql.md

View File

@@ -46,7 +46,7 @@ You can also use an external API like OpenAI to generate embeddings
def embed_func(c):
rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
return [record["embedding"] for record in rs["data"]]
return [record["embedding"] for record in rs["data"]]
```
=== "Javascript"
@@ -126,7 +126,7 @@ belong in the same latent space and your results will be nonsensical.
=== "Javascript"
```javascript
const results = await table
.search('What's the best pizza topping?')
.search("What's the best pizza topping?")
.limit(10)
.execute()
```

View File

@@ -0,0 +1,121 @@
# Vector embedding search using TransformersJS
## Embed and query data from LanceDB using TransformersJS
<img id="splash" width="400" alt="transformersjs" src="https://github.com/lancedb/lancedb/assets/43097991/88a31e30-3d6f-4eef-9216-4b7c688f1b4f">
This example shows how to use the [transformers.js](https://github.com/xenova/transformers.js) library to perform vector embedding search using LanceDB's Javascript API.
### Setting up
First, install the dependencies:
```bash
npm install vectordb
npm i @xenova/transformers
```
We will also use the [all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) model, which is compatible with Transformers.js.
Within our `index.js` file we will import the necessary libraries and define our model and database:
```javascript
const lancedb = require('vectordb')
const { pipeline } = await import('@xenova/transformers')
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
```
### Creating the embedding function
Next, we will create a function that takes in a string and returns its vector embedding, using the `pipe` function we defined earlier.
```javascript
// Define the function. `sourceColumn` is required for LanceDB to know
// which column to use as input.
const embed_fun = {}
embed_fun.sourceColumn = 'text'
embed_fun.embed = async function (batch) {
let result = []
// Given a batch of strings, we will use the `pipe` function to get
// the vector embedding of each string.
for (let text of batch) {
// 'mean' pooling averages the per-token embeddings into a single
// fixed-length vector; normalizing makes them suitable for cosine similarity.
const res = await pipe(text, { pooling: 'mean', normalize: true })
result.push(Array.from(res['data']))
}
return (result)
}
```
### Creating the database
Now, we will create the LanceDB database and a table that uses the embedding function we defined earlier.
```javascript
// Link a folder and create a table with data
const db = await lancedb.connect('data/sample-lancedb')
// You can also import any other data, but make sure that you have a column
// for the embedding function to use.
const data = [
{ id: 1, text: 'Cherry', type: 'fruit' },
{ id: 2, text: 'Carrot', type: 'vegetable' },
{ id: 3, text: 'Potato', type: 'vegetable' },
{ id: 4, text: 'Apple', type: 'fruit' },
{ id: 5, text: 'Banana', type: 'fruit' }
]
// Create the table with the embedding function
const table = await db.createTable('food_table', data, "create", embed_fun)
```
### Performing the search
Now, we can perform the search using the `search` function. LanceDB automatically uses the embedding function we defined earlier to get the vector embedding of the query string.
```javascript
// Query the table
const results = await table
.search("a sweet fruit to eat")
.metricType("cosine")
.limit(2)
.execute()
console.log(results.map(r => r.text))
```
```bash
[ 'Banana', 'Cherry' ]
```
Output of `results`:
```bash
[
{
vector: Float32Array(384) [
-0.057455405592918396,
0.03617725893855095,
-0.0367760956287384,
... 381 more items
],
id: 5,
text: 'Banana',
type: 'fruit',
score: 0.4919965863227844
},
{
vector: Float32Array(384) [
0.0009714411571621895,
0.008223623037338257,
0.009571489877998829,
... 381 more items
],
id: 1,
text: 'Cherry',
type: 'fruit',
score: 0.5540297031402588
}
]
```
### Wrapping it up
In this example, we showed how to use the `transformers.js` library to perform vector embedding search with LanceDB's Javascript API. You can find the full code for this example on [GitHub](https://github.com/lancedb/lancedb/blob/main/node/examples/js-transformers/index.js)!

View File

@@ -5,6 +5,8 @@ Built on top of [Apache Arrow](https://arrow.apache.org/),
`LanceDB` is easy to integrate with the Python ecosystem, including [Pandas](https://pandas.pydata.org/)
and PyArrow.
## Create dataset
First, we need to connect to a `LanceDB` database.
```py
@@ -27,10 +29,42 @@ data = pd.DataFrame({
table = db.create_table("pd_table", data=data)
```
You will find detailed instructions for creating datasets and indices in
[Basic Operations](basic.md) and [Indexing](ann_indexes.md)
Similar to [`pyarrow.write_dataset()`](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html),
[db.create_table()](../python/#lancedb.db.DBConnection.create_table) accepts a wide range of data forms.
For example, if you have a dataset that is larger than memory, you can create the table with an `Iterator[pyarrow.RecordBatch]`
to generate the data lazily:
```py
from typing import Iterable
import pyarrow as pa
import lancedb
def make_batches() -> Iterable[pa.RecordBatch]:
for i in range(5):
yield pa.RecordBatch.from_arrays(
[
pa.array([[3.1, 4.1], [5.9, 26.5]]),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"])
schema=pa.schema([
pa.field("vector", pa.list_(pa.float32())),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
])
table = db.create_table("iterable_table", data=make_batches(), schema=schema)
```
You will find detailed instructions for creating datasets in the
[Basic Operations](../basic.md) and [API](../python/#lancedb.db.DBConnection.create_table)
sections.
## Vector Search
We can now perform similarity search via the `LanceDB` Python API.
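For instance, here is a minimal sketch against the `pd_table` created above (it assumes the query builder's `to_df()` helper, which returns a pandas DataFrame):

```py
import lancedb

db = lancedb.connect("./.lancedb")
table = db.open_table("pd_table")

# Find the row whose vector is closest to the 2-dimensional query vector.
df = table.search([0.5, 1.5]).limit(1).to_df()
print(df)
```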

View File

@@ -0,0 +1,35 @@
# Pydantic
[Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python.
## Schema
LanceDB supports creating an Apache Arrow Schema from a
[Pydantic BaseModel](https://docs.pydantic.dev/latest/api/main/#pydantic.main.BaseModel)
via the [pydantic_to_schema()](python.md#lancedb.pydantic.pydantic_to_schema) method.
::: lancedb.pydantic.pydantic_to_schema
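A minimal sketch of the conversion (required `int`/`str` fields map to non-nullable `int64`/`utf8` Arrow fields, following the mapping table below):

```py
import pydantic
import pyarrow as pa
from lancedb.pydantic import pydantic_to_schema

class Item(pydantic.BaseModel):
    id: int
    text: str

# Required fields become non-nullable Arrow fields.
schema = pydantic_to_schema(Item)
assert schema == pa.schema([
    pa.field("id", pa.int64(), False),
    pa.field("text", pa.utf8(), False),
])
```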
## Vector Field
LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a
vector field in a Pydantic model.
::: lancedb.pydantic.vector
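For example, a sketch declaring a 768-dimensional vector field, mirroring the doctest shipped with `vector()`:

```py
import pydantic
import pyarrow as pa
from lancedb.pydantic import pydantic_to_schema, vector

class MyModel(pydantic.BaseModel):
    id: int
    embeddings: vector(768)

# vector(n) maps to a fixed-size list of float32 with n elements.
schema = pydantic_to_schema(MyModel)
assert schema.field("embeddings").type == pa.list_(pa.float32(), 768)
```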
## Type Conversion
LanceDB automatically converts Pydantic fields to
[Apache Arrow DataTypes](https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html#pyarrow.DataType).
Currently supported type conversions:
| Pydantic Field Type | PyArrow Data Type |
| ------------------- | ----------------- |
| `int` | `pyarrow.int64` |
| `float` | `pyarrow.float64` |
| `bool` | `pyarrow.bool_` |
| `str` | `pyarrow.utf8` |
| `list` | `pyarrow.List` |
| `BaseModel` | `pyarrow.Struct` |
| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
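Nested models and typed lists follow the same mapping; this sketch mirrors the `pydantic_to_schema` doctest:

```py
from typing import List, Optional

import pydantic
import pyarrow as pa
from lancedb.pydantic import pydantic_to_schema

class Inner(pydantic.BaseModel):
    a: str
    b: Optional[float]

class Outer(pydantic.BaseModel):
    vec: List[float]
    inner: Inner

# List[float] -> list<float64>; a nested BaseModel -> struct;
# Optional fields become nullable Arrow fields.
schema = pydantic_to_schema(Outer)
assert schema == pa.schema([
    pa.field("vec", pa.list_(pa.float64()), False),
    pa.field("inner", pa.struct([
        pa.field("a", pa.utf8(), False),
        pa.field("b", pa.float64(), True),
    ]), False),
])
```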

View File

@@ -46,10 +46,6 @@ pip install lancedb
## Utilities
::: lancedb.schema.schema_to_dict
::: lancedb.schema.dict_to_schema
::: lancedb.vector
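A quick sketch of what `lancedb.vector` produces; it is shorthand for a fixed-size list type:

```py
import pyarrow as pa
import lancedb

# vector(dim) defaults to float32 values.
assert lancedb.vector(512) == pa.list_(pa.float32(), 512)
```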
## Integrations

View File

@@ -7,6 +7,7 @@ const excludedFiles = [
"../src/embedding.md",
"../src/examples/serverless_lancedb_with_s3_and_lambda.md",
"../src/examples/serverless_qa_bot_with_modal_and_langchain.md",
"../src/examples/transformerjs_embedding_search_nodejs.md",
"../src/examples/youtube_transcript_bot_with_nodejs.md",
];
const nodePrefix = "javascript";
@@ -48,4 +49,4 @@ for (const file of files.filter((file) => !excludedFiles.includes(file))) {
fs.mkdirSync(path.dirname(outPath), { recursive: true });
fs.writeFileSync(outPath, asyncPrefix + "\n" + lines.join("\n") + asyncSuffix);
}
}
}

View File

@@ -0,0 +1,66 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'
async function example() {
const lancedb = require('vectordb')
// Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2)
const { pipeline } = await import('@xenova/transformers')
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
// Create embedding function from pipeline which returns a list of vectors from batch
// sourceColumn is the name of the column in the data to be embedded
//
// Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector
const embed_fun = {}
embed_fun.sourceColumn = 'text'
embed_fun.embed = async function (batch) {
let result = []
for (let text of batch) {
const res = await pipe(text, { pooling: 'mean', normalize: true })
result.push(Array.from(res['data']))
}
return (result)
}
// Link a folder and create a table with data
const db = await lancedb.connect('data/sample-lancedb')
const data = [
{ id: 1, text: 'Cherry', type: 'fruit' },
{ id: 2, text: 'Carrot', type: 'vegetable' },
{ id: 3, text: 'Potato', type: 'vegetable' },
{ id: 4, text: 'Apple', type: 'fruit' },
{ id: 5, text: 'Banana', type: 'fruit' }
]
const table = await db.createTable('food_table', data, "create", embed_fun)
// Query the table
const results = await table
.search("a sweet fruit to eat")
.metricType("cosine")
.limit(2)
.execute()
console.log(results.map(r => r.text))
}
example().then(_ => { console.log("Done!") })

View File

@@ -0,0 +1,16 @@
{
"name": "vectordb-example-js-transformers",
"version": "1.0.0",
"description": "Example for using transformers.js with lancedb",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"@xenova/transformers": "^2.4.1",
"vectordb": "^0.1.12"
}
}

node/package-lock.json (generated)
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.1.12",
"version": "0.1.13",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.1.12",
"version": "0.1.13",
"cpu": [
"x64",
"arm64"
@@ -14,7 +14,8 @@
"license": "Apache-2.0",
"os": [
"darwin",
"linux"
"linux",
"win32"
],
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
@@ -49,10 +50,11 @@
"typescript": "*"
},
"optionalDependencies": {
"vectordb-darwin-arm64": "0.1.12",
"vectordb-darwin-x64": "0.1.12",
"vectordb-linux-arm64-gnu": "0.1.12",
"vectordb-linux-x64-gnu": "0.1.12"
"vectordb-darwin-arm64": "0.1.13",
"vectordb-darwin-x64": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13",
"vectordb-linux-x64-gnu": "0.1.13",
"vectordb-win32-x64-msvc": "0.1.13"
}
},
"node_modules/@apache-arrow/ts": {
@@ -4286,6 +4288,42 @@
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
"dev": true
},
"node_modules/vectordb-darwin-arm64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
"integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/vectordb-darwin-x64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
"integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/vectordb-linux-x64-gnu": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
"integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/vscode-oniguruma": {
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",
@@ -7581,6 +7619,24 @@
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
"dev": true
},
"vectordb-darwin-arm64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
"integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
"optional": true
},
"vectordb-darwin-x64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
"integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
"optional": true
},
"vectordb-linux-x64-gnu": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
"integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
"optional": true
},
"vscode-oniguruma": {
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",

View File

@@ -8,7 +8,7 @@
"tsc": "tsc -b",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release",
"test": "npm run tsc; mocha -recursive dist/test",
"test": "npm run tsc && mocha -recursive dist/test",
"lint": "eslint src --ext .js,.ts",
"clean": "rm -rf node_modules *.node dist/",
"pack-build": "neon pack-build",
@@ -60,7 +60,8 @@
},
"os": [
"darwin",
"linux"
"linux",
"win32"
],
"cpu": [
"x64",
@@ -71,13 +72,15 @@
"x86_64-apple-darwin": "vectordb-darwin-x64",
"aarch64-apple-darwin": "vectordb-darwin-arm64",
"x86_64-unknown-linux-gnu": "vectordb-linux-x64-gnu",
"aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu"
"aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu",
"x86_64-pc-windows-msvc": "vectordb-win32-x64-msvc"
}
},
"optionalDependencies": {
"vectordb-darwin-arm64": "0.1.13",
"vectordb-darwin-x64": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13",
"vectordb-linux-x64-gnu": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13"
"vectordb-win32-x64-msvc": "0.1.13"
}
}

View File

@@ -13,11 +13,12 @@
from __future__ import annotations
import functools
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union
import pandas as pd
import pyarrow as pa
from pyarrow import fs
@@ -38,8 +39,10 @@ class DBConnection(ABC):
def create_table(
self,
name: str,
data: DATA = None,
schema: pa.Schema = None,
data: Optional[
Union[List[dict], dict, pd.DataFrame, pa.Table, Iterable[pa.RecordBatch]],
] = None,
schema: Optional[pa.Schema] = None,
mode: str = "create",
on_bad_vectors: str = "error",
fill_value: float = 0.0,
@@ -51,7 +54,7 @@ class DBConnection(ABC):
name: str
The name of the table.
data: list, tuple, dict, pd.DataFrame; optional
The data to insert into the table.
The data to initialize the table. User must provide at least one of `data` or `schema`.
schema: pyarrow.Schema; optional
The schema of the table.
mode: str; default "create"
@@ -64,16 +67,16 @@ class DBConnection(ABC):
fill_value: float
The value to use when filling vectors. Only used if on_bad_vectors="fill".
Note
----
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Returns
-------
LanceTable
A reference to the newly created table.
!!! note
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Examples
--------
@@ -119,7 +122,7 @@ class DBConnection(ABC):
Data is converted to Arrow before being written to disk. For maximum
control over how data is saved, either provide the PyArrow schema to
convert to or else provide a PyArrow table directly.
convert to or else provide a [PyArrow Table](pyarrow.Table) directly.
>>> custom_schema = pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
@@ -138,6 +141,30 @@ class DBConnection(ABC):
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
It is also possible to create a table from an `Iterable[pa.RecordBatch]`:
>>> import pyarrow as pa
>>> def make_batches():
... for i in range(5):
... yield pa.RecordBatch.from_arrays(
... [
... pa.array([[3.1, 4.1], [5.9, 26.5]]),
... pa.array(["foo", "bar"]),
... pa.array([10.0, 20.0]),
... ],
... ["vector", "item", "price"],
... )
>>> schema=pa.schema([
... pa.field("vector", pa.list_(pa.float32())),
... pa.field("item", pa.utf8()),
... pa.field("price", pa.float32()),
... ])
>>> db.create_table("table4", make_batches(), schema=schema)
LanceTable(table4)
"""
raise NotImplementedError
@@ -252,7 +279,7 @@ class LanceDBConnection(DBConnection):
def create_table(
self,
name: str,
data: DATA = None,
data: Optional[Union[List[dict], dict, pd.DataFrame]] = None,
schema: pa.Schema = None,
mode: str = "create",
on_bad_vectors: str = "error",
@@ -260,114 +287,22 @@ class LanceDBConnection(DBConnection):
) -> LanceTable:
"""Create a table in the database.
Parameters
----------
name: str
The name of the table.
data: list, tuple, dict, pd.DataFrame; optional
The data to insert into the table.
schema: pyarrow.Schema; optional
The schema of the table.
mode: str; default "create"
The mode to use when creating the table. Can be either "create" or "overwrite".
By default, if the table already exists, an exception is raised.
If you want to overwrite the table, use mode="overwrite".
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
fill_value: float
The value to use when filling vectors. Only used if on_bad_vectors="fill".
Note
----
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Returns
-------
LanceTable
A reference to the newly created table.
Examples
--------
Can create with list of tuples or dictionaries:
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
>>> db.create_table("my_table", data)
LanceTable(my_table)
>>> db["my_table"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: double
long: double
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
You can also pass a pandas DataFrame:
>>> import pandas as pd
>>> data = pd.DataFrame({
... "vector": [[1.1, 1.2], [0.2, 1.8]],
... "lat": [45.5, 40.1],
... "long": [-122.7, -74.1]
... })
>>> db.create_table("table2", data)
LanceTable(table2)
>>> db["table2"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: double
long: double
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
Data is converted to Arrow before being written to disk. For maximum
control over how data is saved, either provide the PyArrow schema to
convert to or else provide a PyArrow table directly.
>>> custom_schema = pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("lat", pa.float32()),
... pa.field("long", pa.float32())
... ])
>>> db.create_table("table3", data, schema = custom_schema)
LanceTable(table3)
>>> db["table3"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: float
long: float
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
See
---
DBConnection.create_table
"""
if mode.lower() not in ["create", "overwrite"]:
raise ValueError("mode must be either 'create' or 'overwrite'")
if data is not None:
tbl = LanceTable.create(
self,
name,
data,
schema,
mode=mode,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
else:
tbl = LanceTable.open(self, name)
tbl = LanceTable.create(
self,
name,
data,
schema,
mode=mode,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
return tbl
def open_table(self, name: str) -> LanceTable:

View File

@@ -18,7 +18,7 @@ from __future__ import annotations
import inspect
import sys
import types
from abc import ABC, abstractstaticmethod
from abc import ABC, abstractmethod
from typing import Any, List, Type, Union, _GenericAlias
import pyarrow as pa
@@ -27,11 +27,13 @@ from pydantic_core import CoreSchema, core_schema
class FixedSizeListMixin(ABC):
@abstractstaticmethod
@staticmethod
@abstractmethod
def dim() -> int:
raise NotImplementedError
@abstractstaticmethod
@staticmethod
@abstractmethod
def value_arrow_type() -> pa.DataType:
raise NotImplementedError
@@ -41,9 +43,15 @@ def vector(
) -> Type[FixedSizeListMixin]:
"""Pydantic Vector Type.
Note
----
Experimental feature.
!!! warning
Experimental feature.
Parameters
----------
dim : int
The dimension of the vector.
value_type : pyarrow.DataType, optional
The value type of the vector, by default pa.float32()
Examples
--------
@@ -52,9 +60,15 @@ def vector(
>>> from lancedb.pydantic import vector
...
>>> class MyModel(pydantic.BaseModel):
... vector: vector(756)
... id: int
... description: str
... url: str
... embeddings: vector(768)
>>> schema = pydantic_to_schema(MyModel)
>>> assert schema == pa.schema([
... pa.field("id", pa.int64(), False),
... pa.field("description", pa.utf8(), False),
... pa.field("url", pa.utf8(), False),
... pa.field("embeddings", pa.list_(pa.float32(), 768), False)
... ])
"""
# TODO: make a public parameterized type.
@@ -163,7 +177,36 @@ def pydantic_to_schema(model: Type[pydantic.BaseModel]) -> pa.Schema:
Returns
-------
A PyArrow Schema.
pyarrow.Schema
Examples
--------
>>> from typing import List, Optional
>>> import pydantic
>>> from lancedb.pydantic import pydantic_to_schema
...
>>> class InnerModel(pydantic.BaseModel):
... a: str
... b: Optional[float]
>>>
>>> class FooModel(pydantic.BaseModel):
... id: int
... s: Optional[str] = None
... vec: List[float]
... li: List[int]
... inner: InnerModel
>>> schema = pydantic_to_schema(FooModel)
>>> assert schema == pa.schema([
... pa.field("id", pa.int64(), False),
... pa.field("s", pa.utf8(), True),
... pa.field("vec", pa.list_(pa.float64()), False),
... pa.field("li", pa.list_(pa.int64()), False),
... pa.field("inner", pa.struct([
... pa.field("a", pa.utf8(), False),
... pa.field("b", pa.float64(), True),
... ]), False),
... ])
"""
fields = _pydantic_model_to_fields(model)
return pa.schema(fields)

View File

@@ -226,6 +226,7 @@ class LanceQueryBuilder:
columns=self._columns,
nprobes=self._nprobes,
refine_factor=self._refine_factor,
vector_column=self._vector_column,
)
return self._table._execute_query(query)

View File

@@ -0,0 +1,22 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pyarrow as pa
def to_ipc_binary(table: pa.Table) -> bytes:
"""Serialize a PyArrow Table to IPC binary."""
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
writer.write_table(table)
return sink.getvalue().to_pybytes()
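# A hypothetical round-trip sketch (not part of this module): the bytes
# produced above can be read back with pyarrow's IPC stream reader.
#
#   table = pa.table({"id": [1, 2], "text": ["a", "b"]})
#   payload = to_ipc_binary(table)
#   with pa.ipc.open_stream(payload) as reader:
#       assert reader.read_all() == table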

View File

@@ -13,7 +13,7 @@
import functools
from typing import Any, Callable, Dict, Union
from typing import Any, Callable, Dict, Optional, Union
import aiohttp
import attr
@@ -24,6 +24,8 @@ from lancedb.common import Credential
from lancedb.remote import VectorQuery, VectorQueryResult
from lancedb.remote.errors import LanceDBClientError
ARROW_STREAM_CONTENT_TYPE = "application/vnd.apache.arrow.stream"
def _check_not_closed(f):
@functools.wraps(f)
@@ -59,9 +61,12 @@ class RestfulLanceDBClient:
@functools.cached_property
def headers(self) -> Dict[str, str]:
return {
headers = {
"x-api-key": self.api_key,
}
if self.region == "local": # Local test mode
headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
return headers
@staticmethod
async def _check_status(resp: aiohttp.ClientResponse):
@@ -93,7 +98,9 @@ class RestfulLanceDBClient:
async def post(
self,
uri: str,
data: Union[Dict[str, Any], BaseModel],
data: Union[Dict[str, Any], BaseModel, bytes],
params: Optional[Dict[str, Any]] = None,
content_type: Optional[str] = None,
deserialize: Callable = lambda resp: resp.json(),
) -> Dict[str, Any]:
"""Send a POST request and returns the deserialized response payload.
@@ -107,10 +114,19 @@ class RestfulLanceDBClient:
"""
if isinstance(data, BaseModel):
data: Dict[str, Any] = data.dict(exclude_none=True)
if isinstance(data, bytes):
req_kwargs = {"data": data}
else:
req_kwargs = {"json": data}
headers = self.headers.copy()
if content_type is not None:
headers["content-type"] = content_type
async with self.session.post(
uri,
json=data,
headers=self.headers,
headers=headers,
params=params,
**req_kwargs,
) as resp:
resp: aiohttp.ClientResponse = resp
await self._check_status(resp)
@@ -119,11 +135,11 @@ class RestfulLanceDBClient:
@_check_not_closed
async def list_tables(self):
"""List all tables in the database."""
json = await self.get("/1/table/", {})
json = await self.get("/v1/table/", {})
return json["tables"]
@_check_not_closed
async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
"""Query a table."""
tbl = await self.post(f"/1/table/{table_name}/", query, deserialize=_read_ipc)
tbl = await self.post(f"/v1/table/{table_name}/", query, deserialize=_read_ipc)
return VectorQueryResult(tbl)

View File

@@ -12,6 +12,7 @@
# limitations under the License.
import asyncio
import uuid
from typing import List
from urllib.parse import urlparse
@@ -19,9 +20,11 @@ import pyarrow as pa
from lancedb.common import DATA
from lancedb.db import DBConnection
from lancedb.table import Table
from lancedb.schema import schema_to_json
from lancedb.table import Table, _sanitize_data
from .client import RestfulLanceDBClient
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
class RemoteDBConnection(DBConnection):
@@ -71,8 +74,31 @@ class RemoteDBConnection(DBConnection):
name: str,
data: DATA = None,
schema: pa.Schema = None,
mode: str = "create",
on_bad_vectors: str = "error",
fill_value: float = 0.0,
) -> Table:
raise NotImplementedError
if data is None and schema is None:
raise ValueError("Either data or schema must be provided.")
if data is not None:
data = _sanitize_data(
data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
else:
if schema is None:
raise ValueError("Either data or schema must be provided")
data = pa.Table.from_pylist([], schema=schema)
from .table import RemoteTable
data = to_ipc_binary(data)
request_id = uuid.uuid4().hex
self._loop.run_until_complete(
self._client.post(
f"/v1/table/{name}/create",
data=data,
params={"request_id": request_id},
content_type=ARROW_STREAM_CONTENT_TYPE,
)
)
return RemoteTable(self, name)

View File

@@ -11,6 +11,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import uuid
from functools import cached_property
from typing import Union
import pyarrow as pa
@@ -18,7 +20,10 @@ import pyarrow as pa
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
from ..query import LanceQueryBuilder, Query
from ..table import Query, Table
from ..schema import json_to_schema
from ..table import Query, Table, _sanitize_data
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE
from .db import RemoteDBConnection
@@ -30,8 +35,14 @@ class RemoteTable(Table):
def __repr__(self) -> str:
return f"RemoteTable({self._conn.db_name}.{self.name})"
@cached_property
def schema(self) -> pa.Schema:
raise NotImplementedError
"""Return the schema of the table."""
resp = self._conn._loop.run_until_complete(
self._conn._client.get(f"/v1/table/{self._name}/describe")
)
schema = json_to_schema(resp["schema"])
return schema
def to_arrow(self) -> pa.Table:
raise NotImplementedError
@@ -53,7 +64,22 @@ class RemoteTable(Table):
on_bad_vectors: str = "error",
fill_value: float = 0.0,
) -> int:
raise NotImplementedError
data = _sanitize_data(
data, self.schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
payload = to_ipc_binary(data)
request_id = uuid.uuid4().hex
self._conn._loop.run_until_complete(
self._conn._client.post(
f"/v1/table/{self._name}/insert",
data=payload,
params={"request_id": request_id, "mode": mode},
content_type=ARROW_STREAM_CONTENT_TYPE,
)
)
return len(data)
def search(
self, query: Union[VEC, str], vector_column: str = VECTOR_COLUMN_NAME

View File

@@ -13,10 +13,10 @@
"""Schema related utilities."""
import json
from typing import Any, Dict, Type
import pyarrow as pa
from lance import json_to_schema, schema_to_json
def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType:
@@ -43,247 +43,3 @@ def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType:
... ])
"""
return pa.list_(value_type, dimension)
def _type_to_dict(dt: pa.DataType) -> Dict[str, Any]:
if pa.types.is_boolean(dt):
return {"type": "boolean"}
elif pa.types.is_int8(dt):
return {"type": "int8"}
elif pa.types.is_int16(dt):
return {"type": "int16"}
elif pa.types.is_int32(dt):
return {"type": "int32"}
elif pa.types.is_int64(dt):
return {"type": "int64"}
elif pa.types.is_uint8(dt):
return {"type": "uint8"}
elif pa.types.is_uint16(dt):
return {"type": "uint16"}
elif pa.types.is_uint32(dt):
return {"type": "uint32"}
elif pa.types.is_uint64(dt):
return {"type": "uint64"}
elif pa.types.is_float16(dt):
return {"type": "float16"}
elif pa.types.is_float32(dt):
return {"type": "float32"}
elif pa.types.is_float64(dt):
return {"type": "float64"}
elif pa.types.is_date32(dt):
return {"type": f"date32"}
elif pa.types.is_date64(dt):
return {"type": f"date64"}
elif pa.types.is_time32(dt):
return {"type": f"time32:{dt.unit}"}
elif pa.types.is_time64(dt):
return {"type": f"time64:{dt.unit}"}
elif pa.types.is_timestamp(dt):
return {"type": f"timestamp:{dt.unit}:{dt.tz if dt.tz is not None else ''}"}
elif pa.types.is_string(dt):
return {"type": "string"}
elif pa.types.is_binary(dt):
return {"type": "binary"}
elif pa.types.is_large_string(dt):
return {"type": "large_string"}
elif pa.types.is_large_binary(dt):
return {"type": "large_binary"}
elif pa.types.is_fixed_size_binary(dt):
return {"type": "fixed_size_binary", "width": dt.byte_width}
elif pa.types.is_fixed_size_list(dt):
return {
"type": "fixed_size_list",
"width": dt.list_size,
"value_type": _type_to_dict(dt.value_type),
}
elif pa.types.is_list(dt):
return {
"type": "list",
"value_type": _type_to_dict(dt.value_type),
}
elif pa.types.is_struct(dt):
return {
"type": "struct",
"fields": [_field_to_dict(dt.field(i)) for i in range(dt.num_fields)],
}
elif pa.types.is_dictionary(dt):
return {
"type": "dictionary",
"index_type": _type_to_dict(dt.index_type),
"value_type": _type_to_dict(dt.value_type),
}
# TODO: support extension types
raise TypeError(f"Unsupported type: {dt}")
def _field_to_dict(field: pa.field) -> Dict[str, Any]:
ret = {
"name": field.name,
"type": _type_to_dict(field.type),
"nullable": field.nullable,
}
if field.metadata is not None:
ret["metadata"] = field.metadata
return ret
def schema_to_dict(schema: pa.Schema) -> Dict[str, Any]:
"""Convert a PyArrow [Schema](pyarrow.Schema) to a dictionary.
Parameters
----------
schema : pa.Schema
The PyArrow Schema to convert
Returns
-------
A dict of the data type.
Examples
--------
>>> import pyarrow as pa
>>> import lancedb
>>> schema = pa.schema(
... [
... pa.field("id", pa.int64()),
... pa.field("vector", lancedb.vector(512), nullable=False),
... pa.field(
... "struct",
... pa.struct(
... [
... pa.field("a", pa.utf8()),
... pa.field("b", pa.float32()),
... ]
... ),
... True,
... ),
... ],
... metadata={"key": "value"},
... )
>>> json_schema = schema_to_dict(schema)
>>> assert json_schema == {
... "fields": [
... {"name": "id", "type": {"type": "int64"}, "nullable": True},
... {
... "name": "vector",
... "type": {
... "type": "fixed_size_list",
... "value_type": {"type": "float32"},
... "width": 512,
... },
... "nullable": False,
... },
... {
... "name": "struct",
... "type": {
... "type": "struct",
... "fields": [
... {"name": "a", "type": {"type": "string"}, "nullable": True},
... {"name": "b", "type": {"type": "float32"}, "nullable": True},
... ],
... },
... "nullable": True,
... },
... ],
... "metadata": {"key": "value"},
... }
"""
fields = []
for name in schema.names:
field = schema.field(name)
fields.append(_field_to_dict(field))
json_schema = {
"fields": fields,
"metadata": {
k.decode("utf-8"): v.decode("utf-8") for (k, v) in schema.metadata.items()
}
if schema.metadata is not None
else {},
}
return json_schema
def _dict_to_type(dt: Dict[str, Any]) -> pa.DataType:
type_name = dt["type"]
try:
return {
"boolean": pa.bool_(),
"int8": pa.int8(),
"int16": pa.int16(),
"int32": pa.int32(),
"int64": pa.int64(),
"uint8": pa.uint8(),
"uint16": pa.uint16(),
"uint32": pa.uint32(),
"uint64": pa.uint64(),
"float16": pa.float16(),
"float32": pa.float32(),
"float64": pa.float64(),
"string": pa.string(),
"binary": pa.binary(),
"large_string": pa.large_string(),
"large_binary": pa.large_binary(),
"date32": pa.date32(),
"date64": pa.date64(),
}[type_name]
except KeyError:
pass
if type_name == "fixed_size_binary":
return pa.binary(dt["width"])
elif type_name == "fixed_size_list":
return pa.list_(_dict_to_type(dt["value_type"]), dt["width"])
elif type_name == "list":
return pa.list_(_dict_to_type(dt["value_type"]))
elif type_name == "struct":
fields = []
for field in dt["fields"]:
fields.append(_dict_to_field(field))
return pa.struct(fields)
elif type_name == "dictionary":
return pa.dictionary(
_dict_to_type(dt["index_type"]), _dict_to_type(dt["value_type"])
)
elif type_name.startswith("time32:"):
return pa.time32(type_name.split(":")[1])
elif type_name.startswith("time64:"):
return pa.time64(type_name.split(":")[1])
elif type_name.startswith("timestamp:"):
fields = type_name.split(":")
unit = fields[1]
tz = fields[2] if len(fields) > 2 else None
return pa.timestamp(unit, tz)
raise TypeError(f"Unsupported type: {dt}")
def _dict_to_field(field: Dict[str, Any]) -> pa.Field:
name = field["name"]
nullable = field["nullable"] if "nullable" in field else True
dt = _dict_to_type(field["type"])
metadata = field.get("metadata", None)
return pa.field(name, dt, nullable, metadata)
def dict_to_schema(json: Dict[str, Any]) -> pa.Schema:
"""Reconstruct a PyArrow Schema from a JSON dict.
Parameters
----------
json : Dict[str, Any]
The JSON dict to reconstruct Schema from.
Returns
-------
A PyArrow Schema.
"""
fields = []
for field in json["fields"]:
fields.append(_dict_to_field(field))
metadata = {
k.encode("utf-8"): v.encode("utf-8")
for (k, v) in json.get("metadata", {}).items()
}
return pa.schema(fields, metadata)
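# The hand-rolled converters removed above are superseded by the `lance`
# helpers imported at the top of this module; a hypothetical round-trip sketch:
#
#   schema = pa.schema([pa.field("id", pa.int64())])
#   assert json_to_schema(schema_to_json(schema)) == schema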

View File

@@ -16,7 +16,7 @@ from __future__ import annotations
import os
from abc import ABC, abstractmethod
from functools import cached_property
from typing import List, Union
from typing import Iterable, List, Union
import lance
import numpy as np
@@ -44,7 +44,7 @@ def _sanitize_data(data, schema, on_bad_vectors, fill_value):
data = _sanitize_schema(
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
if not isinstance(data, pa.Table):
if not isinstance(data, (pa.Table, Iterable)):
raise TypeError(f"Unsupported data type: {type(data)}")
return data
@@ -483,7 +483,7 @@ class LanceTable(Table):
if schema is None:
raise ValueError("Either data or schema must be provided")
data = pa.Table.from_pylist([], schema=schema)
lance.write_dataset(data, tbl._dataset_uri, mode=mode)
lance.write_dataset(data, tbl._dataset_uri, schema=schema, mode=mode)
return LanceTable(db, name)
@classmethod

View File

@@ -1,7 +1,7 @@
[project]
name = "lancedb"
version = "0.1.10"
dependencies = ["pylance~=0.5.0", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic>=2", "attr"]
version = "0.1.11"
dependencies = ["pylance~=0.5.8", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic>=2", "attr"]
description = "lancedb"
authors = [
{ name = "LanceDB Devs", email = "dev@lancedb.com" },

View File

@@ -13,6 +13,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
import lancedb
@@ -75,6 +76,32 @@ def test_ingest_pd(tmp_path):
assert db.open_table("test").name == db["test"].name
def test_ingest_record_batch_iterator(tmp_path):
def batch_reader():
for i in range(5):
yield pa.RecordBatch.from_arrays(
[
pa.array([[3.1, 4.1], [5.9, 26.5]]),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"],
)
db = lancedb.connect(tmp_path)
tbl = db.create_table(
"test",
batch_reader(),
schema=pa.schema(
[
pa.field("vector", pa.list_(pa.float32())),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
]
),
)
def test_create_mode(tmp_path):
db = lancedb.connect(tmp_path)
data = pd.DataFrame(
@@ -131,6 +158,9 @@ def test_empty_or_nonexistent_table(tmp_path):
with pytest.raises(Exception):
db.open_table("does_not_exist")
schema = pa.schema([pa.field("a", pa.int32())])
db.create_table("test", schema=schema)
def test_replace_index(tmp_path):
db = lancedb.connect(uri=tmp_path)

View File

@@ -119,6 +119,7 @@ def test_query_builder_with_different_vector_column():
columns=["b"],
nprobes=20,
refine_factor=None,
vector_column="foo_vector",
)
)

View File

@@ -1,109 +0,0 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pyarrow as pa
import lancedb
from lancedb.schema import dict_to_schema, schema_to_dict
def test_schema_to_dict():
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", lancedb.vector(512), nullable=False),
pa.field(
"struct",
pa.struct(
[
pa.field("a", pa.utf8()),
pa.field("b", pa.float32()),
]
),
True,
),
pa.field("d", pa.dictionary(pa.int64(), pa.utf8()), False),
],
metadata={"key": "value"},
)
json_schema = schema_to_dict(schema)
assert json_schema == {
"fields": [
{"name": "id", "type": {"type": "int64"}, "nullable": True},
{
"name": "vector",
"type": {
"type": "fixed_size_list",
"value_type": {"type": "float32"},
"width": 512,
},
"nullable": False,
},
{
"name": "struct",
"type": {
"type": "struct",
"fields": [
{"name": "a", "type": {"type": "string"}, "nullable": True},
{"name": "b", "type": {"type": "float32"}, "nullable": True},
],
},
"nullable": True,
},
{
"name": "d",
"type": {
"type": "dictionary",
"index_type": {"type": "int64"},
"value_type": {"type": "string"},
},
"nullable": False,
},
],
"metadata": {"key": "value"},
}
actual_schema = dict_to_schema(json_schema)
assert actual_schema == schema
def test_temporal_types():
schema = pa.schema(
[
pa.field("t32", pa.time32("s")),
pa.field("t32ms", pa.time32("ms")),
pa.field("t64", pa.time64("ns")),
pa.field("ts", pa.timestamp("s")),
pa.field("ts_us_tz", pa.timestamp("us", tz="America/New_York")),
],
)
json_schema = schema_to_dict(schema)
assert json_schema == {
"fields": [
{"name": "t32", "type": {"type": "time32:s"}, "nullable": True},
{"name": "t32ms", "type": {"type": "time32:ms"}, "nullable": True},
{"name": "t64", "type": {"type": "time64:ns"}, "nullable": True},
{"name": "ts", "type": {"type": "timestamp:s:"}, "nullable": True},
{
"name": "ts_us_tz",
"type": {"type": "timestamp:us:America/New_York"},
"nullable": True,
},
],
"metadata": {},
}
actual_schema = dict_to_schema(json_schema)
assert actual_schema == schema

View File

@@ -15,6 +15,7 @@ arrow-ipc = { workspace = true }
arrow-schema = { workspace = true }
once_cell = "1"
futures = "0.3"
half = { workspace = true }
lance = { workspace = true }
vectordb = { path = "../../vectordb" }
tokio = { version = "1.23", features = ["rt-multi-thread"] }

View File

@@ -13,6 +13,7 @@ arrow-data = { workspace = true }
arrow-schema = { workspace = true }
object_store = { workspace = true }
snafu = "0.7.4"
half = { workspace = true }
lance = { workspace = true }
tokio = { version = "1.23", features = ["rt-multi-thread"] }

View File

@@ -27,6 +27,7 @@ pub struct Database {
object_store: ObjectStore,
pub(crate) uri: String,
pub(crate) base_path: object_store::path::Path,
}
const LANCE_EXTENSION: &str = "lance";
@@ -43,12 +44,13 @@ impl Database {
///
/// * A [Database] object.
pub async fn connect(uri: &str) -> Result<Database> {
let (object_store, _) = ObjectStore::from_uri(uri).await?;
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
if object_store.is_local() {
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
}
Ok(Database {
uri: uri.to_string(),
base_path,
object_store,
})
}
@@ -70,7 +72,7 @@ impl Database {
pub async fn table_names(&self) -> Result<Vec<String>> {
let f = self
.object_store
.read_dir(self.uri.as_str())
.read_dir(self.base_path.clone())
.await?
.iter()
.map(|fname| Path::new(fname))
@@ -141,8 +143,9 @@ impl Database {
/// # Arguments
/// * `name` - The name of the table.
pub async fn drop_table(&self, name: &str) -> Result<()> {
let dir_name = format!("{}/{}.{}", self.uri, name, LANCE_EXTENSION);
self.object_store.remove_dir_all(dir_name).await?;
let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
let full_path = self.base_path.child(dir_name.clone());
self.object_store.remove_dir_all(full_path).await?;
Ok(())
}
}

View File

@@ -16,6 +16,7 @@ use std::path::Path;
use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
use arrow_schema::SchemaRef;
use lance::dataset::{Dataset, ReadParams, WriteParams};
use lance::index::IndexType;
use snafu::prelude::*;
@@ -144,6 +145,16 @@ impl Table {
})
}
/// Schema of this Table.
pub fn schema(&self) -> SchemaRef {
Arc::new(self.dataset.schema().into())
}
/// Version of this Table
pub fn version(&self) -> u64 {
self.dataset.version().version
}
/// Create index on the table.
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
@@ -274,6 +285,7 @@ mod tests {
}
#[test]
#[cfg(not(windows))]
fn test_object_store_path() {
use std::path::Path as StdPath;
let p = StdPath::new("s3://bucket/path/to/file");
@@ -350,10 +362,7 @@ mod tests {
..Default::default()
};
table
.add(new_batches, Some(param))
.await
.unwrap();
table.add(new_batches, Some(param)).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
assert_eq!(table.name, "test");
}