Mirror of https://github.com/lancedb/lancedb.git, synced 2026-01-03 10:22:56 +00:00

Compare commits: python-v0. ... changhiskh (31 commits)
Commits in this comparison:

9eca8e7cd1  587fe6ffc1  89c8e5839b  50c20af060  0965d7dd5a  7bbb2872de
e81d2975da  2c7f96ba4f  f9dd7a5d8a  1d4943688d  7856a94d2c  371d2f979e
fff8e399a3  73e4015797  5142a27482  81df2a524e  40638e5515  018314a5c1
409eb30ea5  ff9872fd44  a0608044a1  2e4ea7d2bc  57e5695a54  ce58ea7c38
57207eff4a  2d78bff120  7c09b9b9a9  bd0034a157  144b3b5d83  b6f0a31686
9ec526f73f

Files changed:
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.9
+current_version = 0.4.0
 commit = True
 message = Bump version: {current_version} → {new_version}
 tag = True
.github/ISSUE_TEMPLATE/bug-node.yml (vendored, new file, +33)

@@ -0,0 +1,33 @@
+name: Bug Report - Node / Typescript
+description: File a bug report
+title: "bug(node): "
+labels: [bug, typescript]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+  - type: input
+    id: version
+    attributes:
+      label: LanceDB version
+      description: What version of LanceDB are you using? `npm list | grep vectordb`.
+      placeholder: v0.3.2
+    validations:
+      required: false
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction
+    attributes:
+      label: Are there known steps to reproduce?
+      description: |
+        Let us know how to reproduce the bug and we may be able to fix it more
+        quickly. This is not required, but it is helpful.
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/bug-python.yml (vendored, new file, +33)

@@ -0,0 +1,33 @@
+name: Bug Report - Python
+description: File a bug report
+title: "bug(python): "
+labels: [bug, python]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+  - type: input
+    id: version
+    attributes:
+      label: LanceDB version
+      description: What version of LanceDB are you using? `python -c "import lancedb; print(lancedb.__version__)"`.
+      placeholder: v0.3.2
+    validations:
+      required: false
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction
+    attributes:
+      label: Are there known steps to reproduce?
+      description: |
+        Let us know how to reproduce the bug and we may be able to fix it more
+        quickly. This is not required, but it is helpful.
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/config.yml (vendored, new file, +5)

@@ -0,0 +1,5 @@
+blank_issues_enabled: true
+contact_links:
+  - name: Discord Community Support
+    url: https://discord.com/invite/zMM32dvNtd
+    about: Please ask and answer questions here.
.github/ISSUE_TEMPLATE/documentation.yml (vendored, new file, +23)

@@ -0,0 +1,23 @@
+name: 'Documentation improvement'
+description: Report an issue with the documentation.
+labels: [documentation]
+
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: >
+        Describe the issue with the documentation and how it can be fixed or improved.
+    validations:
+      required: true
+
+  - type: input
+    id: link
+    attributes:
+      label: Link
+      description: >
+        Provide a link to the existing documentation, if applicable.
+      placeholder: ex. https://lancedb.github.io/lancedb/guides/tables/...
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/feature.yml (vendored, new file, +31)

@@ -0,0 +1,31 @@
+name: Feature suggestion
+description: Suggestion a new feature for LanceDB
+title: "Feature: "
+labels: [enhancement]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Share a new idea for a feature or improvement. Be sure to search existing
+        issues first to avoid duplicates.
+  - type: dropdown
+    id: sdk
+    attributes:
+      label: SDK
+      description: Which SDK are you using? This helps us prioritize.
+      options:
+        - Python
+        - Node
+        - Rust
+      default: 0
+    validations:
+      required: false
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: |
+        Describe the feature and why it would be useful. If applicable, consider
+        providing a code example of what it might be like to use the feature.
+    validations:
+      required: true
.github/workflows/npm-publish.yml (vendored, 37 lines changed)

@@ -37,8 +37,16 @@ jobs:
          path: |
            node/vectordb-*.tgz

-  node-macos-x86:
-    runs-on: macos-13
+  node-macos:
+    strategy:
+      matrix:
+        config:
+          - arch: x86_64-apple-darwin
+            runner: macos-13
+          - arch: aarch64-apple-darwin
+            # xlarge is implicitly arm64.
+            runner: macos-13-xlarge
+    runs-on: ${{ matrix.config.runner }}
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
     steps:
@@ -51,35 +59,14 @@ jobs:
          cd node
          npm ci
      - name: Build MacOS native node modules
-       run: bash ci/build_macos_artifacts.sh x86_64-apple-darwin
+       run: bash ci/build_macos_artifacts.sh ${{ matrix.config.arch }}
      - name: Upload Darwin Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: native-darwin
          path: |
            node/dist/lancedb-vectordb-darwin*.tgz

-  node-macos-arm64:
-    runs-on: macos-13-xlarge
-    # Only runs on tags that matches the make-release action
-    if: startsWith(github.ref, 'refs/tags/v')
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-    - name: Install system dependencies
-      run: brew install protobuf
-    - name: Install npm dependencies
-      run: |
-        cd node
-        npm ci
-    - name: Build MacOS native node modules
-      run: bash ci/build_macos_artifacts.sh aarch64-apple-darwin
-    - name: Upload Darwin Artifacts
-      uses: actions/upload-artifact@v3
-      with:
-        name: native-darwin
-        path: |
-          node/dist/lancedb-vectordb-darwin*.tgz
-
   node-linux:
     name: node-linux (${{ matrix.config.arch}}-unknown-linux-gnu
.github/workflows/python.yml (vendored, 19 lines changed)

@@ -44,12 +44,19 @@ jobs:
        run: pytest -m "not slow" -x -v --durations=30 tests
      - name: doctest
        run: pytest --doctest-modules lancedb
-  mac:
+  platform:
+    name: "Platform: ${{ matrix.config.name }}"
     timeout-minutes: 30
     strategy:
       matrix:
-        mac-runner: [ "macos-13", "macos-13-xlarge" ]
-    runs-on: "${{ matrix.mac-runner }}"
+        config:
+          - name: x86 Mac
+            runner: macos-13
+          - name: Arm Mac
+            runner: macos-13-xlarge
+          - name: x86 Windows
+            runner: windows-latest
+    runs-on: "${{ matrix.config.runner }}"
     defaults:
       run:
         shell: bash
@@ -91,11 +98,7 @@ jobs:
          pip install "pydantic<2"
          pip install -e .[tests]
          pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
-         pip install pytest pytest-mock black isort
-     - name: Black
-       run: black --check --diff --no-color --quiet .
-     - name: isort
-       run: isort --check --diff --quiet .
+         pip install pytest pytest-mock
      - name: Run tests
        run: pytest -m "not slow" -x -v --durations=30 tests
      - name: doctest
.github/workflows/rust.yml (vendored, 23 lines changed)

@@ -24,6 +24,29 @@ env:
   RUST_BACKTRACE: "1"

 jobs:
+  lint:
+    timeout-minutes: 30
+    runs-on: ubuntu-22.04
+    defaults:
+      run:
+        shell: bash
+        working-directory: rust
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: rust
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y protobuf-compiler libssl-dev
+      - name: Run format
+        run: cargo fmt --all -- --check
+      - name: Run clippy
+        run: cargo clippy --all --all-features -- -D warnings
   linux:
     timeout-minutes: 30
     runs-on: ubuntu-22.04
Cargo.toml (26 lines changed)

@@ -5,24 +5,24 @@ exclude = ["python"]
 resolver = "2"

 [workspace.dependencies]
-lance = { "version" = "=0.8.20", "features" = ["dynamodb"] }
-lance-index = { "version" = "=0.8.20" }
-lance-linalg = { "version" = "=0.8.20" }
-lance-testing = { "version" = "=0.8.20" }
+lance = { "version" = "=0.9.1", "features" = ["dynamodb"] }
+lance-index = { "version" = "=0.9.1" }
+lance-linalg = { "version" = "=0.9.1" }
+lance-testing = { "version" = "=0.9.1" }
 # Note that this one does not include pyarrow
-arrow = { version = "47.0.0", optional = false }
-arrow-array = "47.0"
-arrow-data = "47.0"
-arrow-ipc = "47.0"
-arrow-ord = "47.0"
-arrow-schema = "47.0"
-arrow-arith = "47.0"
-arrow-cast = "47.0"
+arrow = { version = "49.0.0", optional = false }
+arrow-array = "49.0"
+arrow-data = "49.0"
+arrow-ipc = "49.0"
+arrow-ord = "49.0"
+arrow-schema = "49.0"
+arrow-arith = "49.0"
+arrow-cast = "49.0"
 chrono = "0.4.23"
 half = { "version" = "=2.3.1", default-features = false, features = [
     "num-traits",
 ] }
 log = "0.4"
-object_store = "0.7.1"
+object_store = "0.8.0"
 snafu = "0.7.4"
 url = "2"
@@ -2,3 +2,4 @@ mkdocs==1.4.2
 mkdocs-jupyter==0.24.1
 mkdocs-material==9.1.3
 mkdocstrings[python]==0.20.0
+pydantic
@@ -64,18 +64,26 @@ We'll cover the basics of using LanceDB on your local machine in this section.
     tbl = db.create_table("table_from_df", data=df)
     ```

+    !!! warning
+
+        If the table already exists, LanceDB will raise an error by default.
+        If you want to overwrite the table, you can pass in `mode="overwrite"`
+        to the `createTable` function.
+
 === "Javascript"
     ```javascript
-    const tb = await db.createTable("my_table",
-        data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
-              {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
+    const tb = await db.createTable(
+        "myTable",
+        [{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
+         {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
     ```

-!!! warning
-
-    If the table already exists, LanceDB will raise an error by default.
-    If you want to overwrite the table, you can pass in `mode="overwrite"`
-    to the `createTable` function.
+    !!! warning
+
+        If the table already exists, LanceDB will raise an error by default.
+        If you want to overwrite the table, you can pass in `"overwrite"`
+        to the `createTable` function like this: `await con.createTable(tableName, data, { writeMode: WriteMode.Overwrite })`

 ??? info "Under the hood, LanceDB is converting the input data into an Apache Arrow table and persisting it to disk in [Lance format](https://www.github.com/lancedb/lance)."

@@ -108,7 +116,7 @@ Once created, you can open a table using the following code:

 === "Javascript"
     ```javascript
-    const tbl = await db.openTable("my_table");
+    const tbl = await db.openTable("myTable");
     ```

 If you forget the name of your table, you can always get a listing of all table names:

@@ -194,10 +202,17 @@ Use the `drop_table()` method on the database to remove a table.
     db.drop_table("my_table")
     ```

     This permanently removes the table and is not recoverable, unlike deleting rows.
     By default, if the table does not exist an exception is raised. To suppress this,
     you can pass in `ignore_missing=True`.

+=== "JavaScript"
+    ```javascript
+    await db.dropTable('myTable')
+    ```
+
+    This permanently removes the table and is not recoverable, unlike deleting rows.
+    If the table does not exist an exception is raised.
+
 ## What's next
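The overwrite and ignore_missing behaviors described in the hunk above are easy to exercise from the Python SDK; a minimal sketch (the database path is illustrative):

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")
data = [{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}]

# Creating a table that already exists raises by default.
tbl = db.create_table("my_table", data=data)

# Recreate it from scratch instead of raising.
tbl = db.create_table("my_table", data=data, mode="overwrite")

# drop_table raises on a missing table unless ignore_missing=True.
db.drop_table("my_table")
db.drop_table("my_table", ignore_missing=True)
```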
@@ -201,8 +201,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
     ```javascript
     data
     const tb = await db.createTable("my_table",
-        data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
+        [{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
               {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
     ```

 !!! info "Note"
@@ -118,4 +118,101 @@ However, fast vector search using indices often entails making a trade-off with
 This is why it is often called **Approximate Nearest Neighbors (ANN)** search, while the Flat Search (KNN)
 always returns 100% recall.

 See [ANN Index](ann_indexes.md) for more details.
+
+### Output formats
+
+LanceDB returns results in many different formats commonly used in python.
+Let's create a LanceDB table with a nested schema:
+
+```python
+from datetime import datetime
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+import numpy as np
+from pydantic import BaseModel
+uri = "data/sample-lancedb-nested"
+
+class Metadata(BaseModel):
+    source: str
+    timestamp: datetime
+
+class Document(BaseModel):
+    content: str
+    meta: Metadata
+
+class LanceSchema(LanceModel):
+    id: str
+    vector: Vector(1536)
+    payload: Document
+
+# Let's add 100 sample rows to our dataset
+data = [LanceSchema(
+    id=f"id{i}",
+    vector=np.random.randn(1536),
+    payload=Document(
+        content=f"document{i}", meta=Metadata(source=f"source{i%10}", timestamp=datetime.now())
+    ),
+) for i in range(100)]
+
+tbl = db.create_table("documents", data=data)
+```
+
+#### As a pyarrow table
+
+Using `to_arrow()` we can get the results back as a pyarrow Table.
+This result table has the same columns as the LanceDB table, with
+the addition of an `_distance` column for vector search or a `score`
+column for full text search.
+
+```python
+tbl.search(np.random.randn(1536)).to_arrow()
+```
+
+#### As a pandas dataframe
+
+You can also get the results as a pandas dataframe.
+
+```python
+tbl.search(np.random.randn(1536)).to_pandas()
+```
+
+While other formats like Arrow/Pydantic/Python dicts have a natural
+way to handle nested schemas, pandas can only store nested data as a
+python dict column, which makes it difficult to support nested references.
+So for convenience, you can also tell LanceDB to flatten a nested schema
+when creating the pandas dataframe.
+
+```python
+tbl.search(np.random.randn(1536)).to_pandas(flatten=True)
+```
+
+If your table has a deeply nested struct, you can control how many levels
+of nesting to flatten by passing in a positive integer.
+
+```python
+tbl.search(np.random.randn(1536)).to_pandas(flatten=1)
+```
+
+#### As a list of python dicts
+
+You can of course return results as a list of python dicts.
+
+```python
+tbl.search(np.random.randn(1536)).to_list()
+```
+
+#### As a list of pydantic models
+
+We can add data using pydantic models, and we can certainly
+retrieve results as pydantic models
+
+```python
+tbl.search(np.random.randn(1536)).to_pydantic(LanceSchema)
+```
+
+Note that in this case the extra `_distance` field is discarded since
+it's not part of the LanceSchema.
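One thing to note when running the nested-schema walkthrough above: it defines `uri` but then calls `db.create_table(...)`, so a connection step is presumably implied. A sketch of the assumed glue, plus a round trip through the pydantic output format:

```python
import lancedb

uri = "data/sample-lancedb-nested"
db = lancedb.connect(uri)  # assumed step; the walkthrough uses `db` without connecting

# After creating `tbl` as shown above, results come back as typed models:
hits = tbl.search(np.random.randn(1536)).limit(5).to_pydantic(LanceSchema)
assert all(isinstance(h, LanceSchema) for h in hits)
```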
@@ -22,7 +22,7 @@ import numpy as np
 uri = "data/sample-lancedb"
 db = lancedb.connect(uri)

-data = [{"vector": row, "item": f"item {i}"}
+data = [{"vector": row, "item": f"item {i}", "id": i}
      for i, row in enumerate(np.random.random((10_000, 2)).astype('int'))]

 tbl = db.create_table("my_vectors", data=data)

@@ -35,33 +35,25 @@ const db = await vectordb.connect('data/sample-lancedb')

 let data = []
 for (let i = 0; i < 10_000; i++) {
-    data.push({vector: Array(1536).fill(i), id: `${i}`, content: "", longId: `${i}`},)
+    data.push({vector: Array(1536).fill(i), id: i, item: `item ${i}`, strId: `${i}`})
 }
-const tbl = await db.createTable('my_vectors', data)
+const tbl = await db.createTable('myVectors', data)
 ```
 -->
 === "Python"

     ```python
     tbl.search([100, 102]) \
-       .where("""(
-           (label IN [10, 20])
-           AND
-           (note.email IS NOT NULL)
-       ) OR NOT note.created
-       """)
+       .where("(item IN ('item 0', 'item 2')) AND (id > 10)") \
+       .to_arrow()

     ```

 === "Javascript"

     ```javascript
-    tbl.search([100, 102])
-       .where(`(
-           (label IN [10, 20])
-           AND
-           (note.email IS NOT NULL)
-       ) OR NOT note.created
-       `)
+    await tbl.search(Array(1536).fill(0))
+       .where("(item IN ('item 0', 'item 2')) AND (id > 10)")
+       .execute()
     ```

@@ -118,3 +110,22 @@ The mapping from SQL types to Arrow types is:

 [^1]: See precision mapping in previous table.
+
+## Filtering without Vector Search
+
+You can also filter your data without search.
+
+=== "Python"
+    ```python
+    tbl.search().where("id=10").limit(10).to_arrow()
+    ```
+
+=== "JavaScript"
+    ```javascript
+    await tbl.where('id=10').limit(10).execute()
+    ```
+
+!!! warning
+    If your table is large, this could potentially return a very large
+    amount of data. Please be sure to use a `limit` clause unless
+    you're sure you want to return the whole result set.
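Putting the updated SQL-filtering examples together end to end; this sketch uses only calls shown on this page, with float vectors instead of the snippet's `astype('int')` so the search vector is meaningful:

```python
import lancedb
import numpy as np

db = lancedb.connect("data/sample-lancedb")
data = [{"vector": row, "item": f"item {i}", "id": i}
        for i, row in enumerate(np.random.random((10_000, 2)).astype("float32"))]
tbl = db.create_table("my_vectors", data=data)

# Vector search narrowed by a SQL predicate on the new `item` and `id` columns.
hits = (tbl.search([100, 102])
           .where("(item IN ('item 0', 'item 2')) AND (id > 10)")
           .to_arrow())

# Pure SQL filtering, no query vector; keep a limit on large tables.
rows = tbl.search().where("id = 10").limit(10).to_arrow()
```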
node/package-lock.json (generated, 74 lines changed)

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.3.9",
+  "version": "0.4.0",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.3.9",
+      "version": "0.4.0",
       "cpu": [
         "x64",
         "arm64"
@@ -53,11 +53,11 @@
         "uuid": "^9.0.0"
       },
       "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.3.9",
-        "@lancedb/vectordb-darwin-x64": "0.3.9",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.3.9",
-        "@lancedb/vectordb-linux-x64-gnu": "0.3.9",
-        "@lancedb/vectordb-win32-x64-msvc": "0.3.9"
+        "@lancedb/vectordb-darwin-arm64": "0.4.0",
+        "@lancedb/vectordb-darwin-x64": "0.4.0",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.4.0",
+        "@lancedb/vectordb-linux-x64-gnu": "0.4.0",
+        "@lancedb/vectordb-win32-x64-msvc": "0.4.0"
       }
     },
     "node_modules/@apache-arrow/ts": {
@@ -317,9 +317,9 @@
       }
     },
     "node_modules/@lancedb/vectordb-darwin-arm64": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.9.tgz",
-      "integrity": "sha512-irtAdfSRQDcfnMnB8T7D0atLFfu1MMZZ1JaxMKu24DDZ8e4IMYKUplxwvWni3241yA9yDE/pliRZCNQbQCEfrg==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.0.tgz",
+      "integrity": "sha512-cP6zGtBWXEcJHCI4uLNIP5ILtRvexvwmL8Uri1dnHG8dT8g12Ykug3BHO6Wt6wp/xASd2jJRIF/VAJsN9IeP1A==",
       "cpu": [
         "arm64"
       ],
@@ -329,9 +329,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-darwin-x64": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.9.tgz",
-      "integrity": "sha512-4xXQoPheyIl1P5kRoKmZtaAHFrYdL9pw5yq+r6ewIx0TCemN4LSvzSUTqM5nZl3QPU8FeL0CGD8Gt2gMU0HQ2A==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.0.tgz",
+      "integrity": "sha512-ig0gV5ol1sFe2lb1HOatK0rizyj9I91WbnH79i7OdUl3nAQIcWm70CnxrPLtx0DS2NTGh2kFJbYCWcaUlu6YfA==",
       "cpu": [
         "x64"
       ],
@@ -341,9 +341,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.9.tgz",
-      "integrity": "sha512-WIxCZKnLeSlz0PGURtKSX6hJ4CYE2o5P+IFmmuWOWB1uNapQu6zOpea6rNxcRFHUA0IJdO02lVxVfn2hDX4SMg==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.0.tgz",
+      "integrity": "sha512-gMXIDT2kriAPDwWIRKXdaTCNdOeFGEok1S9Y30AOruHXddW1vCIo4JNJIYbBqHnwAeI4wI3ae6GRCFaf1UxO3g==",
       "cpu": [
         "arm64"
       ],
@@ -353,9 +353,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.9.tgz",
-      "integrity": "sha512-bQbcV9adKzYbJLNzDjk9OYsMnT2IjmieLfb4IQ1hj5IUoWfbg80Bd0+gZUnrmrhG6fe56TIriFZYQR9i7TSE9Q==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.0.tgz",
+      "integrity": "sha512-ZQ3lDrDSz1IKdx/mS9Lz08agFO+OD5oSFrrcFNCoT1+H93eS1mCLdmCoEARu3jKbx0tMs38l5J9yXZ2QmJye3w==",
       "cpu": [
         "x64"
       ],
@@ -365,9 +365,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.9.tgz",
-      "integrity": "sha512-7EXI7P1QvAfgJNPWWBMDOkoJ696gSBAClcyEJNYg0JV21jVFZRwJVI3bZXflesWduFi/mTuzPkFFA68us1u19A==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.0.tgz",
+      "integrity": "sha512-toNcNwBRE1sdsSf5hr7W8QiqZ33csc/knVEek4CyvYkZHJGh4Z6WI+DJUIASo5wzUez4TX7qUPpRPL9HuaPMCg==",
       "cpu": [
         "x64"
       ],
@@ -4869,33 +4869,33 @@
       }
     },
     "@lancedb/vectordb-darwin-arm64": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.9.tgz",
-      "integrity": "sha512-irtAdfSRQDcfnMnB8T7D0atLFfu1MMZZ1JaxMKu24DDZ8e4IMYKUplxwvWni3241yA9yDE/pliRZCNQbQCEfrg==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.0.tgz",
+      "integrity": "sha512-cP6zGtBWXEcJHCI4uLNIP5ILtRvexvwmL8Uri1dnHG8dT8g12Ykug3BHO6Wt6wp/xASd2jJRIF/VAJsN9IeP1A==",
       "optional": true
     },
     "@lancedb/vectordb-darwin-x64": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.9.tgz",
-      "integrity": "sha512-4xXQoPheyIl1P5kRoKmZtaAHFrYdL9pw5yq+r6ewIx0TCemN4LSvzSUTqM5nZl3QPU8FeL0CGD8Gt2gMU0HQ2A==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.0.tgz",
+      "integrity": "sha512-ig0gV5ol1sFe2lb1HOatK0rizyj9I91WbnH79i7OdUl3nAQIcWm70CnxrPLtx0DS2NTGh2kFJbYCWcaUlu6YfA==",
      "optional": true
     },
     "@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.9.tgz",
-      "integrity": "sha512-WIxCZKnLeSlz0PGURtKSX6hJ4CYE2o5P+IFmmuWOWB1uNapQu6zOpea6rNxcRFHUA0IJdO02lVxVfn2hDX4SMg==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.0.tgz",
+      "integrity": "sha512-gMXIDT2kriAPDwWIRKXdaTCNdOeFGEok1S9Y30AOruHXddW1vCIo4JNJIYbBqHnwAeI4wI3ae6GRCFaf1UxO3g==",
       "optional": true
     },
     "@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.9.tgz",
-      "integrity": "sha512-bQbcV9adKzYbJLNzDjk9OYsMnT2IjmieLfb4IQ1hj5IUoWfbg80Bd0+gZUnrmrhG6fe56TIriFZYQR9i7TSE9Q==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.0.tgz",
+      "integrity": "sha512-ZQ3lDrDSz1IKdx/mS9Lz08agFO+OD5oSFrrcFNCoT1+H93eS1mCLdmCoEARu3jKbx0tMs38l5J9yXZ2QmJye3w==",
       "optional": true
     },
     "@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.3.9",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.9.tgz",
-      "integrity": "sha512-7EXI7P1QvAfgJNPWWBMDOkoJ696gSBAClcyEJNYg0JV21jVFZRwJVI3bZXflesWduFi/mTuzPkFFA68us1u19A==",
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.0.tgz",
+      "integrity": "sha512-toNcNwBRE1sdsSf5hr7W8QiqZ33csc/knVEek4CyvYkZHJGh4Z6WI+DJUIASo5wzUez4TX7qUPpRPL9HuaPMCg==",
       "optional": true
     },
     "@neon-rs/cli": {
@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.3.9",
+  "version": "0.4.0",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -81,10 +81,10 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-arm64": "0.3.9",
-    "@lancedb/vectordb-darwin-x64": "0.3.9",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.3.9",
-    "@lancedb/vectordb-linux-x64-gnu": "0.3.9",
-    "@lancedb/vectordb-win32-x64-msvc": "0.3.9"
+    "@lancedb/vectordb-darwin-arm64": "0.4.0",
+    "@lancedb/vectordb-darwin-x64": "0.4.0",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.4.0",
+    "@lancedb/vectordb-linux-x64-gnu": "0.4.0",
+    "@lancedb/vectordb-win32-x64-msvc": "0.4.0"
   }
 }
@@ -744,6 +744,11 @@ export interface IvfPQIndexConfig {
    */
   replace?: boolean

+  /**
+   * Cache size of the index
+   */
+  index_cache_size?: number
+
   type: 'ivf_pq'
 }
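The new `index_cache_size` field mirrors a knob that the Python SDK passes through `LanceTable.create_index` (see the `table.py` hunk near the end of this diff); a sketch of setting it from Python, with the numbers purely illustrative:

```python
# Build an IVF_PQ index and bound how many index pages stay cached in memory.
tbl.create_index(
    metric="L2",
    num_partitions=256,    # illustrative tuning values
    num_sub_vectors=96,
    index_cache_size=512,  # counterpart of IvfPQIndexConfig.index_cache_size
)
```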
@@ -25,6 +25,7 @@ import { Vector, Table as ArrowTable } from 'apache-arrow'
 import { HttpLancedbClient } from './client'
 import { isEmbeddingFunction } from '../embedding/embedding_function'
 import { createEmptyTable, fromRecordsToStreamBuffer, fromTableToStreamBuffer } from '../arrow'
+import { toSQL } from '../util'

 /**
  * Remote connection.
@@ -56,8 +57,8 @@ export class RemoteConnection implements Connection {
     return 'db://' + this._client.uri
   }

-  async tableNames (): Promise<string[]> {
-    const response = await this._client.get('/v1/table/')
+  async tableNames (pageToken: string = '', limit: number = 10): Promise<string[]> {
+    const response = await this._client.get('/v1/table/', { limit, page_token: pageToken })
     return response.data.tables
   }
@@ -194,6 +195,17 @@ export class RemoteTable<T = number[]> implements Table<T> {
     return this._name
   }

+  get schema (): Promise<any> {
+    return this._client.post(`/v1/table/${this._name}/describe/`).then(res => {
+      if (res.status !== 200) {
+        throw new Error(`Server Error, status: ${res.status}, ` +
+          // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
+          `message: ${res.statusText}: ${res.data}`)
+      }
+      return res.data?.schema
+    })
+  }
+
   search (query: T): Query<T> {
     return new RemoteQuery(query, this._client, this._name)//, this._embeddings_new)
   }
@@ -234,8 +246,41 @@ export class RemoteTable<T = number[]> implements Table<T> {
     return data.length
   }

-  async createIndex (indexParams: VectorIndexParams): Promise<any> {
-    throw new Error('Not implemented')
+  async createIndex (indexParams: VectorIndexParams): Promise<void> {
+    const unsupportedParams = [
+      'index_name',
+      'num_partitions',
+      'max_iters',
+      'use_opq',
+      'num_sub_vectors',
+      'num_bits',
+      'max_opq_iters',
+      'replace'
+    ]
+    for (const param of unsupportedParams) {
+      // eslint-disable-next-line @typescript-eslint/strict-boolean-expressions
+      if (indexParams[param as keyof VectorIndexParams]) {
+        throw new Error(`${param} is not supported for remote connections`)
+      }
+    }
+
+    const column = indexParams.column ?? 'vector'
+    const indexType = 'vector' // only vector index is supported for remote connections
+    const metricType = indexParams.metric_type ?? 'L2'
+    const indexCacheSize = indexParams ?? null
+
+    const data = {
+      column,
+      index_type: indexType,
+      metric_type: metricType,
+      index_cache_size: indexCacheSize
+    }
+    const res = await this._client.post(`/v1/table/${this._name}/create_index/`, data)
+    if (res.status !== 200) {
+      throw new Error(`Server Error, status: ${res.status}, ` +
+        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
+        `message: ${res.statusText}: ${res.data}`)
+    }
   }

   async countRows (): Promise<number> {
@@ -248,7 +293,23 @@ export class RemoteTable<T = number[]> implements Table<T> {
   }

   async update (args: UpdateArgs | UpdateSqlArgs): Promise<void> {
-    throw new Error('Not implemented')
+    let filter: string | null
+    let updates: Record<string, string>
+
+    if ('valuesSql' in args) {
+      filter = args.where ?? null
+      updates = args.valuesSql
+    } else {
+      filter = args.where ?? null
+      updates = {}
+      for (const [key, value] of Object.entries(args.values)) {
+        updates[key] = toSQL(value)
+      }
+    }
+    await this._client.post(`/v1/table/${this._name}/update/`, {
+      predicate: filter,
+      updates: Object.entries(updates).map(([key, value]) => [key, value])
+    })
   }

   async listIndices (): Promise<VectorIndex[]> {
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.5
+current_version = 0.4.0
 commit = True
 message = [python] Bump version: {current_version} → {new_version}
 tag = True
@@ -23,7 +23,7 @@ from overrides import EnforceOverrides, override
 from pyarrow import fs

 from .table import LanceTable, Table
-from .util import fs_from_uri, get_uri_location, get_uri_scheme
+from .util import fs_from_uri, get_uri_location, get_uri_scheme, join_uri

 if TYPE_CHECKING:
     from .common import DATA, URI
@@ -288,14 +288,13 @@ class LanceDBConnection(DBConnection):
         A list of table names.
         """
         try:
-            filesystem, path = fs_from_uri(self.uri)
+            filesystem = fs_from_uri(self.uri)[0]
         except pa.ArrowInvalid:
             raise NotImplementedError("Unsupported scheme: " + self.uri)

         try:
-            paths = filesystem.get_file_info(
-                fs.FileSelector(get_uri_location(self.uri))
-            )
+            loc = get_uri_location(self.uri)
+            paths = filesystem.get_file_info(fs.FileSelector(loc))
         except FileNotFoundError:
             # It is ok if the file does not exist since it will be created
             paths = []
@@ -373,7 +372,7 @@ class LanceDBConnection(DBConnection):
         """
         try:
             filesystem, path = fs_from_uri(self.uri)
-            table_path = os.path.join(path, name + ".lance")
+            table_path = join_uri(path, name + ".lance")
             filesystem.delete_dir(table_path)
         except FileNotFoundError:
             if not ignore_missing:
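The `os.path.join` to `join_uri` swap matters because `os.path.join` uses the platform separator, which breaks object-store URIs on Windows. A rough illustrative reimplementation of what a `join_uri`-style helper has to do (this is a sketch, not the actual `lancedb.util.join_uri`):

```python
import os

def join_uri_sketch(base: str, *parts: str) -> str:
    # Remote URIs (s3://, gs://, az://, ...) always use "/" regardless of OS;
    # only bare local paths should go through os.path.join.
    if "://" in base:
        return "/".join([base.rstrip("/"), *parts])
    return os.path.join(base, *parts)

# On Windows, os.path.join("s3://bucket/db", "t.lance") yields "s3://bucket/db\\t.lance".
assert join_uri_sketch("s3://bucket/db", "t.lance") == "s3://bucket/db/t.lance"
```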
@@ -75,8 +75,14 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
         The number of rows indexed
     """
     # first check the fields exist and are string or large string type
+    nested = []
     for name in fields:
-        f = table.schema.field(name)  # raises KeyError if not found
+        try:
+            f = table.schema.field(name)  # raises KeyError if not found
+        except KeyError:
+            f = resolve_path(table.schema, name)
+            nested.append(name)
+
         if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
             raise TypeError(f"Field {name} is not a string type")

@@ -85,7 +91,16 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
     # write data into index
     dataset = table.to_lance()
     row_id = 0
+
+    max_nested_level = 0
+    if len(nested) > 0:
+        max_nested_level = max([len(name.split(".")) for name in nested])
+
     for b in dataset.to_batches(columns=fields):
+        if max_nested_level > 0:
+            b = pa.Table.from_batches([b])
+            for _ in range(max_nested_level - 1):
+                b = b.flatten()
         for i in range(b.num_rows):
             doc = tantivy.Document()
             doc.add_integer("doc_id", row_id)
@@ -98,6 +113,30 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
     return row_id


+def resolve_path(schema, field_name: str) -> pa.Field:
+    """
+    Resolve a nested field path to a list of field names
+
+    Parameters
+    ----------
+    field_name : str
+        The field name to resolve
+
+    Returns
+    -------
+    List[str]
+        The resolved path
+    """
+    path = field_name.split(".")
+    field = schema.field(path.pop(0))
+    for segment in path:
+        if pa.types.is_struct(field.type):
+            field = field.type.field(segment)
+        else:
+            raise KeyError(f"field {field_name} not found in schema {schema}")
+    return field
+
+
 def search_index(
     index: tantivy.Index, query: str, limit: int = 10
 ) -> Tuple[Tuple[int], Tuple[float]]:
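With `resolve_path`, a dotted field name can now reach into struct columns when building the full-text index, and `populate_index` flattens each batch enough times to expose that leaf (two flattens for a three-segment path). A hedged sketch of what this enables, reusing the nested `documents` schema from the docs example earlier in this diff:

```python
import lancedb

db = lancedb.connect("data/sample-lancedb-nested")
tbl = db.open_table("documents")  # has a payload.meta.source string leaf

# "payload.meta.source" no longer raises KeyError: resolve_path walks the
# struct schema, and each batch is flattened len(path) - 1 = 2 times.
tbl.create_fts_index("payload.meta.source")
```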
@@ -348,3 +348,20 @@ def get_extras(field_info: pydantic.fields.FieldInfo, key: str) -> Any:
     if PYDANTIC_VERSION.major >= 2:
         return (field_info.json_schema_extra or {}).get(key)
     return (field_info.field_info.extra or {}).get("json_schema_extra", {}).get(key)
+
+
+if PYDANTIC_VERSION.major < 2:
+
+    def model_to_dict(model: pydantic.BaseModel) -> Dict[str, Any]:
+        """
+        Convert a Pydantic model to a dictionary.
+        """
+        return model.dict()
+
+else:
+
+    def model_to_dict(model: pydantic.BaseModel) -> Dict[str, Any]:
+        """
+        Convert a Pydantic model to a dictionary.
+        """
+        return model.model_dump()
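The `model_to_dict` shim exists because Pydantic v2 renamed `BaseModel.dict()` to `model_dump()`; dispatching once at import time keeps call sites like `_sanitize_data` (further down in this diff) version-agnostic. A small usage sketch, borrowing the `Metadata` model from the docs example:

```python
from datetime import datetime
from lancedb.pydantic import model_to_dict

m = Metadata(source="source0", timestamp=datetime.now())
d = model_to_dict(m)  # a plain dict on both Pydantic v1 and v2
assert d["source"] == "source0"
```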
@@ -185,14 +185,40 @@ class LanceQueryBuilder(ABC):
         """
         return self.to_pandas()

-    def to_pandas(self) -> "pd.DataFrame":
+    def to_pandas(self, flatten: Optional[Union[int, bool]] = None) -> "pd.DataFrame":
         """
         Execute the query and return the results as a pandas DataFrame.
         In addition to the selected columns, LanceDB also returns a vector
         and also the "_distance" column which is the distance between the query
         vector and the returned vector.
+
+        Parameters
+        ----------
+        flatten: Optional[Union[int, bool]]
+            If flatten is True, flatten all nested columns.
+            If flatten is an integer, flatten the nested columns up to the
+            specified depth.
+            If unspecified, do not flatten the nested columns.
         """
-        return self.to_arrow().to_pandas()
+        tbl = self.to_arrow()
+        if flatten is True:
+            while True:
+                tbl = tbl.flatten()
+                has_struct = False
+                # loop through all columns to check if there is any struct column
+                if any(pa.types.is_struct(col.type) for col in tbl.schema):
+                    continue
+                else:
+                    break
+        elif isinstance(flatten, int):
+            if flatten <= 0:
+                raise ValueError(
+                    "Please specify a positive integer for flatten or the boolean value `True`"
+                )
+            while flatten > 0:
+                tbl = tbl.flatten()
+                flatten -= 1
+        return tbl.to_pandas()

     @abstractmethod
     def to_arrow(self) -> pa.Table:
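The flatten loop above is built on `pyarrow.Table.flatten()`, which unnests exactly one level of struct columns per call and renames children to `parent.child`; a standalone sketch of that underlying behavior:

```python
import pyarrow as pa

tbl = pa.table({
    "id": [1, 2],
    "payload": [
        {"content": "a", "meta": {"source": "s0"}},
        {"content": "b", "meta": {"source": "s1"}},
    ],
})

once = tbl.flatten()    # id, payload.content, payload.meta (meta is still a struct)
twice = once.flatten()  # id, payload.content, payload.meta.source
print(twice.column_names)
```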
@@ -18,6 +18,8 @@ import attrs
 import pyarrow as pa
 from pydantic import BaseModel

+from lancedb.common import VECTOR_COLUMN_NAME
+
 __all__ = ["LanceDBClient", "VectorQuery", "VectorQueryResult"]

@@ -43,6 +45,8 @@ class VectorQuery(BaseModel):

     refine_factor: Optional[int] = None

+    vector_column: str = VECTOR_COLUMN_NAME
+

 @attrs.define
 class VectorQueryResult:
@@ -13,7 +13,7 @@

 import uuid
 from functools import cached_property
-from typing import Optional, Union
+from typing import Dict, Optional, Union

 import pyarrow as pa
 from lance import json_to_schema
@@ -22,6 +22,7 @@ from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME

 from ..query import LanceVectorQueryBuilder
 from ..table import Query, Table, _sanitize_data
+from ..util import value_to_sql
 from .arrow import to_ipc_binary
 from .client import ARROW_STREAM_CONTENT_TYPE
 from .db import RemoteDBConnection
@@ -273,3 +274,65 @@ class RemoteTable(Table):
         self._conn._loop.run_until_complete(
             self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)
         )
+
+    def update(
+        self,
+        where: Optional[str] = None,
+        values: Optional[dict] = None,
+        *,
+        values_sql: Optional[Dict[str, str]] = None,
+    ):
+        """
+        This can be used to update zero to all rows depending on how many
+        rows match the where clause.
+
+        Parameters
+        ----------
+        where: str, optional
+            The SQL where clause to use when updating rows. For example, 'x = 2'
+            or 'x IN (1, 2, 3)'. The filter must not be empty, or it will error.
+        values: dict, optional
+            The values to update. The keys are the column names and the values
+            are the values to set.
+        values_sql: dict, optional
+            The values to update, expressed as SQL expression strings. These can
+            reference existing columns. For example, {"x": "x + 1"} will increment
+            the x column by 1.
+
+        Examples
+        --------
+        >>> import lancedb
+        >>> data = [
+        ...    {"x": 1, "vector": [1, 2]},
+        ...    {"x": 2, "vector": [3, 4]},
+        ...    {"x": 3, "vector": [5, 6]}
+        ... ]
+        >>> db = lancedb.connect("db://...", api_key="...", region="...")  # doctest: +SKIP
+        >>> table = db.create_table("my_table", data)  # doctest: +SKIP
+        >>> table.to_pandas()  # doctest: +SKIP
+           x      vector  # doctest: +SKIP
+        0  1  [1.0, 2.0]  # doctest: +SKIP
+        1  2  [3.0, 4.0]  # doctest: +SKIP
+        2  3  [5.0, 6.0]  # doctest: +SKIP
+        >>> table.update(where="x = 2", values={"vector": [10, 10]})  # doctest: +SKIP
+        >>> table.to_pandas()  # doctest: +SKIP
+           x        vector  # doctest: +SKIP
+        0  1    [1.0, 2.0]  # doctest: +SKIP
+        1  3    [5.0, 6.0]  # doctest: +SKIP
+        2  2  [10.0, 10.0]  # doctest: +SKIP
+
+        """
+        if values is not None and values_sql is not None:
+            raise ValueError("Only one of values or values_sql can be provided")
+        if values is None and values_sql is None:
+            raise ValueError("Either values or values_sql must be provided")
+
+        if values is not None:
+            updates = [[k, value_to_sql(v)] for k, v in values.items()]
+        else:
+            updates = [[k, v] for k, v in values_sql.items()]
+
+        payload = {"predicate": where, "updates": updates}
+        self._conn._loop.run_until_complete(
+            self._conn._client.post(f"/v1/table/{self._name}/update/", data=payload)
+        )
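The `values` / `values_sql` split documented above also exists on the local `LanceTable.update` (the local class imports the same `value_to_sql` helper, as the next hunk shows); a hedged sketch of both forms against a local table:

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")
table = db.create_table("counters", [
    {"x": 1, "vector": [1.0, 2.0]},
    {"x": 2, "vector": [3.0, 4.0]},
])

# Literal values: keys are column names, values are the new cell values.
table.update(where="x = 2", values={"vector": [10.0, 10.0]})

# SQL expressions: right-hand sides may reference existing columns.
table.update(where="x < 10", values_sql={"x": "x + 1"})
```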
@@ -23,14 +23,15 @@ import lance
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
+import pyarrow.fs as pa_fs
 from lance import LanceDataset
 from lance.vector import vec_to_table

 from .common import DATA, VEC, VECTOR_COLUMN_NAME
 from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
-from .pydantic import LanceModel
+from .pydantic import LanceModel, model_to_dict
 from .query import LanceQueryBuilder, Query
-from .util import fs_from_uri, safe_import_pandas, value_to_sql
+from .util import fs_from_uri, safe_import_pandas, value_to_sql, join_uri
 from .utils.events import register_event

 if TYPE_CHECKING:
@@ -53,8 +54,10 @@ def _sanitize_data(
         # convert to list of dict if data is a bunch of LanceModels
         if isinstance(data[0], LanceModel):
             schema = data[0].__class__.to_arrow_schema()
-            data = [dict(d) for d in data]
-            data = pa.Table.from_pylist(data)
+            data = [model_to_dict(d) for d in data]
+            data = pa.Table.from_pylist(data, schema=schema)
+        else:
+            data = pa.Table.from_pylist(data)
     elif isinstance(data, dict):
         data = vec_to_table(data)
     elif pd is not None and isinstance(data, pd.DataFrame):
@@ -394,14 +397,6 @@ class LanceTable(Table):
         self.name = name
         self._version = version

-    def _reset_dataset(self, version=None):
-        try:
-            if "_dataset" in self.__dict__:
-                del self.__dict__["_dataset"]
-            self._version = version
-        except AttributeError:
-            pass
-
     @property
     def schema(self) -> pa.Schema:
         """Return the schema of the table.
@@ -410,16 +405,16 @@ class LanceTable(Table):
         -------
         pa.Schema
             A PyArrow schema object."""
-        return self._dataset.schema
+        return self.to_lance().schema

     def list_versions(self):
         """List all versions of the table"""
-        return self._dataset.versions()
+        return self.to_lance().versions()

     @property
     def version(self) -> int:
         """Get the current version of the table"""
-        return self._dataset.version
+        return self.to_lance().version

     def checkout(self, version: int):
         """Checkout a version of the table. This is an in-place operation.
@@ -452,14 +447,12 @@ class LanceTable(Table):
         vector type
         0  [1.1, 0.9]  vector
         """
-        max_ver = max([v["version"] for v in self._dataset.versions()])
+        max_ver = max([v["version"] for v in self.to_lance().versions()])
         if version < 1 or version > max_ver:
             raise ValueError(f"Invalid version {version}")
-        self._reset_dataset(version=version)

         try:
-            # Accessing the property updates the cached value
-            _ = self._dataset
+            self.to_lance().checkout(version)
         except Exception as e:
             if "not found" in str(e):
                 raise ValueError(
@@ -502,7 +495,7 @@ class LanceTable(Table):
         >>> len(table.list_versions())
         4
         """
-        max_ver = max([v["version"] for v in self._dataset.versions()])
+        max_ver = max([v["version"] for v in self.to_lance().versions()])
         if version is None:
             version = self.version
         elif version < 1 or version > max_ver:
@@ -514,11 +507,10 @@ class LanceTable(Table):
             # no-op if restoring the latest version
             return

-        self._dataset.restore()
-        self._reset_dataset()
+        self.to_lance().restore()

     def __len__(self):
-        return self._dataset.count_rows()
+        return self.to_lance().count_rows()

     def __repr__(self) -> str:
         return f"LanceTable({self.name})"
@@ -528,7 +520,7 @@ class LanceTable(Table):

     def head(self, n=5) -> pa.Table:
         """Return the first n rows of the table."""
-        return self._dataset.head(n)
+        return self.to_lance().head(n)

     def to_pandas(self) -> "pd.DataFrame":
         """Return the table as a pandas DataFrame.
@@ -545,11 +537,11 @@ class LanceTable(Table):
         Returns
         -------
         pa.Table"""
-        return self._dataset.to_table()
+        return self.to_lance().to_table()

     @property
     def _dataset_uri(self) -> str:
-        return os.path.join(self._conn.uri, f"{self.name}.lance")
+        return join_uri(self._conn.uri, f"{self.name}.lance")

     def create_index(
         self,
@@ -572,10 +564,11 @@ class LanceTable(Table):
             accelerator=accelerator,
             index_cache_size=index_cache_size,
         )
-        self._reset_dataset()
         register_event("create_index")

-    def create_fts_index(self, field_names: Union[str, List[str]]):
+    def create_fts_index(
+        self, field_names: Union[str, List[str]], *, replace: bool = False
+    ):
         """Create a full-text search index on the table.

         Warning - this API is highly experimental and is highly likely to change
@@ -585,17 +578,35 @@ class LanceTable(Table):
         ----------
         field_names: str or list of str
             The name(s) of the field to index.
+        replace: bool, default False
+            If True, replace the existing index if it exists. Note that this is
+            not yet an atomic operation; the index will be temporarily
+            unavailable while the new index is being created.
         """
         from .fts import create_index, populate_index

         if isinstance(field_names, str):
             field_names = [field_names]
+
+        fs, path = fs_from_uri(self._get_fts_index_path())
+        index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
+        if index_exists:
|
if not replace:
|
||||||
|
raise ValueError(
|
||||||
|
f"Index already exists. Use replace=True to overwrite."
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
fs.delete_dir(path)
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
if "Cannot get information for path" in str(e):
|
||||||
|
pass
|
||||||
|
|
||||||
index = create_index(self._get_fts_index_path(), field_names)
|
index = create_index(self._get_fts_index_path(), field_names)
|
||||||
populate_index(index, self, field_names)
|
populate_index(index, self, field_names)
|
||||||
register_event("create_fts_index")
|
register_event("create_fts_index")
|
||||||
|
|
||||||
def _get_fts_index_path(self):
|
def _get_fts_index_path(self):
|
||||||
return os.path.join(self._dataset_uri, "_indices", "tantivy")
|
return join_uri(self._dataset_uri, "_indices", "tantivy")
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def _dataset(self) -> LanceDataset:
|
def _dataset(self) -> LanceDataset:
|
||||||
@@ -643,8 +654,7 @@ class LanceTable(Table):
|
|||||||
on_bad_vectors=on_bad_vectors,
|
on_bad_vectors=on_bad_vectors,
|
||||||
fill_value=fill_value,
|
fill_value=fill_value,
|
||||||
)
|
)
|
||||||
lance.write_dataset(data, self._dataset_uri, schema=self.schema, mode=mode)
|
self.to_lance().write(data, mode=mode)
|
||||||
self._reset_dataset()
|
|
||||||
register_event("add")
|
register_event("add")
|
||||||
|
|
||||||
def merge(
|
def merge(
|
||||||
@@ -705,10 +715,9 @@ class LanceTable(Table):
|
|||||||
other_table = other_table.to_lance()
|
other_table = other_table.to_lance()
|
||||||
if isinstance(other_table, LanceDataset):
|
if isinstance(other_table, LanceDataset):
|
||||||
other_table = other_table.to_table()
|
other_table = other_table.to_table()
|
||||||
self._dataset.merge(
|
self.to_lance().merge(
|
||||||
other_table, left_on=left_on, right_on=right_on, schema=schema
|
other_table, left_on=left_on, right_on=right_on, schema=schema
|
||||||
)
|
)
|
||||||
self._reset_dataset()
|
|
||||||
register_event("merge")
|
register_event("merge")
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
@@ -911,7 +920,7 @@ class LanceTable(Table):
|
|||||||
return tbl
|
return tbl
|
||||||
|
|
||||||
def delete(self, where: str):
|
def delete(self, where: str):
|
||||||
self._dataset.delete(where)
|
self.to_lance().delete(where)
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
@@ -966,7 +975,6 @@ class LanceTable(Table):
|
|||||||
values_sql = {k: value_to_sql(v) for k, v in values.items()}
|
values_sql = {k: value_to_sql(v) for k, v in values.items()}
|
||||||
|
|
||||||
self.to_lance().update(values_sql, where)
|
self.to_lance().update(values_sql, where)
|
||||||
self._reset_dataset()
|
|
||||||
register_event("update")
|
register_event("update")
|
||||||
|
|
||||||
def _execute_query(self, query: Query) -> pa.Table:
|
def _execute_query(self, query: Query) -> pa.Table:
|
||||||
|
|||||||
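Taken together, the table hunks above swap every direct `self._dataset` access for `self.to_lance()`, drop the `_reset_dataset` cache invalidation, and give `create_fts_index` a keyword-only `replace` flag. A minimal usage sketch of the new flag (the connection path and table name are hypothetical):

import lancedb

db = lancedb.connect("/tmp/lancedb-demo")   # hypothetical local path
tbl = db.open_table("docs")                 # hypothetical table with a "text" column
tbl.create_fts_index("text")                # first build succeeds
tbl.create_fts_index("text", replace=True)  # rebuild; omitting replace=True now raises ValueError
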
@@ -14,7 +14,8 @@
 import os
 from datetime import date, datetime
 from functools import singledispatch
-from typing import Tuple
+import pathlib
+from typing import Tuple, Union
 from urllib.parse import urlparse

 import numpy as np

@@ -62,6 +63,12 @@ def get_uri_location(uri: str) -> str:
         str: Location part of the URL, without scheme
     """
     parsed = urlparse(uri)
+    if len(parsed.scheme) == 1:
+        # Windows drive names are parsed as the scheme
+        # e.g. "c:\path" -> ParseResult(scheme="c", netloc="", path="/path", ...)
+        # So we add special handling here for schemes that are a single character
+        return uri
+
     if not parsed.netloc:
         return parsed.path
     else:

@@ -84,6 +91,29 @@ def fs_from_uri(uri: str) -> Tuple[pa_fs.FileSystem, str]:
     return pa_fs.FileSystem.from_uri(uri)


+def join_uri(base: Union[str, pathlib.Path], *parts: str) -> str:
+    """
+    Join a URI with multiple parts, handles both local and remote paths
+
+    Parameters
+    ----------
+    base : str
+        The base URI
+    parts : str
+        The parts to join to the base URI, each separated by the
+        appropriate path separator for the URI scheme and OS
+    """
+    if isinstance(base, pathlib.Path):
+        return base.joinpath(*parts)
+    base = str(base)
+    if get_uri_scheme(base) == "file":
+        # using pathlib for local paths makes this windows compatible
+        # `get_uri_scheme` returns `file` for windows drive names (e.g. `c:\path`)
+        return str(pathlib.Path(base, *parts))
+    # for remote paths, join the parts with forward slashes
+    return "/".join([p.rstrip("/") for p in [base, *parts]])
+
+
 def safe_import_pandas():
     try:
         import pandas as pd
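For orientation, the behavior `join_uri` implements can be summarized with the cases its tests exercise below (bucket and path names are illustrative):

import pathlib
from lancedb.util import join_uri

# remote URIs: parts joined with "/", trailing slashes on the base stripped
assert join_uri("s3://bucket", "path", "to", "table.lance") == "s3://bucket/path/to/table.lance"
# local paths: delegated to pathlib, so separators follow the host OS
assert join_uri("/absolute/path", "table.lance") == str(pathlib.Path("/absolute/path", "table.lance"))
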
@@ -1,9 +1,9 @@
 [project]
 name = "lancedb"
-version = "0.3.5"
+version = "0.4.0"
 dependencies = [
     "deprecation",
-    "pylance==0.8.21",
+    "pylance==0.9.1",
     "ratelimiter~=1.0",
     "retry>=0.9.2",
     "tqdm>=4.27.0",
@@ -43,7 +43,15 @@ def table(tmp_path) -> ldb.table.LanceTable:
         for _ in range(100)
     ]
     table = db.create_table(
-        "test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
+        "test",
+        data=pd.DataFrame(
+            {
+                "vector": vectors,
+                "text": text,
+                "text2": text,
+                "nested": [{"text": t} for t in text],
+            }
+        ),
     )
     return table

@@ -75,6 +83,24 @@ def test_create_index_from_table(tmp_path, table):
     assert len(df) == 10
     assert "text" in df.columns

+    # Check whether it can be updated
+    table.add(
+        [
+            {
+                "vector": np.random.randn(128),
+                "text": "gorilla",
+                "text2": "gorilla",
+                "nested": {"text": "gorilla"},
+            }
+        ]
+    )
+
+    table.create_fts_index("text", replace=True)
+    assert len(table.search("gorilla").limit(1).to_pandas()) == 1
+
+    with pytest.raises(ValueError, match="already exists"):
+        table.create_fts_index("text")
+

 def test_create_index_multiple_columns(tmp_path, table):
     table.create_fts_index(["text", "text2"])

@@ -89,3 +115,9 @@ def test_empty_rs(tmp_path, table, mocker):
     mocker.patch("lancedb.fts.search_index", return_value=([], []))
     df = table.search("puppy").limit(10).to_pandas()
     assert len(df) == 0
+
+
+def test_nested_schema(tmp_path, table):
+    table.create_fts_index("nested.text")
+    rs = table.search("puppy").limit(10).to_list()
+    assert len(rs) == 10
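The new `test_nested_schema` case relies on a dotted-path syntax for indexing a string field inside a struct column. A short sketch of that usage, assuming a table shaped like the fixture above:

# "nested" is a struct column with a string field "text" (see the fixture above)
table.create_fts_index("nested.text")
hits = table.search("puppy").limit(10).to_list()
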
@@ -22,6 +22,7 @@ import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
+from pydantic import BaseModel

 from lancedb.conftest import MockTextEmbeddingFunction
 from lancedb.db import LanceDBConnection

@@ -141,14 +142,44 @@ def test_add(db):


 def test_add_pydantic_model(db):
-    class TestModel(LanceModel):
-        vector: Vector(16)
-        li: List[int]
-
-    data = TestModel(vector=list(range(16)), li=[1, 2, 3])
-    table = LanceTable.create(db, "test", data=[data])
-    assert len(table) == 1
-    assert table.schema == TestModel.to_arrow_schema()
+    # https://github.com/lancedb/lancedb/issues/562
+
+    class Metadata(BaseModel):
+        source: str
+        timestamp: datetime
+
+    class Document(BaseModel):
+        content: str
+        meta: Metadata
+
+    class LanceSchema(LanceModel):
+        id: str
+        vector: Vector(2)
+        li: List[int]
+        payload: Document
+
+    tbl = LanceTable.create(db, "mytable", schema=LanceSchema, mode="overwrite")
+    assert tbl.schema == LanceSchema.to_arrow_schema()
+
+    # add works
+    expected = LanceSchema(
+        id="id",
+        vector=[0.0, 0.0],
+        li=[1, 2, 3],
+        payload=Document(
+            content="foo", meta=Metadata(source="bar", timestamp=datetime.now())
+        ),
+    )
+    tbl.add([expected])
+
+    result = tbl.search([0.0, 0.0]).limit(1).to_pydantic(LanceSchema)[0]
+    assert result == expected
+
+    flattened = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=1)
+    assert len(flattened.columns) == 6  # _distance is automatically added
+
+    really_flattened = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=True)
+    assert len(really_flattened.columns) == 7


 def _add(table, schema):

@@ -195,39 +226,38 @@ def test_versioning(db):


 def test_create_index_method():
-    with patch.object(LanceTable, "_reset_dataset", return_value=None):
-        with patch.object(
-            LanceTable, "_dataset", new_callable=PropertyMock
-        ) as mock_dataset:
-            # Setup mock responses
-            mock_dataset.return_value.create_index.return_value = None
-
-            # Create a LanceTable object
-            connection = LanceDBConnection(uri="mock.uri")
-            table = LanceTable(connection, "test_table")
-
-            # Call the create_index method
-            table.create_index(
-                metric="L2",
-                num_partitions=256,
-                num_sub_vectors=96,
-                vector_column_name="vector",
-                replace=True,
-                index_cache_size=256,
-            )
-
-            # Check that the _dataset.create_index method was called
-            # with the right parameters
-            mock_dataset.return_value.create_index.assert_called_once_with(
-                column="vector",
-                index_type="IVF_PQ",
-                metric="L2",
-                num_partitions=256,
-                num_sub_vectors=96,
-                replace=True,
-                accelerator=None,
-                index_cache_size=256,
-            )
+    with patch.object(
+        LanceTable, "_dataset", new_callable=PropertyMock
+    ) as mock_dataset:
+        # Setup mock responses
+        mock_dataset.return_value.create_index.return_value = None
+
+        # Create a LanceTable object
+        connection = LanceDBConnection(uri="mock.uri")
+        table = LanceTable(connection, "test_table")
+
+        # Call the create_index method
+        table.create_index(
+            metric="L2",
+            num_partitions=256,
+            num_sub_vectors=96,
+            vector_column_name="vector",
+            replace=True,
+            index_cache_size=256,
+        )
+
+        # Check that the _dataset.create_index method was called
+        # with the right parameters
+        mock_dataset.return_value.create_index.assert_called_once_with(
+            column="vector",
+            index_type="IVF_PQ",
+            metric="L2",
+            num_partitions=256,
+            num_sub_vectors=96,
+            replace=True,
+            accelerator=None,
+            index_cache_size=256,
+        )


 def test_add_with_nans(db):
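The flatten assertions at the end of `test_add_pydantic_model` encode a column-count argument worth spelling out: the table has four top-level columns (id, vector, li, payload) and search appends `_distance`, so one level of unnesting expands only `payload`, while full unnesting also expands `meta`. A sketch, reusing `tbl` from the test above:

# 4 top-level columns + _distance = 5
# flatten=1: payload -> payload.content, payload.meta         -> 6 columns
# flatten=True: payload.meta -> meta.source, meta.timestamp   -> 7 columns
one_level = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=1)
fully_flat = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=True)
assert len(one_level.columns) == 6 and len(fully_flat.columns) == 7
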
@@ -11,7 +11,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from lancedb.util import get_uri_scheme
+import os
+import pathlib
+
+import pytest
+
+from lancedb.util import get_uri_scheme, join_uri


 def test_normalize_uri():

@@ -28,3 +33,55 @@ def test_normalize_uri():
     for uri, expected_scheme in zip(uris, schemes):
         parsed_scheme = get_uri_scheme(uri)
         assert parsed_scheme == expected_scheme
+
+
+def test_join_uri_remote():
+    schemes = ["s3", "az", "gs"]
+    for scheme in schemes:
+        expected = f"{scheme}://bucket/path/to/table.lance"
+        base_uri = f"{scheme}://bucket/path/to/"
+        parts = ["table.lance"]
+        assert join_uri(base_uri, *parts) == expected
+
+        base_uri = f"{scheme}://bucket"
+        parts = ["path", "to", "table.lance"]
+        assert join_uri(base_uri, *parts) == expected
+
+
+# skip this test if on windows
+@pytest.mark.skipif(os.name == "nt", reason="Windows paths are not POSIX")
+def test_join_uri_posix():
+    for base in [
+        # relative path
+        "relative/path",
+        "relative/path/",
+        # an absolute path
+        "/absolute/path",
+        "/absolute/path/",
+        # a file URI
+        "file:///absolute/path",
+        "file:///absolute/path/",
+    ]:
+        joined = join_uri(base, "table.lance")
+        assert joined == str(pathlib.Path(base) / "table.lance")
+        joined = join_uri(pathlib.Path(base), "table.lance")
+        assert joined == pathlib.Path(base) / "table.lance"
+
+
+# skip this test if not on windows
+@pytest.mark.skipif(os.name != "nt", reason="Windows paths are not POSIX")
+def test_local_join_uri_windows():
+    # https://learn.microsoft.com/en-us/dotnet/standard/io/file-path-formats
+    for base in [
+        # windows relative path
+        "relative\\path",
+        "relative\\path\\",
+        # windows absolute path from current drive
+        "c:\\absolute\\path",
+        # relative path from root of current drive
+        "\\relative\\path",
+    ]:
+        joined = join_uri(base, "table.lance")
+        assert joined == str(pathlib.Path(base) / "table.lance")
+        joined = join_uri(pathlib.Path(base), "table.lance")
+        assert joined == pathlib.Path(base) / "table.lance"
@@ -1,6 +1,6 @@
 [package]
 name = "vectordb-node"
-version = "0.3.9"
+version = "0.4.0"
 description = "Serverless, low-latency vector database for AI applications"
 license = "Apache-2.0"
 edition = "2018"
@@ -23,7 +23,7 @@ pub enum Error {
     #[snafu(display("column '{name}' is missing"))]
     MissingColumn { name: String },
     #[snafu(display("{name}: {message}"))]
-    RangeError { name: String, message: String },
+    OutOfRange { name: String, message: String },
     #[snafu(display("{index_type} is not a valid index type"))]
     InvalidIndexType { index_type: String },

@@ -65,12 +65,10 @@ fn get_index_params_builder(
    obj.get_opt::<JsString, _, _>(cx, "index_name")?
        .map(|s| index_builder.index_name(s.value(cx)));

-    obj.get_opt::<JsString, _, _>(cx, "metric_type")?
-        .map(|s| MetricType::try_from(s.value(cx).as_str()))
-        .map(|mt| {
-            let metric_type = mt.unwrap();
-            index_builder.metric_type(metric_type);
-        });
+    if let Some(metric_type) = obj.get_opt::<JsString, _, _>(cx, "metric_type")? {
+        let metric_type = MetricType::try_from(metric_type.value(cx).as_str()).unwrap();
+        index_builder.metric_type(metric_type);
+    }

    let num_partitions = obj.get_opt_usize(cx, "num_partitions")?;
    let max_iters = obj.get_opt_usize(cx, "max_iters")?;

@@ -85,23 +83,29 @@ fn get_index_params_builder(
        index_builder.ivf_params(ivf_params)
    });

-    obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")?
-        .map(|s| pq_params.use_opq = s.value(cx));
+    if let Some(use_opq) = obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")? {
+        pq_params.use_opq = use_opq.value(cx);
+    }

-    obj.get_opt_usize(cx, "num_sub_vectors")?
-        .map(|s| pq_params.num_sub_vectors = s);
+    if let Some(num_sub_vectors) = obj.get_opt_usize(cx, "num_sub_vectors")? {
+        pq_params.num_sub_vectors = num_sub_vectors;
+    }

-    obj.get_opt_usize(cx, "num_bits")?
-        .map(|s| pq_params.num_bits = s);
+    if let Some(num_bits) = obj.get_opt_usize(cx, "num_bits")? {
+        pq_params.num_bits = num_bits;
+    }

-    obj.get_opt_usize(cx, "max_iters")?
-        .map(|s| pq_params.max_iters = s);
+    if let Some(max_iters) = obj.get_opt_usize(cx, "max_iters")? {
+        pq_params.max_iters = max_iters;
+    }

-    obj.get_opt_usize(cx, "max_opq_iters")?
-        .map(|s| pq_params.max_opq_iters = s);
+    if let Some(max_opq_iters) = obj.get_opt_usize(cx, "max_opq_iters")? {
+        pq_params.max_opq_iters = max_opq_iters;
+    }

-    obj.get_opt::<JsBoolean, _, _>(cx, "replace")?
-        .map(|s| index_builder.replace(s.value(cx)));
+    if let Some(replace) = obj.get_opt::<JsBoolean, _, _>(cx, "replace")? {
+        index_builder.replace(replace.value(cx));
+    }

    Ok(index_builder)
 }
@@ -47,15 +47,15 @@ fn f64_to_u32_safe(n: f64, key: &str) -> Result<u32> {
     use conv::*;

     n.approx_as::<u32>().map_err(|e| match e {
-        FloatError::NegOverflow(_) => Error::RangeError {
+        FloatError::NegOverflow(_) => Error::OutOfRange {
             name: key.into(),
             message: "must be > 0".to_string(),
         },
-        FloatError::PosOverflow(_) => Error::RangeError {
+        FloatError::PosOverflow(_) => Error::OutOfRange {
             name: key.into(),
             message: format!("must be < {}", u32::MAX),
         },
-        FloatError::NotANumber(_) => Error::RangeError {
+        FloatError::NotANumber(_) => Error::OutOfRange {
             name: key.into(),
             message: "not a valid number".to_string(),
         },

@@ -66,15 +66,15 @@ fn f64_to_usize_safe(n: f64, key: &str) -> Result<usize> {
     use conv::*;

     n.approx_as::<usize>().map_err(|e| match e {
-        FloatError::NegOverflow(_) => Error::RangeError {
+        FloatError::NegOverflow(_) => Error::OutOfRange {
             name: key.into(),
             message: "must be > 0".to_string(),
         },
-        FloatError::PosOverflow(_) => Error::RangeError {
+        FloatError::PosOverflow(_) => Error::OutOfRange {
             name: key.into(),
             message: format!("must be < {}", usize::MAX),
         },
-        FloatError::NotANumber(_) => Error::RangeError {
+        FloatError::NotANumber(_) => Error::OutOfRange {
             name: key.into(),
             message: "not a valid number".to_string(),
         },
@@ -25,11 +25,11 @@ impl JsQuery {
         let limit = query_obj
             .get_opt::<JsNumber, _, _>(&mut cx, "_limit")?
             .map(|value| {
-                let limit = value.value(&mut cx) as u64;
-                if limit <= 0 {
+                let limit = value.value(&mut cx);
+                if limit <= 0.0 {
                     panic!("Limit must be a positive integer");
                 }
-                limit
+                limit as u64
             });
         let select = query_obj
             .get_opt::<JsArray, _, _>(&mut cx, "_select")?

@@ -73,7 +73,7 @@ impl JsQuery {

         rt.spawn(async move {
             let mut builder = table
-                .search(query.map(|q| Float32Array::from(q)))
+                .search(query.map(Float32Array::from))
                 .refine_factor(refine_factor)
                 .nprobes(nprobes)
                 .filter(filter)
@@ -45,7 +45,7 @@ impl JsTable {
         let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
         let buffer = cx.argument::<JsBuffer>(1)?;
         let (batches, schema) =
-            arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
+            arrow_buffer_to_record_batch(buffer.as_slice(&cx)).or_throw(&mut cx)?;

         // Write mode
         let mode = match cx.argument::<JsString>(2)?.value(&mut cx).as_str() {

@@ -93,7 +93,7 @@ impl JsTable {
         let buffer = cx.argument::<JsBuffer>(0)?;
         let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
         let (batches, schema) =
-            arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
+            arrow_buffer_to_record_batch(buffer.as_slice(&cx)).or_throw(&mut cx)?;
         let rt = runtime(&mut cx)?;
         let channel = cx.channel();
         let mut table = js_table.table.clone();

@@ -186,7 +186,7 @@ impl JsTable {
                 .downcast_or_throw::<JsString, _>(&mut cx)?;

             let value = updates_arg
-                .get_value(&mut cx, property.clone())?
+                .get_value(&mut cx, property)?
                 .downcast_or_throw::<JsString, _>(&mut cx)?;

             let property = property.value(&mut cx);

@@ -216,7 +216,7 @@ impl JsTable {
                 .map(|(k, v)| (k.as_str(), v.as_str()))
                 .collect::<Vec<_>>();

-            let predicate = predicate.as_ref().map(|s| s.as_str());
+            let predicate = predicate.as_deref();

             let update_result = table.update(predicate, updates_arg).await;
             deferred.settle_with(&channel, move |mut cx| {
@@ -1,6 +1,6 @@
 [package]
 name = "vectordb"
-version = "0.3.9"
+version = "0.4.0"
 edition = "2021"
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license = "Apache-2.0"
@@ -26,7 +26,7 @@ use futures::{stream::BoxStream, FutureExt, StreamExt};
 use lance::io::object_store::WrappingObjectStore;
 use object_store::{
     path::Path, Error, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore,
-    Result,
+    PutOptions, PutResult, Result,
 };

 use async_trait::async_trait;

@@ -72,13 +72,28 @@ impl PrimaryOnly for Path {
 /// Note: this object store does not mirror writes to *.manifest files
 #[async_trait]
 impl ObjectStore for MirroringObjectStore {
-    async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> {
+    async fn put(&self, location: &Path, bytes: Bytes) -> Result<PutResult> {
         if location.primary_only() {
             self.primary.put(location, bytes).await
         } else {
             self.secondary.put(location, bytes.clone()).await?;
-            self.primary.put(location, bytes).await?;
-            Ok(())
+            self.primary.put(location, bytes).await
+        }
+    }
+
+    async fn put_opts(
+        &self,
+        location: &Path,
+        bytes: Bytes,
+        options: PutOptions,
+    ) -> Result<PutResult> {
+        if location.primary_only() {
+            self.primary.put_opts(location, bytes, options).await
+        } else {
+            self.secondary
+                .put_opts(location, bytes.clone(), options.clone())
+                .await?;
+            self.primary.put_opts(location, bytes, options).await
         }
     }

@@ -129,8 +144,8 @@ impl ObjectStore for MirroringObjectStore {
         self.primary.delete(location).await
     }

-    async fn list(&self, prefix: Option<&Path>) -> Result<BoxStream<'_, Result<ObjectMeta>>> {
-        self.primary.list(prefix).await
+    fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result<ObjectMeta>> {
+        self.primary.list(prefix)
     }

     async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result<ListResult> {

@@ -359,7 +374,9 @@ mod test {
         assert_eq!(t.count_rows().await.unwrap(), 100);

         let q = t
-            .search(Some(PrimitiveArray::from_iter_values(vec![0.1, 0.1, 0.1, 0.1])))
+            .search(Some(PrimitiveArray::from_iter_values(vec![
+                0.1, 0.1, 0.1, 0.1,
+            ])))
             .limit(10)
             .execute()
             .await
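The put/put_opts hunks preserve one invariant: manifest writes go to the primary store only, while everything else is written to the secondary mirror first and then to the primary. A hedged Python sketch of that rule (the store objects and method names are hypothetical; the real code is the Rust ObjectStore impl above):

def mirrored_put(primary, secondary, location: str, data: bytes):
    # *.manifest files are never mirrored: write them to the primary only
    if location.endswith(".manifest"):
        return primary.put(location, data)
    # otherwise write the mirror first, then the primary, so a failed
    # secondary write never leaves the primary ahead of the mirror
    secondary.put(location, data)
    return primary.put(location, data)
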
@@ -25,6 +25,7 @@ use crate::error::Result;
 pub struct Query {
     pub dataset: Arc<Dataset>,
     pub query_vector: Option<Float32Array>,
+    pub column: String,
     pub limit: Option<usize>,
     pub filter: Option<String>,
     pub select: Option<Vec<String>>,

@@ -50,6 +51,7 @@ impl Query {
         Query {
             dataset,
             query_vector: vector,
+            column: crate::table::VECTOR_COLUMN_NAME.to_string(),
             limit: None,
             nprobes: 20,
             refine_factor: None,

@@ -71,7 +73,7 @@ impl Query {

         if let Some(query) = self.query_vector.as_ref() {
             // If there is a vector query, default to limit=10 if unspecified
-            scanner.nearest(crate::table::VECTOR_COLUMN_NAME, query, self.limit.unwrap_or(10))?;
+            scanner.nearest(&self.column, query, self.limit.unwrap_or(10))?;
         } else {
             // If there is no vector query, it's ok to not have a limit
             scanner.limit(self.limit.map(|limit| limit as i64), None)?;

@@ -87,6 +89,16 @@ impl Query {
         Ok(scanner.try_into_stream().await?)
     }

+    /// Set the column to query
+    ///
+    /// # Arguments
+    ///
+    /// * `column` - The column name
+    pub fn column(mut self, column: &str) -> Query {
+        self.column = column.into();
+        self
+    }
+
     /// Set the maximum number of results to return.
     ///
     /// # Arguments

@@ -176,7 +188,10 @@ mod tests {
     use std::sync::Arc;

     use super::*;
-    use arrow_array::{Float32Array, RecordBatch, RecordBatchIterator, RecordBatchReader, cast::AsArray, Int32Array};
+    use arrow_array::{
+        cast::AsArray, Float32Array, Int32Array, RecordBatch, RecordBatchIterator,
+        RecordBatchReader,
+    };
     use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
     use futures::StreamExt;
     use lance::dataset::Dataset;

@@ -260,7 +275,7 @@ mod tests {
         let mut stream = result.expect("should have result");
         // should only have one batch
         while let Some(batch) = stream.next().await {
             let b = batch.expect("should be Ok");
             // cast arr into Int32Array
             let arr: &Int32Array = b["id"].as_primitive();
             assert!(arr.iter().all(|x| x.unwrap() % 2 == 0));