Mirror of https://github.com/lancedb/lancedb.git (synced 2025-12-23 13:29:57 +00:00)

Compare commits: python-v0. ... python-v0. (30 commits)
Commits in this comparison:

a9897d9d85, acda7a4589, dac0857745, 0a9e1eab75, d999d72c8d, de4720993e,
6c14a307e2, 43747278c8, e5f42a850e, 7920ecf66e, 28e1b70e4b, 52b79d2b1e,
c05d45150d, 48ed3bb544, bcfc93cc88, 214d0debf5, f059372137, 3dc1803c07,
d0501f65f1, 4703cc6894, 493f9ce467, 5c759505b8, bb6a39727e, d57bed90e5,
648327e90c, 6c7e81ee57, 905e9d4738, 38642e349c, 6879861ea8, 88325e488e
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.15.0-beta.0"
+current_version = "0.15.1-beta.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
@@ -52,12 +52,7 @@ runs:
       args: ${{ inputs.args }}
       before-script-linux: |
         set -e
-        apt install -y unzip
-        if [ $(uname -m) = "x86_64" ]; then
-          PROTOC_ARCH="x86_64"
-        else
-          PROTOC_ARCH="aarch_64"
-        fi
-        curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$PROTOC_ARCH.zip > /tmp/protoc.zip \
+        yum install -y openssl-devel clang \
+        && curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-aarch_64.zip > /tmp/protoc.zip \
        && unzip /tmp/protoc.zip -d /usr/local \
        && rm /tmp/protoc.zip
.github/workflows/pypi-publish.yml (vendored, 14 changed lines)

@@ -15,15 +15,21 @@ jobs:
       - platform: x86_64
         manylinux: "2_17"
         extra_args: ""
+        runner: ubuntu-22.04
       - platform: x86_64
         manylinux: "2_28"
         extra_args: "--features fp16kernels"
+        runner: ubuntu-22.04
       - platform: aarch64
-        manylinux: "2_24"
+        manylinux: "2_17"
         extra_args: ""
-        # We don't build fp16 kernels for aarch64, because it uses
-        # cross compilation image, which doesn't have a new enough compiler.
-    runs-on: "ubuntu-22.04"
+        # For successful fat LTO builds, we need a large runner to avoid OOM errors.
+        runner: ubuntu-2404-8x-arm64
+      - platform: aarch64
+        manylinux: "2_28"
+        extra_args: "--features fp16kernels"
+        runner: ubuntu-2404-8x-arm64
+    runs-on: ${{ matrix.config.runner }}
     steps:
       - uses: actions/checkout@v4
         with:
Cargo.toml (18 changed lines)

@@ -21,14 +21,16 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"
 
 [workspace.dependencies]
-lance = { "version" = "=0.22.0", "features" = ["dynamodb"] }
-lance-io = "=0.22.0"
-lance-index = "=0.22.0"
-lance-linalg = "=0.22.0"
-lance-table = "=0.22.0"
-lance-testing = "=0.22.0"
-lance-datafusion = "=0.22.0"
-lance-encoding = "=0.22.0"
+lance = { "version" = "=0.23.0", "features" = [
+    "dynamodb",
+], git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
+lance-io = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
+lance-index = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
+lance-linalg = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
+lance-table = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
+lance-testing = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
+lance-datafusion = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
+lance-encoding = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.3" }
 # Note that this one does not include pyarrow
 arrow = { version = "53.2", optional = false }
 arrow-array = "53.2"
@@ -40,37 +40,4 @@ The [quickstart](../basic.md) contains a more complete example.
 
-## Development
-
-```sh
-npm run build
-npm run test
-```
-
-### Running lint / format
-
-LanceDb uses [biome](https://biomejs.dev/) for linting and formatting. if you are using VSCode you will need to install the official [Biome](https://marketplace.visualstudio.com/items?itemName=biomejs.biome) extension.
-To manually lint your code you can run:
-
-```sh
-npm run lint
-```
-
-to automatically fix all fixable issues:
-
-```sh
-npm run lint-fix
-```
-
-If you do not have your workspace root set to the `nodejs` directory, unfortunately the extension will not work. You can still run the linting and formatting commands manually.
-
-### Generating docs
-
-```sh
-npm run docs
-
-cd ../docs
-# Asssume the virtual environment was created
-# python3 -m venv venv
-# pip install -r requirements.txt
-. ./venv/bin/activate
-mkdocs build
-```
+See [CONTRIBUTING.md](_media/CONTRIBUTING.md) for information on how to contribute to LanceDB.
docs/src/js/_media/CONTRIBUTING.md (new file, 76 lines)

@@ -0,0 +1,76 @@
# Contributing to LanceDB Typescript

This document outlines the process for contributing to LanceDB Typescript.
For general contribution guidelines, see [CONTRIBUTING.md](../CONTRIBUTING.md).

## Project layout

The Typescript package is a wrapper around the Rust library, `lancedb`. We use
the [napi-rs](https://napi.rs/) library to create the bindings between Rust and
Typescript.

* `src/`: Rust bindings source code
* `lancedb/`: Typescript package source code
* `__test__/`: Unit tests
* `examples/`: An npm package with the examples shown in the documentation

## Development environment

To set up your development environment, you will need to install the following:

1. Node.js 14 or later
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)

Initial setup:

```shell
npm install
```

### Commit Hooks

It is **highly recommended** to install the [pre-commit](https://pre-commit.com/) hooks to ensure that your
code is formatted correctly and passes basic checks before committing:

```shell
pre-commit install
```

## Development

Most common development commands can be run using the npm scripts.

Build the package:

```shell
npm install
npm run build
```

Lint:

```shell
npm run lint
```

Format and fix lints:

```shell
npm run lint-fix
```

Run tests:

```shell
npm test
```

To run a single test:

```shell
# Single file: table.test.ts
npm test -- table.test.ts
# Single test: 'merge insert' in table.test.ts
npm test -- table.test.ts --testNamePattern=merge\ insert
```
@@ -317,6 +317,32 @@ then call ``cleanup_files`` to remove the old files.
 
 ***
 
+### dropIndex()
+
+```ts
+abstract dropIndex(name): Promise<void>
+```
+
+Drop an index from the table.
+
+#### Parameters
+
+* **name**: `string`
+    The name of the index.
+
+#### Returns
+
+`Promise`<`void`>
+
+#### Note
+
+This does not delete the index from disk, it just removes it from the table.
+To delete the index, run [Table#optimize](Table.md#optimize) after dropping the index.
+
+Use [Table.listIndices](Table.md#listindices) to find the names of the indices.
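For reference, using the new `dropIndex()` together with `optimize()` might look like this (a minimal sketch; the connection path and table name are placeholders, and `listIndices()` is used to find the index name as the note above suggests):

```ts
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("data/sample-lancedb"); // placeholder path
const tbl = await db.openTable("my_table"); // placeholder table name
// Look up index names, then drop one.
const indices = await tbl.listIndices();
await tbl.dropIndex(indices[0].name);
// dropIndex only removes the index from the table;
// optimize() reclaims the index files on disk.
await tbl.optimize();
```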
 ***
 
 ### indexStats()
 
 ```ts

@@ -336,6 +362,8 @@ List all the stats of a specified index
 
 The stats of the index. If the index does not exist, it will return undefined
 
+Use [Table.listIndices](Table.md#listindices) to find the names of the indices.
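A quick sketch of checking stats for an index (the index name is a placeholder; the shape of the returned object is not spelled out in this diff):

```ts
const stats = await tbl.indexStats("my_index"); // hypothetical index name
if (stats === undefined) {
  console.log("no index with that name");
} else {
  console.log(stats); // e.g. how much of the table the index covers
}
```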
 ***
 
 ### isOpen()

@@ -128,6 +128,24 @@ whose data type is a fixed-size-list of floats.
 
 ***
 
+### distanceRange()
+
+```ts
+distanceRange(lowerBound?, upperBound?): VectorQuery
+```
+
+#### Parameters
+
+* **lowerBound?**: `number`
+
+* **upperBound?**: `number`
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
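A minimal sketch of `distanceRange()` in a query chain (the vector values and bounds are placeholders; per the matching Python docs later in this diff, only rows with distances in [lowerBound, upperBound) are kept):

```ts
const results = await tbl
  .query()
  .nearestTo([0.1, 0.2, 0.3, 0.4]) // placeholder query vector
  .distanceRange(0.0, 0.5) // keep neighbors with distance in [0.0, 0.5)
  .toArray();
```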
 ***
 
 ### distanceType()
 
 ```ts

@@ -528,6 +546,22 @@ distance between the query vector and the actual uncompressed vector.
 
 ***
 
+### rerank()
+
+```ts
+rerank(reranker): VectorQuery
+```
+
+#### Parameters
+
+* **reranker**: [`Reranker`](../namespaces/rerankers/interfaces/Reranker.md)
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
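For reference, wiring a reranker into a vector query might look like this (a sketch; it assumes `RRFReranker` is reachable through the `rerankers` namespace documented below, and `queryVector` is a placeholder `number[]`):

```ts
import { rerankers } from "@lancedb/lancedb";

const reranker = await rerankers.RRFReranker.create();
const results = await tbl
  .query()
  .nearestTo(queryVector)
  .rerank(reranker)
  .toArray();
```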
 ***
 
 ### select()
 
 ```ts

@@ -7,6 +7,7 @@
 ## Namespaces
 
 - [embedding](namespaces/embedding/README.md)
+- [rerankers](namespaces/rerankers/README.md)
 
 ## Enumerations
@@ -68,6 +68,21 @@ The default value is 50.
 
 ***
 
+### numBits?
+
+```ts
+optional numBits: number;
+```
+
+Number of bits per sub-vector.
+
+This value controls how much each subvector is compressed. The more bits, the more
+accurate the index will be, but the slower the search. The default is 8 bits.
+
+The number of bits must be 4 or 8.
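As a sketch of where `numBits` fits, an IVF-PQ index creation call might look like the following (assuming the package's `Index.ivfPq` helper; the parameter values are placeholders, not recommendations):

```ts
import { Index } from "@lancedb/lancedb";

await tbl.createIndex("vector", {
  config: Index.ivfPq({
    numPartitions: 256,
    numSubVectors: 16,
    numBits: 8, // 4 would compress further at some cost in accuracy
  }),
});
```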
 ***
 
 ### numPartitions?
 
 ```ts

docs/src/js/namespaces/rerankers/README.md (new file, 17 lines)

@@ -0,0 +1,17 @@
+[**@lancedb/lancedb**](../../README.md) • **Docs**
+
+***
+
+[@lancedb/lancedb](../../globals.md) / rerankers
+
+# rerankers
+
+## Index
+
+### Classes
+
+- [RRFReranker](classes/RRFReranker.md)
+
+### Interfaces
+
+- [Reranker](interfaces/Reranker.md)
docs/src/js/namespaces/rerankers/classes/RRFReranker.md (new file, 66 lines)

@@ -0,0 +1,66 @@
+[**@lancedb/lancedb**](../../../README.md) • **Docs**
+
+***
+
+[@lancedb/lancedb](../../../globals.md) / [rerankers](../README.md) / RRFReranker
+
+# Class: RRFReranker
+
+Reranks the results using the Reciprocal Rank Fusion (RRF) algorithm.
+
+Internally this uses the Rust implementation.
+
+## Constructors
+
+### new RRFReranker()
+
+```ts
+new RRFReranker(inner): RRFReranker
+```
+
+#### Parameters
+
+* **inner**: `RrfReranker`
+
+#### Returns
+
+[`RRFReranker`](RRFReranker.md)
+
+## Methods
+
+### rerankHybrid()
+
+```ts
+rerankHybrid(
+   query,
+   vecResults,
+   ftsResults): Promise<RecordBatch<any>>
+```
+
+#### Parameters
+
+* **query**: `string`
+
+* **vecResults**: `RecordBatch`<`any`>
+
+* **ftsResults**: `RecordBatch`<`any`>
+
+#### Returns
+
+`Promise`<`RecordBatch`<`any`>>
+
+***
+
+### create()
+
+```ts
+static create(k): Promise<RRFReranker>
+```
+
+#### Parameters
+
+* **k**: `number` = `60`
+
+#### Returns
+
+`Promise`<[`RRFReranker`](RRFReranker.md)>
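A short usage sketch (the query string and smaller `k` are placeholders; `vecResults` and `ftsResults` stand in for `RecordBatch`es produced by a vector and a full-text query; per the signature above, `create()` defaults `k` to 60):

```ts
import { rerankers } from "@lancedb/lancedb";

// k controls how quickly RRF discounts lower-ranked results.
const reranker = await rerankers.RRFReranker.create(30);
// Fuse vector and full-text hits for the same query string.
const fused = await reranker.rerankHybrid("tiger", vecResults, ftsResults);
```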
docs/src/js/namespaces/rerankers/interfaces/Reranker.md (new file, 30 lines)

@@ -0,0 +1,30 @@
+[**@lancedb/lancedb**](../../../README.md) • **Docs**
+
+***
+
+[@lancedb/lancedb](../../../globals.md) / [rerankers](../README.md) / Reranker
+
+# Interface: Reranker
+
+## Methods
+
+### rerankHybrid()
+
+```ts
+rerankHybrid(
+   query,
+   vecResults,
+   ftsResults): Promise<RecordBatch<any>>
+```
+
+#### Parameters
+
+* **query**: `string`
+
+* **vecResults**: `RecordBatch`<`any`>
+
+* **ftsResults**: `RecordBatch`<`any`>
+
+#### Returns
+
+`Promise`<`RecordBatch`<`any`>>
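To satisfy this interface, a class only needs a matching `rerankHybrid()`. A deliberately trivial sketch (not a useful ranking policy):

```ts
import { RecordBatch } from "apache-arrow";

class VectorOnlyReranker {
  // Satisfies the Reranker shape by simply ignoring the FTS hits.
  async rerankHybrid(
    query: string,
    vecResults: RecordBatch,
    ftsResults: RecordBatch,
  ): Promise<RecordBatch> {
    return vecResults;
  }
}
```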
@@ -114,14 +114,17 @@
     }
    ],
    "source": [
-    "data = [\n",
-    "    {\"vector\": [1.1, 1.2], \"lat\": 45.5, \"long\": -122.7},\n",
-    "    {\"vector\": [0.2, 1.8], \"lat\": 40.1, \"long\": -74.1},\n",
-    "]\n",
-    "\n",
-    "db.create_table(\"table2\", data)\n",
-    "\n",
-    "db[\"table2\"].head() "
+    "import pandas as pd\n",
+    "\n",
+    "data = pd.DataFrame(\n",
+    "    {\n",
+    "        \"vector\": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],\n",
+    "        \"lat\": [45.5, 40.1],\n",
+    "        \"long\": [-122.7, -74.1],\n",
+    "    }\n",
+    ")\n",
+    "db.create_table(\"my_table_pandas\", data)\n",
+    "db[\"my_table_pandas\"].head()"
    ]
   },
   {

@@ -164,7 +167,7 @@
     "import pyarrow as pa\n",
     "\n",
     "custom_schema = pa.schema([\n",
-    "pa.field(\"vector\", pa.list_(pa.float32(), 2)),\n",
+    "pa.field(\"vector\", pa.list_(pa.float32(), 4)),\n",
     "pa.field(\"lat\", pa.float32()),\n",
     "pa.field(\"long\", pa.float32())\n",
     "])\n",
@@ -147,8 +147,19 @@ to return the entire (typically filtered) table. Vector searches return the
 rows nearest to a query vector and can be created with the
 [AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search] method.
 
-::: lancedb.query.AsyncQueryBase
-
 ::: lancedb.query.AsyncQuery
+    options:
+      inherited_members: true
 
 ::: lancedb.query.AsyncVectorQuery
+    options:
+      inherited_members: true
+
+::: lancedb.query.AsyncFTSQuery
+    options:
+      inherited_members: true
+
+::: lancedb.query.AsyncHybridQuery
+    options:
+      inherited_members: true
@@ -149,6 +149,7 @@ You can index on a column with multivector type and search on it, the query can
 where `sim` is the similarity function (e.g. cosine).
 
 For now, only `cosine` metric is supported for multivector search.
+The vector value type can be `float16`, `float32` or `float64`.
 
 === "Python"
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.15.0-beta.0</version>
+    <version>0.15.1-beta.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

@@ -6,7 +6,7 @@
 
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.15.0-beta.0</version>
+  <version>0.15.1-beta.0</version>
   <packaging>pom</packaging>
 
   <name>LanceDB Parent</name>
node/package-lock.json (generated, 124 changed lines)

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.15.0-beta.0",
+  "version": "0.15.1-beta.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.15.0-beta.0",
+      "version": "0.15.1-beta.0",
       "cpu": [
         "x64",
         "arm64"

@@ -52,14 +52,14 @@
     "uuid": "^9.0.0"
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-arm64": "0.15.0-beta.0",
-    "@lancedb/vectordb-darwin-x64": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-arm64-musl": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-x64-gnu": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-x64-musl": "0.15.0-beta.0",
-    "@lancedb/vectordb-win32-arm64-msvc": "0.15.0-beta.0",
-    "@lancedb/vectordb-win32-x64-msvc": "0.15.0-beta.0"
+    "@lancedb/vectordb-darwin-arm64": "0.15.1-beta.0",
+    "@lancedb/vectordb-darwin-x64": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-musl": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-x64-gnu": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-x64-musl": "0.15.1-beta.0",
+    "@lancedb/vectordb-win32-arm64-msvc": "0.15.1-beta.0",
+    "@lancedb/vectordb-win32-x64-msvc": "0.15.1-beta.0"
   },
   "peerDependencies": {
     "@apache-arrow/ts": "^14.0.2",

@@ -329,110 +329,6 @@
(Removes the eight generated `node_modules/@lancedb/vectordb-*` lockfile entries — darwin-arm64, darwin-x64, linux-arm64-gnu, linux-arm64-musl, linux-x64-gnu, linux-x64-musl, win32-arm64-msvc, win32-x64-msvc — each pinned at 0.15.0-beta.0 with its registry "resolved" URL, sha512 "integrity" hash, Apache-2.0 license, and cpu/os fields; context resumes at "node_modules/@neon-rs/cli" version 0.0.160.)
node/package.json

@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.15.0-beta.0",
+  "version": "0.15.1-beta.0",
   "description": " Serverless, low-latency vector database for AI applications",
   "private": false,
   "main": "dist/index.js",

@@ -92,13 +92,13 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.15.0-beta.0",
-    "@lancedb/vectordb-darwin-arm64": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-x64-gnu": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-x64-musl": "0.15.0-beta.0",
-    "@lancedb/vectordb-linux-arm64-musl": "0.15.0-beta.0",
-    "@lancedb/vectordb-win32-x64-msvc": "0.15.0-beta.0",
-    "@lancedb/vectordb-win32-arm64-msvc": "0.15.0-beta.0"
+    "@lancedb/vectordb-darwin-x64": "0.15.1-beta.0",
+    "@lancedb/vectordb-darwin-arm64": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-x64-gnu": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-x64-musl": "0.15.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-musl": "0.15.1-beta.0",
+    "@lancedb/vectordb-win32-x64-msvc": "0.15.1-beta.0",
+    "@lancedb/vectordb-win32-arm64-msvc": "0.15.1-beta.0"
   }
 }
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.15.0-beta.0"
+version = "0.15.1-beta.0"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
@@ -83,6 +83,74 @@ describe("embedding functions", () => {
     expect(vector0).toEqual([1, 2, 3]);
   });
 
+  it("should be able to append and upsert using embedding function", async () => {
+    @register()
+    class MockEmbeddingFunction extends EmbeddingFunction<string> {
+      toJSON(): object {
+        return {};
+      }
+      ndims() {
+        return 3;
+      }
+      embeddingDataType(): Float {
+        return new Float32();
+      }
+      async computeQueryEmbeddings(_data: string) {
+        return [1, 2, 3];
+      }
+      async computeSourceEmbeddings(data: string[]) {
+        return Array.from({ length: data.length }).fill([
+          1, 2, 3,
+        ]) as number[][];
+      }
+    }
+    const func = new MockEmbeddingFunction();
+    const db = await connect(tmpDir.name);
+    const table = await db.createTable(
+      "test",
+      [
+        { id: 1, text: "hello" },
+        { id: 2, text: "world" },
+      ],
+      {
+        embeddingFunction: {
+          function: func,
+          sourceColumn: "text",
+        },
+      },
+    );
+
+    const schema = await table.schema();
+    expect(schema.metadata.get("embedding_functions")).toBeDefined();
+
+    // Append some new data
+    const data1 = [
+      { id: 3, text: "forest" },
+      { id: 4, text: "mountain" },
+    ];
+    await table.add(data1);
+
+    // Upsert some data
+    const data2 = [
+      { id: 5, text: "river" },
+      { id: 2, text: "canyon" },
+    ];
+    await table
+      .mergeInsert("id")
+      .whenMatchedUpdateAll()
+      .whenNotMatchedInsertAll()
+      .execute(data2);
+
+    const rows = await table.query().toArray();
+    rows.sort((a, b) => a.id - b.id);
+    const texts = rows.map((row) => row.text);
+    expect(texts).toEqual(["hello", "canyon", "forest", "mountain", "river"]);
+    const vectorsDefined = rows.map(
+      (row) => row.vector !== undefined && row.vector !== null,
+    );
+    expect(vectorsDefined).toEqual(new Array(5).fill(true));
+  });
+
   it("should be able to create an empty table with an embedding function", async () => {
     @register()
     class MockEmbeddingFunction extends EmbeddingFunction<string> {
@@ -473,6 +473,10 @@ describe("When creating an index", () => {
     // test offset
     rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
     expect(rst.numRows).toBe(1);
 
+    await tbl.dropIndex("vec_idx");
+    const indices2 = await tbl.listIndices();
+    expect(indices2.length).toBe(0);
   });
 
   it("should search with distance range", async () => {
@@ -609,6 +609,14 @@ async function applyEmbeddings<T>(
     return table;
   }
 
+  let schemaMetadata = schema?.metadata || new Map<string, string>();
+
+  if (!(embeddings == null || embeddings === undefined)) {
+    const registry = getRegistry();
+    const embeddingMetadata = registry.getTableMetadata([embeddings]);
+    schemaMetadata = new Map([...schemaMetadata, ...embeddingMetadata]);
+  }
+
   // Convert from ArrowTable to Record<String, Vector>
   const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
     const name = table.schema.fields[idx].name;

@@ -677,15 +685,21 @@ async function applyEmbeddings<T>(
     newColumns[destColumn] = makeVector(vectors, destType);
   }
 
-  const newTable = new ArrowTable(newColumns);
+  let newTable = new ArrowTable(newColumns);
   if (schema != null) {
     if (schema.fields.find((f) => f.name === destColumn) === undefined) {
       throw new Error(
         `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
       );
     }
-    return alignTable(newTable, schema as Schema);
+    newTable = alignTable(newTable, schema as Schema);
   }
 
+  newTable = new ArrowTable(
+    new Schema(newTable.schema.fields, schemaMetadata),
+    newTable.batches,
+  );
+
   return newTable;
 }
@@ -1,13 +1,20 @@
-import { Data, fromDataToBuffer } from "./arrow";
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The LanceDB Authors
+import { Data, Schema, fromDataToBuffer } from "./arrow";
 import { NativeMergeInsertBuilder } from "./native";
 
 /** A builder used to create and run a merge insert operation */
 export class MergeInsertBuilder {
   #native: NativeMergeInsertBuilder;
+  #schema: Schema | Promise<Schema>;
 
   /** Construct a MergeInsertBuilder. __Internal use only.__ */
-  constructor(native: NativeMergeInsertBuilder) {
+  constructor(
+    native: NativeMergeInsertBuilder,
+    schema: Schema | Promise<Schema>,
+  ) {
     this.#native = native;
+    this.#schema = schema;
   }
 
   /**

@@ -35,6 +42,7 @@ export class MergeInsertBuilder {
   whenMatchedUpdateAll(options?: { where: string }): MergeInsertBuilder {
     return new MergeInsertBuilder(
       this.#native.whenMatchedUpdateAll(options?.where),
+      this.#schema,
     );
   }
   /**

@@ -42,7 +50,10 @@ export class MergeInsertBuilder {
    * be inserted into the target table.
    */
   whenNotMatchedInsertAll(): MergeInsertBuilder {
-    return new MergeInsertBuilder(this.#native.whenNotMatchedInsertAll());
+    return new MergeInsertBuilder(
+      this.#native.whenNotMatchedInsertAll(),
+      this.#schema,
+    );
   }
   /**
   * Rows that exist only in the target table (old data) will be

@@ -56,6 +67,7 @@ export class MergeInsertBuilder {
   }): MergeInsertBuilder {
     return new MergeInsertBuilder(
       this.#native.whenNotMatchedBySourceDelete(options?.where),
+      this.#schema,
     );
   }
   /**

@@ -64,7 +76,14 @@ export class MergeInsertBuilder {
    * Nothing is returned but the `Table` is updated
    */
   async execute(data: Data): Promise<void> {
-    const buffer = await fromDataToBuffer(data);
+    let schema: Schema;
+    if (this.#schema instanceof Promise) {
+      schema = await this.#schema;
+      this.#schema = schema; // In case of future calls
+    } else {
+      schema = this.#schema;
+    }
+    const buffer = await fromDataToBuffer(data, undefined, schema);
     await this.#native.execute(buffer);
   }
 }
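The effect of threading the schema through the builder appears to be that `execute()` can convert incoming rows against the table's schema. For context, a typical upsert built with this API (mirroring the test earlier in this diff; `tbl` is an open table keyed on `id`):

```ts
await tbl
  .mergeInsert("id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute([
    { id: 2, text: "canyon" }, // matches an existing row, so it is updated
    { id: 5, text: "river" }, // no match, so it is inserted
  ]);
```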
@@ -226,6 +226,19 @@ export abstract class Table {
     column: string,
     options?: Partial<IndexOptions>,
   ): Promise<void>;
 
+  /**
+   * Drop an index from the table.
+   *
+   * @param name The name of the index.
+   *
+   * @note This does not delete the index from disk, it just removes it from the table.
+   * To delete the index, run {@link Table#optimize} after dropping the index.
+   *
+   * Use {@link Table.listIndices} to find the names of the indices.
+   */
+  abstract dropIndex(name: string): Promise<void>;
+
   /**
    * Create a {@link Query} Builder.
    *

@@ -426,6 +439,8 @@ export abstract class Table {
   *
   * @param {string} name The name of the index.
   * @returns {IndexStatistics | undefined} The stats of the index. If the index does not exist, it will return undefined
+  *
+  * Use {@link Table.listIndices} to find the names of the indices.
   */
  abstract indexStats(name: string): Promise<IndexStatistics | undefined>;

@@ -505,14 +520,8 @@ export class LocalTable extends Table {
   async add(data: Data, options?: Partial<AddDataOptions>): Promise<void> {
     const mode = options?.mode ?? "append";
     const schema = await this.schema();
-    const registry = getRegistry();
-    const functions = await registry.parseFunctions(schema.metadata);
-
-    const buffer = await fromDataToBuffer(
-      data,
-      functions.values().next().value,
-      schema,
-    );
+    const buffer = await fromDataToBuffer(data, undefined, schema);
     await this.inner.add(buffer, mode);
   }

@@ -591,6 +600,10 @@ export class LocalTable extends Table {
     await this.inner.createIndex(nativeIndex, column, options?.replace);
   }
 
+  async dropIndex(name: string): Promise<void> {
+    await this.inner.dropIndex(name);
+  }
+
   query(): Query {
     return new Query(this.inner);
   }

@@ -714,7 +727,7 @@ export class LocalTable extends Table {
   }
   mergeInsert(on: string | string[]): MergeInsertBuilder {
     on = Array.isArray(on) ? on : [on];
-    return new MergeInsertBuilder(this.inner.mergeInsert(on));
+    return new MergeInsertBuilder(this.inner.mergeInsert(on), this.schema());
   }
 
   /**
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.15.0-beta.0",
+  "version": "0.15.1-beta.0",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

The same @@ -1,6 +1,6 @@ bump from "0.15.0-beta.0" to "0.15.1-beta.0" is applied to the package.json of each of the other platform packages: @lancedb/lancedb-darwin-x64, @lancedb/lancedb-linux-arm64-gnu, @lancedb/lancedb-linux-arm64-musl, @lancedb/lancedb-linux-x64-gnu, @lancedb/lancedb-linux-x64-musl, @lancedb/lancedb-win32-arm64-msvc, and @lancedb/lancedb-win32-x64-msvc.
nodejs/package-lock.json (generated, 4 changed lines)

@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.15.0-beta.0",
+  "version": "0.15.1-beta.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.15.0-beta.0",
+      "version": "0.15.1-beta.0",
       "cpu": [
         "x64",
         "arm64"
@@ -11,7 +11,7 @@
     "ann"
   ],
   "private": false,
-  "version": "0.15.0-beta.0",
+  "version": "0.15.1-beta.0",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
@@ -135,6 +135,14 @@ impl Table {
         builder.execute().await.default_error()
     }
 
+    #[napi(catch_unwind)]
+    pub async fn drop_index(&self, index_name: String) -> napi::Result<()> {
+        self.inner_ref()?
+            .drop_index(&index_name)
+            .await
+            .default_error()
+    }
+
     #[napi(catch_unwind)]
     pub async fn update(
         &self,
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.18.0"
+current_version = "0.18.1-beta.2"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.18.0"
+version = "0.18.1-beta.2"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -4,7 +4,7 @@ name = "lancedb"
 dynamic = ["version"]
 dependencies = [
     "deprecation",
-    "pylance==0.22.0",
+    "pylance==0.23.0b3",
     "tqdm>=4.27.0",
     "pydantic>=1.10",
     "packaging",

@@ -55,7 +55,7 @@ tests = [
     "tantivy",
     "pyarrow-stubs",
 ]
-dev = ["ruff", "pre-commit", "pyright"]
+dev = ["ruff", "pre-commit", "pyright", 'typing-extensions>=4.0.0; python_version < "3.11"']
 docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
 clip = ["torch", "pillow", "open-clip"]
 embeddings = [
@@ -84,11 +84,15 @@ class RecordBatchStream:
 class Query:
     def where(self, filter: str): ...
     def select(self, columns: Tuple[str, str]): ...
+    def select_columns(self, columns: List[str]): ...
     def limit(self, limit: int): ...
+    def offset(self, offset: int): ...
     def fast_search(self): ...
     def with_row_id(self): ...
     def postfilter(self): ...
     def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
+    def nearest_to_text(self, query: dict) -> FTSQuery: ...
-    async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...
+    async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
 
 class FTSQuery:
     def where(self, filter: str): ...

@@ -98,6 +102,8 @@ class FTSQuery:
     def fast_search(self): ...
     def with_row_id(self): ...
     def postfilter(self): ...
+    def get_query(self) -> str: ...
+    def add_query_vector(self, query_vec: pa.Array) -> None: ...
     def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
     async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
     async def explain_plan(self) -> str: ...
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional, Union
 
 import pyarrow as pa

@@ -12,17 +12,27 @@ class AsyncRecordBatchReader:
     Also allows access to the schema of the stream
     """
 
-    def __init__(self, inner: RecordBatchStream):
-        self.inner_ = inner
-
-    @property
-    def schema(self) -> pa.Schema:
+    def __init__(
+        self,
+        inner: Union[RecordBatchStream, pa.Table],
+        max_batch_length: Optional[int] = None,
+    ):
         """
-        Get the schema of the batches produced by the stream
-
-        Accessing the schema does not consume any data from the stream
+        Attributes
+        ----------
+        schema : pa.Schema
+            The schema of the batches produced by the stream.
+            Accessing the schema does not consume any data from the stream
         """
-        return self.inner_.schema()
+        if isinstance(inner, pa.Table):
+            self._inner = self._async_iter_from_table(inner, max_batch_length)
+            self.schema: pa.Schema = inner.schema
+        elif isinstance(inner, RecordBatchStream):
+            self._inner = inner
+            self.schema: pa.Schema = inner.schema
+        else:
+            raise TypeError("inner must be a RecordBatchStream or a Table")

@@ -38,7 +48,18 @@ class AsyncRecordBatchReader:
         return self
 
     async def __anext__(self) -> pa.RecordBatch:
-        next = await self.inner_.next()
-        if next is None:
-            raise StopAsyncIteration
-        return next
+        return await self._inner.__anext__()
+
+    @staticmethod
+    async def _async_iter_from_table(
+        table: pa.Table, max_batch_length: Optional[int] = None
+    ):
+        """
+        Create an AsyncRecordBatchReader from a Table
+
+        This is useful when you have a Table that you want to iterate
+        over asynchronously
+        """
+        batches = table.to_batches(max_chunksize=max_batch_length)
+        for batch in batches:
+            yield batch
@@ -20,6 +20,7 @@ import asyncio
 import deprecation
 import numpy as np
 import pyarrow as pa
+import pyarrow.compute as pc
 import pyarrow.fs as pa_fs
 import pydantic

@@ -31,6 +32,7 @@ from .rerankers.util import check_reranker_result
 from .util import safe_import_pandas, flatten_columns
 
 if TYPE_CHECKING:
+    import sys
     import PIL
     import polars as pl

@@ -42,6 +44,11 @@ if TYPE_CHECKING:
     from .pydantic import LanceModel
     from .table import Table
 
+    if sys.version_info >= (3, 11):
+        from typing import Self
+    else:
+        from typing_extensions import Self
+
 pd = safe_import_pandas()
@@ -498,7 +505,7 @@ class LanceQueryBuilder(ABC):
             "column": self._vector_column,
             "q": self._query,
             "k": self._limit,
-            "metric": self._metric,
+            "metric": self._distance_type,
             "nprobes": self._nprobes,
             "refine_factor": self._refine_factor,
             "use_index": self._use_index,

@@ -569,7 +576,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
     >>> db = lancedb.connect("./.lancedb")
     >>> table = db.create_table("my_table", data=data)
     >>> (table.search([0.4, 0.4])
-    ...    .metric("cosine")
+    ...    .distance_type("cosine")
     ...    .where("b < 10")
     ...    .select(["b", "vector"])
     ...    .limit(2)

@@ -589,7 +596,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
     ):
         super().__init__(table)
         self._query = query
-        self._metric = "L2"
+        self._distance_type = "L2"
         self._nprobes = 20
         self._lower_bound = None
         self._upper_bound = None

@@ -603,6 +610,9 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
     def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
         """Set the distance metric to use.
 
+        This is an alias for distance_type() and may be deprecated in the future.
+        Please use distance_type() instead.
+
         Parameters
         ----------
         metric: "L2" or "cosine" or "dot"

@@ -613,7 +623,32 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         LanceVectorQueryBuilder
             The LanceQueryBuilder object.
         """
-        self._metric = metric.lower()
-        return self
+        return self.distance_type(metric)
+
+    def distance_type(
+        self, distance_type: Literal["L2", "cosine", "dot"]
+    ) -> "LanceVectorQueryBuilder":
+        """Set the distance metric to use.
+
+        When performing a vector search we try and find the "nearest" vectors according
+        to some kind of distance metric. This parameter controls which distance metric
+        to use.
+
+        Note: if there is a vector index then the distance type used MUST match the
+        distance type used to train the vector index. If this is not done then the
+        results will be invalid.
+
+        Parameters
+        ----------
+        distance_type: "L2" or "cosine" or "dot"
+            The distance metric to use. By default "L2" is used.
+
+        Returns
+        -------
+        LanceVectorQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        self._distance_type = distance_type.lower()
+        return self
 
     def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:

@@ -738,7 +773,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
             filter=self._where,
             prefilter=self._prefilter,
             k=self._limit,
-            metric=self._metric,
+            metric=self._distance_type,
             columns=self._columns,
             nprobes=self._nprobes,
             lower_bound=self._lower_bound,

@@ -1071,7 +1106,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         self._reranker = RRFReranker()
         self._nprobes = None
         self._refine_factor = None
-        self._metric = None
+        self._distance_type = None
         self._phrase_query = False
 
     def _validate_query(self, query, vector=None, text=None):

@@ -1139,8 +1174,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         self._fts_query.with_row_id(True)
         if self._phrase_query:
             self._fts_query.phrase_query(True)
-        if self._metric:
-            self._vector_query.metric(self._metric)
+        if self._distance_type:
+            self._vector_query.metric(self._distance_type)
         if self._nprobes:
             self._vector_query.nprobes(self._nprobes)
         if self._refine_factor:
@@ -1183,18 +1218,52 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")
 
         # normalize the scores to be between 0 and 1, 0 being most relevant
-        vector_results = LanceHybridQueryBuilder._normalize_scores(
-            vector_results, "_distance"
-        )
+        # We check whether the results (vector and FTS) are empty, because when
+        # they are, they often are missing the _rowid column, which causes an error
+        if vector_results.num_rows > 0:
+            distance_i = vector_results.column_names.index("_distance")
+            original_distances = vector_results.column(distance_i)
+            original_distance_row_ids = vector_results.column("_rowid")
+            vector_results = vector_results.set_column(
+                distance_i,
+                vector_results.field(distance_i),
+                LanceHybridQueryBuilder._normalize_scores(original_distances),
+            )
 
         # In fts higher scores represent relevance. Not inverting them here as
         # rerankers might need to preserve this score to support `return_score="all"`
-        fts_results = LanceHybridQueryBuilder._normalize_scores(fts_results, "_score")
+        if fts_results.num_rows > 0:
+            score_i = fts_results.column_names.index("_score")
+            original_scores = fts_results.column(score_i)
+            original_score_row_ids = fts_results.column("_rowid")
+            fts_results = fts_results.set_column(
+                score_i,
+                fts_results.field(score_i),
+                LanceHybridQueryBuilder._normalize_scores(original_scores),
+            )
 
         results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)
 
         check_reranker_result(results)
 
+        if "_distance" in results.column_names:
+            # restore the original distances
+            indices = pc.index_in(
+                results["_rowid"], original_distance_row_ids, skip_nulls=True
+            )
+            original_distances = pc.take(original_distances, indices)
+            distance_i = results.column_names.index("_distance")
+            results = results.set_column(distance_i, "_distance", original_distances)
+
+        if "_score" in results.column_names:
+            # restore the original scores
+            indices = pc.index_in(
+                results["_rowid"], original_score_row_ids, skip_nulls=True
+            )
+            original_scores = pc.take(original_scores, indices)
+            score_i = results.column_names.index("_score")
+            results = results.set_column(score_i, "_score", original_scores)
+
         results = results.slice(length=limit)
 
         if not with_row_ids:

@@ -1224,28 +1293,23 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         return results
 
     @staticmethod
-    def _normalize_scores(results: pa.Table, column: str, invert=False):
-        if len(results) == 0:
-            return results
-        # Get the _score column from results
-        scores = results.column(column).to_numpy()
+    def _normalize_scores(scores: pa.Array, invert=False) -> pa.Array:
+        if len(scores) == 0:
+            return scores
         # normalize the scores by subtracting the min and dividing by the max
-        max, min = np.max(scores), np.min(scores)
-        if np.isclose(max, min):
-            rng = max
-        else:
-            rng = max - min
-        # If rng is 0 then min and max are both 0 and so we can leave the scores as is
-        if rng != 0:
-            scores = (scores - min) / rng
+        min, max = pc.min_max(scores).values()
+        rng = pc.subtract(max, min)
+
+        if not pc.equal(rng, pa.scalar(0.0)).as_py():
+            scores = pc.divide(pc.subtract(scores, min), rng)
+        elif not pc.equal(max, pa.scalar(0.0)).as_py():
+            # If rng is 0, then we at least want the scores to be 0
+            scores = pc.subtract(scores, min)
 
         if invert:
-            scores = 1 - scores
-        # replace the _score column with the ranks
-        _score_idx = results.column_names.index(column)
-        results = results.set_column(
-            _score_idx, column, pa.array(scores, type=pa.float32())
-        )
-        return results
+            scores = pc.subtract(1, scores)
+
+        return scores
 
     def rerank(
         self,
@@ -1350,6 +1414,9 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
     def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
         """Set the distance metric to use.
 
+        This is an alias for distance_type() and may be deprecated in the future.
+        Please use distance_type() instead.
+
         Parameters
         ----------
         metric: "L2" or "cosine" or "dot"

@@ -1360,7 +1427,32 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         LanceVectorQueryBuilder
             The LanceQueryBuilder object.
         """
-        self._metric = metric.lower()
-        return self
+        return self.distance_type(metric)
+
+    def distance_type(
+        self, distance_type: Literal["L2", "cosine", "dot"]
+    ) -> "LanceHybridQueryBuilder":
+        """Set the distance metric to use.
+
+        When performing a vector search we try and find the "nearest" vectors according
+        to some kind of distance metric. This parameter controls which distance metric
+        to use.
+
+        Note: if there is a vector index then the distance type used MUST match the
+        distance type used to train the vector index. If this is not done then the
+        results will be invalid.
+
+        Parameters
+        ----------
+        distance_type: "L2" or "cosine" or "dot"
+            The distance metric to use. By default "L2" is used.
+
+        Returns
+        -------
+        LanceVectorQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        self._distance_type = distance_type.lower()
+        return self
 
     def refine_factor(self, refine_factor: int) -> LanceHybridQueryBuilder:
@@ -1418,7 +1510,7 @@ class AsyncQueryBase(object):
         """
         self._inner = inner
 
-    def where(self, predicate: str) -> AsyncQuery:
+    def where(self, predicate: str) -> Self:
         """
         Only return rows matching the given predicate
 

@@ -1437,7 +1529,7 @@ class AsyncQueryBase(object):
         self._inner.where(predicate)
         return self
 
-    def select(self, columns: Union[List[str], dict[str, str]]) -> AsyncQuery:
+    def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
         """
         Return only the specified columns.
 

@@ -1475,7 +1567,7 @@ class AsyncQueryBase(object):
             raise TypeError("columns must be a list of column names or a dict")
         return self
 
-    def limit(self, limit: int) -> AsyncQuery:
+    def limit(self, limit: int) -> Self:
         """
         Set the maximum number of results to return.
 

@@ -1485,7 +1577,7 @@ class AsyncQueryBase(object):
         self._inner.limit(limit)
         return self
 
-    def offset(self, offset: int) -> AsyncQuery:
+    def offset(self, offset: int) -> Self:
         """
         Set the offset for the results.
 

@@ -1497,7 +1589,7 @@ class AsyncQueryBase(object):
         self._inner.offset(offset)
         return self
 
-    def fast_search(self) -> AsyncQuery:
+    def fast_search(self) -> Self:
         """
         Skip searching un-indexed data.
 

@@ -1511,14 +1603,14 @@ class AsyncQueryBase(object):
         self._inner.fast_search()
         return self
 
-    def with_row_id(self) -> AsyncQuery:
+    def with_row_id(self) -> Self:
         """
         Include the _rowid column in the results.
         """
         self._inner.with_row_id()
         return self
 
-    def postfilter(self) -> AsyncQuery:
+    def postfilter(self) -> Self:
         """
         If this is called then filtering will happen after the search instead of
         before.
@@ -1754,7 +1846,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
raise ValueError("query_vector can not be None")
|
||||
|
||||
if (
|
||||
isinstance(query_vector, list)
|
||||
isinstance(query_vector, (list, np.ndarray, pa.Array))
|
||||
and len(query_vector) > 0
|
||||
and isinstance(query_vector[0], (list, np.ndarray, pa.Array))
|
||||
):
|
||||
@@ -1807,8 +1899,8 @@ class AsyncFTSQuery(AsyncQueryBase):
|
||||
self._inner = inner
|
||||
self._reranker = None
|
||||
|
||||
def get_query(self):
|
||||
self._inner.get_query()
|
||||
def get_query(self) -> str:
|
||||
return self._inner.get_query()
|
||||
|
||||
def rerank(
|
||||
self,
|
||||
@@ -1891,29 +1983,18 @@ class AsyncFTSQuery(AsyncQueryBase):
|
||||
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
|
||||
)
|
||||
|
||||
async def to_arrow(self) -> pa.Table:
|
||||
results = await super().to_arrow()
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
) -> AsyncRecordBatchReader:
|
||||
reader = await super().to_batches()
|
||||
results = pa.Table.from_batches(await reader.read_all(), reader.schema)
|
||||
if self._reranker:
|
||||
results = self._reranker.rerank_fts(results)
|
||||
return results
|
||||
results = self._reranker.rerank_fts(self.get_query(), results)
|
||||
return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
|
||||
|
||||
|
||||
class AsyncVectorQuery(AsyncQueryBase):
|
||||
def __init__(self, inner: LanceVectorQuery):
|
||||
"""
|
||||
Construct an AsyncVectorQuery
|
||||
|
||||
This method is not intended to be called directly. Instead, create
|
||||
a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
|
||||
use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to]] to convert to
|
||||
a vector query. Or you can use
|
||||
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
|
||||
"""
|
||||
super().__init__(inner)
|
||||
self._inner = inner
|
||||
self._reranker = None
|
||||
|
||||
def column(self, column: str) -> AsyncVectorQuery:
|
||||
class AsyncVectorQueryBase:
|
||||
def column(self, column: str) -> Self:
|
||||
"""
|
||||
Set the vector column to query
|
||||
|
||||
@@ -1926,7 +2007,7 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
self._inner.column(column)
|
||||
return self
|
||||
|
||||
def nprobes(self, nprobes: int) -> AsyncVectorQuery:
|
||||
def nprobes(self, nprobes: int) -> Self:
|
||||
"""
|
||||
Set the number of partitions to search (probe)
|
||||
|
||||
@@ -1954,7 +2035,7 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
|
||||
def distance_range(
|
||||
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
|
||||
) -> AsyncVectorQuery:
|
||||
) -> Self:
|
||||
"""Set the distance range to use.
|
||||
|
||||
Only rows with distances within range [lower_bound, upper_bound)
|
||||
@@ -1975,7 +2056,7 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
self._inner.distance_range(lower_bound, upper_bound)
|
||||
return self
|
||||
|
||||
def ef(self, ef: int) -> AsyncVectorQuery:
|
||||
def ef(self, ef: int) -> Self:
|
||||
"""
|
||||
Set the number of candidates to consider during search
|
||||
|
||||
@@ -1990,7 +2071,7 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
self._inner.ef(ef)
|
||||
return self
|
||||
|
||||
def refine_factor(self, refine_factor: int) -> AsyncVectorQuery:
|
||||
def refine_factor(self, refine_factor: int) -> Self:
|
||||
"""
|
||||
A multiplier to control how many additional rows are taken during the refine
|
||||
step
|
||||
@@ -2026,7 +2107,7 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
self._inner.refine_factor(refine_factor)
|
||||
return self
|
||||
|
||||
def distance_type(self, distance_type: str) -> AsyncVectorQuery:
|
||||
def distance_type(self, distance_type: str) -> Self:
|
||||
"""
|
||||
Set the distance metric to use
|
||||
|
||||
@@ -2044,7 +2125,7 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
self._inner.distance_type(distance_type)
|
||||
return self
|
||||
|
||||
def bypass_vector_index(self) -> AsyncVectorQuery:
|
||||
def bypass_vector_index(self) -> Self:
|
||||
"""
|
||||
If this is called then any vector index is skipped
|
||||
|
||||
@@ -2057,6 +2138,23 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
self._inner.bypass_vector_index()
|
||||
return self

class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
    def __init__(self, inner: LanceVectorQuery):
        """
        Construct an AsyncVectorQuery

        This method is not intended to be called directly. Instead, create
        a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
        use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to] to convert to
        a vector query. Or you can use
        [AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
        """
        super().__init__(inner)
        self._inner = inner
        self._reranker = None
        self._query_string = None

    def rerank(
        self, reranker: Reranker = RRFReranker(), query_string: Optional[str] = None
    ) -> AsyncHybridQuery:
@@ -2065,6 +2163,11 @@ class AsyncVectorQuery(AsyncQueryBase):
        self._reranker = reranker

        if not self._query_string and not query_string:
            raise ValueError("query_string must be provided to rerank the results.")

        self._query_string = query_string

        return self

    def nearest_to_text(
@@ -2100,14 +2203,17 @@ class AsyncVectorQuery(AsyncQueryBase):
            self._inner.nearest_to_text({"query": query, "columns": columns})
        )

    async def to_arrow(self) -> pa.Table:
        results = await super().to_arrow()
    async def to_batches(
        self, *, max_batch_length: Optional[int] = None
    ) -> AsyncRecordBatchReader:
        reader = await super().to_batches()
        results = pa.Table.from_batches(await reader.read_all(), reader.schema)
        if self._reranker:
            results = self._reranker.rerank_vector(results)
        return results
            results = self._reranker.rerank_vector(self._query_string, results)
        return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
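For context, a hedged sketch of driving this new rerank-aware path on a vector query. It assumes a table tbl whose rows carry a text column for the reranker to score; names and values are illustrative.

from lancedb.rerankers import RRFReranker

async def rerank_example(tbl):
    # rerank the vector hits; query_string supplies the text side of the reranker
    reader = await (
        tbl.vector_search([0.1, 0.2])
        .rerank(RRFReranker(), query_string="dog")
        .to_batches()
    )
    batches = await reader.read_all()  # same read_all used in to_batches above
    return batches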


class AsyncHybridQuery(AsyncQueryBase):
class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
    """
    A query builder that performs hybrid vector and full text search.
    Results are combined and reranked based on the specified reranker.
@@ -2155,10 +2261,9 @@ class AsyncHybridQuery(AsyncQueryBase):

        return self

    async def to_batches(self):
        raise NotImplementedError("to_batches not yet supported on a hybrid query")

    async def to_arrow(self) -> pa.Table:
    async def to_batches(
        self, *, max_batch_length: Optional[int] = None
    ) -> AsyncRecordBatchReader:
        fts_query = AsyncFTSQuery(self._inner.to_fts_query())
        vec_query = AsyncVectorQuery(self._inner.to_vector_query())

@@ -2173,7 +2278,7 @@ class AsyncHybridQuery(AsyncQueryBase):
            vec_query.to_arrow(),
        )

        return LanceHybridQueryBuilder._combine_hybrid_results(
        result = LanceHybridQueryBuilder._combine_hybrid_results(
            fts_results=fts_results,
            vector_results=vector_results,
            norm=self._norm,
@@ -2183,6 +2288,8 @@ class AsyncHybridQuery(AsyncQueryBase):
            with_row_ids=with_row_ids,
        )

        return AsyncRecordBatchReader(result, max_batch_length=max_batch_length)

    async def explain_plan(self, verbose: Optional[bool] = False):
        """Return the execution plan for this query.
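Before the reranker changes below, a hedged sketch of what the now-supported hybrid to_batches enables end to end (table and query values are illustrative; the vector and FTS legs are gathered as shown in the hunk above, then combined):

async def hybrid_example(tbl):
    reader = await (
        tbl.query()
        .nearest_to([0.3, 0.3])
        .nearest_to_text("puppy")
        .to_batches(max_batch_length=128)
    )
    batches = await reader.read_all()
    return batches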
@@ -11,6 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict
from numpy import nan
import pyarrow as pa

@@ -95,43 +96,22 @@ class LinearCombinationReranker(Reranker):
                pa.array([nan] * len(vector_results), type=pa.float32()),
            )
            return results
        # sort both input tables on _rowid
        combined_list = []
        vector_list = vector_results.sort_by("_rowid").to_pylist()
        fts_list = fts_results.sort_by("_rowid").to_pylist()
        i, j = 0, 0
        while i < len(vector_list):
            if j >= len(fts_list):
                for vi in vector_list[i:]:
                    vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
                    combined_list.append(vi)
                break

            vi = vector_list[i]
            fj = fts_list[j]
            # invert the fts score from relevance to distance
            inverted_fts_score = self._invert_score(fj["_score"])
            if vi["_rowid"] == fj["_rowid"]:
                vi["_relevance_score"] = self._combine_score(
                    vi["_distance"], inverted_fts_score
                )
                vi["_score"] = fj["_score"]  # keep the original score
                combined_list.append(vi)
                i += 1
                j += 1
            elif vector_list[i]["_rowid"] < fts_list[j]["_rowid"]:
                vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
                combined_list.append(vi)
                i += 1
            else:
                fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
                combined_list.append(fj)
                j += 1
        if j < len(fts_list) - 1:
            for fj in fts_list[j:]:
                fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
                combined_list.append(fj)
        results = defaultdict()
        for vector_result in vector_results.to_pylist():
            results[vector_result["_rowid"]] = vector_result
        for fts_result in fts_results.to_pylist():
            row_id = fts_result["_rowid"]
            if row_id in results:
                results[row_id]["_score"] = fts_result["_score"]
            else:
                results[row_id] = fts_result

        combined_list = []
        for row_id, result in results.items():
            vector_score = self._invert_score(result.get("_distance", fill))
            fts_score = result.get("_score", fill)
            result["_relevance_score"] = self._combine_score(vector_score, fts_score)
            combined_list.append(result)

        relevance_score_schema = pa.schema(
            [
@@ -148,10 +128,10 @@ class LinearCombinationReranker(Reranker):
        tbl = self._keep_relevance_score(tbl)
        return tbl

    def _combine_score(self, score1, score2):
    def _combine_score(self, vector_score, fts_score):
        # these scores represent distance
        return 1 - (self.weight * score1 + (1 - self.weight) * score2)
        return 1 - (self.weight * vector_score + (1 - self.weight) * fts_score)

    def _invert_score(self, score: float):
    def _invert_score(self, dist: float):
        # Invert the score between relevance and distance
        return 1 - score
        return 1 - dist
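A quick numeric check of the combination formula above. The 0.7 weight and input scores are illustrative values, not defaults asserted by this diff:

weight = 0.7          # hypothetical vector-side weight
vector_score = 0.2    # distance-like: lower is better
fts_score = 0.5       # FTS relevance already inverted to a distance
relevance = 1 - (weight * vector_score + (1 - weight) * fts_score)
assert abs(relevance - 0.71) < 1e-9  # 1 - (0.14 + 0.15)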

@@ -586,6 +586,26 @@ class Table(ABC):
        """
        raise NotImplementedError

    def drop_index(self, name: str) -> None:
        """
        Drop an index from the table.

        Parameters
        ----------
        name: str
            The name of the index to drop.

        Notes
        -----
        This does not delete the index from disk; it just removes it from the table.
        To delete the index, run [optimize][lancedb.table.Table.optimize]
        after dropping the index.

        Use [list_indices][lancedb.table.Table.list_indices] to find the names of
        the indices.
        """
        raise NotImplementedError
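A hedged usage sketch for the new API; the index name is hypothetical, and list_indices reveals the real ones:

# a minimal sketch, assuming `table` already has an index named "vector_idx"
print(table.list_indices())
table.drop_index("vector_idx")
table.optimize()  # per the note above, this is what reclaims the index files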

    @abstractmethod
    def create_scalar_index(
        self,
@@ -1594,6 +1614,9 @@ class LanceTable(Table):
            )
        )

    def drop_index(self, name: str) -> None:
        return LOOP.run(self._table.drop_index(name))

    def create_scalar_index(
        self,
        column: str,

@@ -2716,6 +2739,26 @@ class AsyncTable:
            add_note(e, help_msg)
            raise e

    async def drop_index(self, name: str) -> None:
        """
        Drop an index from the table.

        Parameters
        ----------
        name: str
            The name of the index to drop.

        Notes
        -----
        This does not delete the index from disk; it just removes it from the table.
        To delete the index, run [optimize][lancedb.table.AsyncTable.optimize]
        after dropping the index.

        Use [list_indices][lancedb.table.AsyncTable.list_indices] to find the names
        of the indices.
        """
        await self._inner.drop_index(name)

    async def add(
        self,
        data: DATA,

@@ -3,6 +3,7 @@ import shutil
# --8<-- [start:imports]
import lancedb
import numpy as np
import pyarrow as pa
import pytest
# --8<-- [end:imports]

@@ -12,16 +13,32 @@ shutil.rmtree("data/binary_lancedb", ignore_errors=True)
def test_binary_vector():
    # --8<-- [start:sync_binary_vector]
    db = lancedb.connect("data/binary_lancedb")
    data = [
        {
            "id": i,
            "vector": np.random.randint(0, 256, size=16),
        }
        for i in range(1024)
    ]
    tbl = db.create_table("my_binary_vectors", data=data)
    query = np.random.randint(0, 256, size=16)
    tbl.search(query).metric("hamming").to_arrow()
    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            # for dim=256, lance stores every 8 bits in a byte
            # so the vector field should be a list of 256 / 8 = 32 bytes
            pa.field("vector", pa.list_(pa.uint8(), 32)),
        ]
    )
    tbl = db.create_table("my_binary_vectors", schema=schema)

    data = []
    for i in range(1024):
        vector = np.random.randint(0, 2, size=256)
        # pack the binary vector into bytes to save space
        packed_vector = np.packbits(vector)
        data.append(
            {
                "id": i,
                "vector": packed_vector,
            }
        )
    tbl.add(data)

    query = np.random.randint(0, 2, size=256)
    packed_query = np.packbits(query)
    tbl.search(packed_query).distance_type("hamming").to_arrow()
    # --8<-- [end:sync_binary_vector]
    db.drop_table("my_binary_vectors")
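The packing step is plain np.packbits: eight 0/1 values collapse into one uint8. A quick check with illustrative bits:

import numpy as np

bits = np.array([1, 0, 1, 1, 0, 0, 0, 1])
np.packbits(bits)  # array([177], dtype=uint8): 0b10110001 == 177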

@@ -30,15 +47,31 @@ def test_binary_vector():
async def test_binary_vector_async():
    # --8<-- [start:async_binary_vector]
    db = await lancedb.connect_async("data/binary_lancedb")
    data = [
        {
            "id": i,
            "vector": np.random.randint(0, 256, size=16),
        }
        for i in range(1024)
    ]
    tbl = await db.create_table("my_binary_vectors", data=data)
    query = np.random.randint(0, 256, size=16)
    await tbl.query().nearest_to(query).distance_type("hamming").to_arrow()
    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            # for dim=256, lance stores every 8 bits in a byte
            # so the vector field should be a list of 256 / 8 = 32 bytes
            pa.field("vector", pa.list_(pa.uint8(), 32)),
        ]
    )
    tbl = await db.create_table("my_binary_vectors", schema=schema)

    data = []
    for i in range(1024):
        vector = np.random.randint(0, 2, size=256)
        # pack the binary vector into bytes to save space
        packed_vector = np.packbits(vector)
        data.append(
            {
                "id": i,
                "vector": packed_vector,
            }
        )
    await tbl.add(data)

    query = np.random.randint(0, 2, size=256)
    packed_query = np.packbits(query)
    await tbl.query().nearest_to(packed_query).distance_type("hamming").to_arrow()
    # --8<-- [end:async_binary_vector]
    await db.drop_table("my_binary_vectors")

80 python/python/tests/docs/test_multivector.py Normal file
@@ -0,0 +1,80 @@
import shutil

from lancedb.index import IvfPq
import pytest

# --8<-- [start:imports]
import lancedb
import numpy as np
import pyarrow as pa
# --8<-- [end:imports]

shutil.rmtree("data/multivector_demo", ignore_errors=True)


def test_multivector():
    # --8<-- [start:sync_multivector]
    db = lancedb.connect("data/multivector_demo")
    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            # float16, float32, and float64 are supported
            pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
        ]
    )
    data = [
        {
            "id": i,
            "vector": np.random.random(size=(2, 256)).tolist(),
        }
        for i in range(1024)
    ]
    tbl = db.create_table("my_table", data=data, schema=schema)

    # only cosine similarity is supported for multi-vectors
    tbl.create_index(metric="cosine")

    # query with single vector
    query = np.random.random(256).astype(np.float16)
    tbl.search(query).to_arrow()

    # query with multiple vectors
    query = np.random.random(size=(2, 256))
    tbl.search(query).to_arrow()

    # --8<-- [end:sync_multivector]
    db.drop_table("my_table")


@pytest.mark.asyncio
async def test_multivector_async():
    # --8<-- [start:async_multivector]
    db = await lancedb.connect_async("data/multivector_demo")
    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            # float16, float32, and float64 are supported
            pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
        ]
    )
    data = [
        {
            "id": i,
            "vector": np.random.random(size=(2, 256)).tolist(),
        }
        for i in range(1024)
    ]
    tbl = await db.create_table("my_table", data=data, schema=schema)

    # only cosine similarity is supported for multi-vectors
    await tbl.create_index(column="vector", config=IvfPq(distance_type="cosine"))

    # query with single vector
    query = np.random.random(256)
    await tbl.query().nearest_to(query).to_arrow()

    # query with multiple vectors
    query = np.random.random(size=(2, 256))
    await tbl.query().nearest_to(query).to_arrow()

    # --8<-- [end:async_multivector]
    await db.drop_table("my_table")
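For intuition, multivector search scores a query against every stored vector in a row's list. A rough MaxSim-style sketch of that idea follows; this is an illustrative assumption about the scoring shape, not Lance's literal kernel:

import numpy as np

def maxsim(query_vecs, doc_vecs):
    # cosine similarity if both sides are L2-normalized
    sims = query_vecs @ doc_vecs.T
    # best-matching document vector per query vector, summed
    return sims.max(axis=1).sum()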

@@ -65,7 +65,7 @@ def test_vector_search():
    tbl.search(np.random.random((1536))).limit(10).to_list()
    # --8<-- [end:exhaustive_search]
    # --8<-- [start:exhaustive_search_cosine]
    tbl.search(np.random.random((1536))).metric("cosine").limit(10).to_list()
    tbl.search(np.random.random((1536))).distance_type("cosine").limit(10).to_list()
    # --8<-- [end:exhaustive_search_cosine]
    # --8<-- [start:create_table_with_nested_schema]
    # Let's add 100 sample rows to our dataset

@@ -3,7 +3,9 @@

import lancedb

from lancedb.query import LanceHybridQueryBuilder
import pyarrow as pa
import pyarrow.compute as pc
import pytest
import pytest_asyncio

@@ -67,6 +69,7 @@ async def test_async_hybrid_query_filters(table: AsyncTable):
        .where("text not in ('a', 'dog')")
        .nearest_to([0.3, 0.3])
        .nearest_to_text("*a*")
        .distance_type("l2")
        .limit(2)
        .to_arrow()
    )

@@ -109,3 +112,23 @@ async def test_explain_plan(table: AsyncTable):
    assert "KNNVectorDistance" in plan
    assert "FTS Search Plan" in plan
    assert "LanceScan" in plan


def test_normalize_scores():
    cases = [
        (pa.array([0.1, 0.4]), pa.array([0.0, 1.0])),
        (pa.array([2.0, 10.0, 20.0]), pa.array([0.0, 8.0 / 18.0, 1.0])),
        (pa.array([0.0, 0.0, 0.0]), pa.array([0.0, 0.0, 0.0])),
        (pa.array([10.0, 9.9999999999999]), pa.array([0.0, 0.0])),
    ]

    for input, expected in cases:
        for invert in [True, False]:
            result = LanceHybridQueryBuilder._normalize_scores(input, invert)

            # use a local copy so the expectation is not mutated across
            # iterations, and reduce the comparison to a real boolean
            exp = pc.subtract(1.0, expected) if invert else expected

            assert pc.all(pc.equal(result, exp)).as_py(), (
                f"Expected {exp} but got {result} for invert={invert}"
            )

@@ -80,6 +80,10 @@ async def test_create_scalar_index(some_table: AsyncTable):
    # can also specify index type
    await some_table.create_index("id", config=BTree())

    await some_table.drop_index("id_idx")
    indices = await some_table.list_indices()
    assert len(indices) == 0


@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):

@@ -7,6 +7,7 @@ from pathlib import Path

import lancedb
from lancedb.index import IvfPq, FTS
from lancedb.rerankers.cross_encoder import CrossEncoderReranker
import numpy as np
import pandas.testing as tm
import pyarrow as pa
@@ -69,7 +70,7 @@ async def table_struct_async(tmp_path) -> AsyncTable:


@pytest.fixture
def multivec_table() -> lancedb.table.Table:
def multivec_table(vector_value_type=pa.float32()) -> lancedb.table.Table:
    db = lancedb.connect("memory://")
    # Generate 256 rows of data
    num_rows = 256
@@ -85,7 +86,7 @@ def multivec_table() -> lancedb.table.Table:
    df = pa.table(
        {
            "vector": pa.array(
                vector_data, type=pa.list_(pa.list_(pa.float32(), list_size=2))
                vector_data, type=pa.list_(pa.list_(vector_value_type, list_size=2))
            ),
            "id": pa.array(id_data),
            "float_field": pa.array(float_field_data),
@@ -95,7 +96,7 @@ def multivec_table() -> lancedb.table.Table:


@pytest_asyncio.fixture
async def multivec_table_async(tmp_path) -> AsyncTable:
async def multivec_table_async(vector_value_type=pa.float32()) -> AsyncTable:
    conn = await lancedb.connect_async(
        "memory://", read_consistency_interval=timedelta(seconds=0)
    )
@@ -113,7 +114,7 @@ async def multivec_table_async(tmp_path) -> AsyncTable:
    df = pa.table(
        {
            "vector": pa.array(
                vector_data, type=pa.list_(pa.list_(pa.float32(), list_size=2))
                vector_data, type=pa.list_(pa.list_(vector_value_type, list_size=2))
            ),
            "id": pa.array(id_data),
            "float_field": pa.array(float_field_data),

@@ -231,6 +232,9 @@ async def test_distance_range_async(table_async: AsyncTable):
    assert res["_distance"].to_pylist() == [min_dist, max_dist]


@pytest.mark.parametrize(
    "multivec_table", [pa.float16(), pa.float32(), pa.float64()], indirect=True
)
def test_multivector(multivec_table: lancedb.table.Table):
    # create index on multivector
    multivec_table.create_index(
@@ -261,6 +265,9 @@ def test_multivector(multivec_table: lancedb.table.Table):


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "multivec_table_async", [pa.float16(), pa.float32(), pa.float64()], indirect=True
)
async def test_multivector_async(multivec_table_async: AsyncTable):
    # create index on multivector
    await multivec_table_async.create_index(

@@ -370,14 +377,14 @@ def test_query_builder_with_metric(table):
    df_default = LanceVectorQueryBuilder(table, query, vector_column_name).to_pandas()
    df_l2 = (
        LanceVectorQueryBuilder(table, query, vector_column_name)
        .metric("L2")
        .distance_type("L2")
        .to_pandas()
    )
    tm.assert_frame_equal(df_default, df_l2)

    df_cosine = (
        LanceVectorQueryBuilder(table, query, vector_column_name)
        .metric("cosine")
        .distance_type("cosine")
        .limit(1)
        .to_pandas()
    )
@@ -394,7 +401,7 @@ def test_query_builder_with_different_vector_column():
    vector_column_name = "foo_vector"
    builder = (
        LanceVectorQueryBuilder(table, query, vector_column_name)
        .metric("cosine")
        .distance_type("cosine")
        .where("b < 10")
        .select(["b"])
        .limit(2)

@@ -509,15 +516,24 @@ async def test_query_async(table_async: AsyncTable):
        expected_columns=["id", "vector", "_rowid"],
    )


@pytest.mark.asyncio
@pytest.mark.slow
async def test_query_reranked_async(table_async: AsyncTable):
    # FTS with rerank
    await table_async.create_index("text", config=FTS(with_position=False))
    await check_query(
        table_async.query().nearest_to_text("dog").rerank(),
        table_async.query().nearest_to_text("dog").rerank(CrossEncoderReranker()),
        expected_num_rows=1,
    )

    # Vector query with rerank
    await check_query(table_async.vector_search([1, 2]).rerank(), expected_num_rows=2)
    await check_query(
        table_async.vector_search([1, 2]).rerank(
            CrossEncoderReranker(), query_string="dog"
        ),
        expected_num_rows=2,
    )


@pytest.mark.asyncio

@@ -366,7 +366,7 @@ def test_query_sync_maximal():
    with query_test_table(handler) as table:
        (
            table.search([1, 2, 3], vector_column_name="vector2", fast_search=True)
            .metric("cosine")
            .distance_type("cosine")
            .limit(42)
            .offset(10)
            .refine_factor(10)

@@ -3,6 +3,8 @@ import random

import lancedb
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from lancedb.conftest import MockTextEmbeddingFunction  # noqa
from lancedb.embeddings import EmbeddingFunctionRegistry

@@ -281,6 +283,31 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_linear_combination(tmp_path, use_tantivy):
    reranker = LinearCombinationReranker()

    vector_results = pa.Table.from_pydict(
        {
            "_rowid": [0, 1, 2, 3, 4],
            "_distance": [0.1, 0.2, 0.3, 0.4, 0.5],
            "_text": ["a", "b", "c", "d", "e"],
        }
    )

    fts_results = pa.Table.from_pydict(
        {
            "_rowid": [1, 2, 3, 4, 5],
            "_score": [0.1, 0.2, 0.3, 0.4, 0.5],
            "_text": ["b", "c", "d", "e", "f"],
        }
    )

    combined_results = reranker.merge_results(vector_results, fts_results, 1.0)
    assert len(combined_results) == 6
    assert "_rowid" in combined_results.column_names
    assert "_text" in combined_results.column_names
    assert "_distance" not in combined_results.column_names
    assert "_score" not in combined_results.column_names
    assert "_relevance_score" in combined_results.column_names

    _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)


@@ -290,6 +317,55 @@ def test_rrf_reranker(tmp_path, use_tantivy):
    _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)


def test_rrf_reranker_distance():
    data = pa.table(
        {
            "vector": pa.FixedSizeListArray.from_arrays(
                pc.random(32 * 1024).cast(pa.float32()), 32
            ),
            "text": pa.array(["hello"] * 1024),
        }
    )
    db = lancedb.connect("memory://")
    table = db.create_table("test", data)

    table.create_index(num_partitions=1, num_sub_vectors=2)
    table.create_fts_index("text", use_tantivy=False)

    reranker = RRFReranker(return_score="all")

    hybrid_results = (
        table.search(query_type="hybrid")
        .vector([0.0] * 32)
        .text("hello")
        .with_row_id(True)
        .rerank(reranker)
        .to_list()
    )
    hybrid_distances = {row["_rowid"]: row["_distance"] for row in hybrid_results}
    hybrid_scores = {row["_rowid"]: row["_score"] for row in hybrid_results}

    vector_results = table.search([0.0] * 32).with_row_id(True).to_list()
    vector_distances = {row["_rowid"]: row["_distance"] for row in vector_results}

    fts_results = table.search("hello", query_type="fts").with_row_id(True).to_list()
    fts_scores = {row["_rowid"]: row["_score"] for row in fts_results}

    found_match = False
    for rowid, distance in hybrid_distances.items():
        if rowid in vector_distances:
            found_match = True
            assert distance == vector_distances[rowid], "Distance mismatch"
    assert found_match, "No results matched between hybrid and vector search"

    found_match = False
    for rowid, score in hybrid_scores.items():
        if rowid in fts_scores and fts_scores[rowid] is not None:
            found_match = True
            assert score == fts_scores[rowid], "Score mismatch"
    assert found_match, "No results matched between hybrid and fts search"


@pytest.mark.skipif(
    os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
)
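For reference, reciprocal rank fusion scores each row as a sum of 1/(k + rank) over the result lists it appears in. A tiny sketch; k=60 is the conventional default and an assumption here, not something this diff asserts:

def rrf_score(ranks, k=60):
    # ranks: this row's 1-based rank in each list (vector, FTS) it appears in
    return sum(1.0 / (k + r) for r in ranks)

rrf_score([1, 3])  # a row ranked 1st by vector search and 3rd by FTS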

@@ -1008,6 +1008,10 @@ def test_create_scalar_index(mem_db: DBConnection):
    results = table.search([5, 5]).where("x != 'b'").to_arrow()
    assert results["_distance"][0].as_py() > 0

    table.drop_index(scalar_index.name)
    indices = table.list_indices()
    assert len(indices) == 0


def test_empty_query(mem_db: DBConnection):
    table = mem_db.create_table(

@@ -1238,7 +1242,9 @@ def test_hybrid_search_metric_type(tmp_db: DBConnection):

    # with custom metric
    result_dot = (
        table.search("feeling lucky", query_type="hybrid").metric("dot").to_arrow()
        table.search("feeling lucky", query_type="hybrid")
        .distance_type("dot")
        .to_arrow()
    )
    result_l2 = table.search("feeling lucky", query_type="hybrid").to_arrow()
    assert len(result_dot) > 0

@@ -9,7 +9,10 @@ use arrow::{
};
use futures::stream::StreamExt;
use lancedb::arrow::SendableRecordBatchStream;
use pyo3::{pyclass, pymethods, Bound, PyAny, PyObject, PyRef, PyResult, Python};
use pyo3::{
    exceptions::PyStopAsyncIteration, pyclass, pymethods, Bound, PyAny, PyObject, PyRef, PyResult,
    Python,
};
use pyo3_async_runtimes::tokio::future_into_py;

use crate::error::PythonErrorExt;
@@ -32,20 +35,25 @@ impl RecordBatchStream {

#[pymethods]
impl RecordBatchStream {
    #[getter]
    pub fn schema(&self, py: Python) -> PyResult<PyObject> {
        (*self.schema).clone().into_pyarrow(py)
    }

    pub fn next(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
    pub fn __aiter__(self_: PyRef<'_, Self>) -> PyRef<'_, Self> {
        self_
    }

    pub fn __anext__(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner.clone();
        future_into_py(self_.py(), async move {
            let inner_next = inner.lock().await.next().await;
            inner_next
                .map(|item| {
                    let item = item.infer_error()?;
                    Python::with_gil(|py| item.to_pyarrow(py))
                })
                .transpose()
            let inner_next = inner
                .lock()
                .await
                .next()
                .await
                .ok_or_else(|| PyStopAsyncIteration::new_err(""))?;
            Python::with_gil(|py| inner_next.infer_error()?.to_pyarrow(py))
        })
    }
}
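With __aiter__/__anext__ exposed from Rust, the stream becomes a native Python async iterator. A hedged consumption sketch (the stream object would come from a query's to_batches on the Rust-backed API):

async def consume(stream):
    # the PyStopAsyncIteration raised by __anext__ ends the loop cleanly
    async for batch in stream:
        print(batch.num_rows)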

@@ -21,7 +21,7 @@ use pyo3::{
    types::{PyModule, PyModuleMethods},
    wrap_pyfunction, Bound, PyResult, Python,
};
use query::{Query, VectorQuery};
use query::{FTSQuery, HybridQuery, Query, VectorQuery};
use table::Table;

pub mod arrow;
@@ -42,6 +42,8 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<Table>()?;
    m.add_class::<IndexConfig>()?;
    m.add_class::<Query>()?;
    m.add_class::<FTSQuery>()?;
    m.add_class::<HybridQuery>()?;
    m.add_class::<VectorQuery>()?;
    m.add_class::<RecordBatchStream>()?;
    m.add_function(wrap_pyfunction!(connect, m)?)?;

@@ -194,6 +194,14 @@ impl Table {
        })
    }

    pub fn drop_index(self_: PyRef<'_, Self>, index_name: String) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {
            inner.drop_index(&index_name).await.infer_error()?;
            Ok(())
        })
    }

    pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.15.0-beta.0"
version = "0.15.1-beta.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.15.0-beta.0"
version = "0.15.1-beta.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

@@ -6,7 +6,7 @@ use crate::index::IndexStatistics;
use crate::query::Select;
use crate::table::AddDataMode;
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
use crate::{Error, Table};
use crate::{DistanceType, Error, Table};
use arrow_array::RecordBatchReader;
use arrow_ipc::reader::FileReader;
use arrow_schema::{DataType, SchemaRef};
@@ -592,7 +592,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
                    message: format!("Column {} not found in schema", column),
                })?;
                if supported_vector_data_type(field.data_type()) {
                    ("IVF_PQ", None)
                    ("IVF_PQ", Some(DistanceType::L2))
                } else if supported_btree_data_type(field.data_type()) {
                    ("BTREE", None)
                } else {
@@ -816,6 +816,14 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {

        Ok(Some(stats))
    }

    /// Not yet supported on LanceDB Cloud.
    async fn drop_index(&self, _name: &str) -> Result<()> {
        Err(Error::NotSupported {
            message: "Drop index is not yet supported on LanceDB Cloud.".into(),
        })
    }

    async fn table_definition(&self) -> Result<TableDefinition> {
        Err(Error::NotSupported {
            message: "table_definition is not supported on LanceDB cloud.".into(),

@@ -41,6 +41,7 @@ use lance::dataset::{
    WriteParams,
};
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
use lance::index::vector::utils::infer_vector_dim;
use lance::io::WrappingObjectStore;
use lance_datafusion::exec::execute_plan;
use lance_index::vector::hnsw::builder::HnswBuildParams;
@@ -410,6 +411,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
    async fn update(&self, update: UpdateBuilder) -> Result<u64>;
    async fn create_index(&self, index: IndexBuilder) -> Result<()>;
    async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
    async fn drop_index(&self, name: &str) -> Result<()>;
    async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>>;
    async fn merge_insert(
        &self,
@@ -984,6 +986,18 @@ impl Table {
        self.inner.index_stats(index_name.as_ref()).await
    }

    /// Drop an index from the table.
    ///
    /// Note: This is not yet available in LanceDB Cloud.
    ///
    /// This does not delete the index from disk; it just removes it from the table.
    /// To delete the index, run [`Self::optimize()`] after dropping the index.
    ///
    /// Use [`Self::list_indices()`] to find the names of the indices.
    pub async fn drop_index(&self, name: &str) -> Result<()> {
        self.inner.drop_index(name).await
    }

    // Take many execution plans and map them into a single plan that adds
    // a query_index column and unions them.
    pub(crate) fn multi_vector_plan(

@@ -1370,14 +1384,8 @@ impl NativeTable {
        let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
            n
        } else {
            match field.data_type() {
                arrow_schema::DataType::FixedSizeList(_, n) => {
                    Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
                }
                _ => Err(Error::Schema {
                    message: format!("Column '{}' is not a FixedSizeList", field.name()),
                }),
            }?
            let dim = infer_vector_dim(field.data_type())?;
            suggested_num_sub_vectors(dim as u32)
        };
        let mut dataset = self.dataset.get_mut().await?;
        let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(

@@ -1734,6 +1742,12 @@ impl NativeTable {
    }

    /// Update field metadata
    ///
    /// # Arguments:
    /// * `new_values` - An iterator of tuples where the first element is the
    ///   field id and the second element is a hashmap of metadata key-value
    ///   pairs.
    ///
    pub async fn replace_field_metadata(
        &self,
        new_values: impl IntoIterator<Item = (u32, HashMap<String, String>)>,
@@ -1877,6 +1891,12 @@ impl TableInternal for NativeTable {
        }
    }

    async fn drop_index(&self, index_name: &str) -> Result<()> {
        let mut dataset = self.dataset.get_mut().await?;
        dataset.drop_index(index_name).await?;
        Ok(())
    }

    async fn update(&self, update: UpdateBuilder) -> Result<u64> {
        let dataset = self.dataset.get().await?.clone();
        let mut builder = LanceUpdateBuilder::new(Arc::new(dataset));

@@ -2903,6 +2923,9 @@ mod tests {
        assert_eq!(stats.num_unindexed_rows, 0);
        assert_eq!(stats.index_type, crate::index::IndexType::IvfPq);
        assert_eq!(stats.distance_type, Some(crate::DistanceType::L2));

        table.drop_index(index_name).await.unwrap();
        assert_eq!(table.list_indices().await.unwrap().len(), 0);
    }

    #[tokio::test]

@@ -3513,11 +3536,10 @@ mod tests {
            .unwrap();

        let native_tbl = table.as_native().unwrap();
        let schema = native_tbl.schema().await.unwrap();
        let schema = native_tbl.manifest().await.unwrap().schema;

        let (field_idx, field) = schema.column_with_name("i").unwrap();
        let field_metadata = field.metadata();
        assert_eq!(field_metadata.len(), 0);
        let field = schema.field("i").unwrap();
        assert_eq!(field.metadata.len(), 0);

        native_tbl
            .replace_schema_metadata(vec![(
@@ -3538,16 +3560,15 @@ mod tests {
        let mut new_field_metadata = HashMap::<String, String>::new();
        new_field_metadata.insert("test_field_key1".into(), "test_field_val1".into());
        native_tbl
            .replace_field_metadata(vec![(field_idx as u32, new_field_metadata)])
            .replace_field_metadata(vec![(field.id as u32, new_field_metadata)])
            .await
            .unwrap();

        let schema = native_tbl.schema().await.unwrap();
        let (_field_idx, field) = schema.column_with_name("i").unwrap();
        let field_metadata = field.metadata();
        assert_eq!(field_metadata.len(), 1);
        let schema = native_tbl.manifest().await.unwrap().schema;
        let field = schema.field("i").unwrap();
        assert_eq!(field.metadata.len(), 1);
        assert_eq!(
            field_metadata.get("test_field_key1"),
            field.metadata.get("test_field_key1"),
            Some(&"test_field_val1".to_string())
        );
    }

@@ -17,6 +17,7 @@ use std::sync::Arc;
use arrow_schema::{DataType, Schema};
use lance::arrow::json::JsonDataType;
use lance::dataset::{ReadParams, WriteParams};
use lance::index::vector::utils::infer_vector_dim;
use lance::io::{ObjectStoreParams, WrappingObjectStore};
use lazy_static::lazy_static;

@@ -104,12 +105,12 @@ pub fn validate_table_name(name: &str) -> Result<()> {

/// Find one default column to create index or perform vector query.
pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result<String> {
    // Try to find one fixed size list array column.
    // Try to find a vector column.
    let candidates = schema
        .fields()
        .iter()
        .filter_map(|field| match inf_vector_dim(field) {
            Some(d) if dim.is_none() || dim == Some(d) => Some(field.name()),
        .filter_map(|field| match infer_vector_dim(field.data_type()) {
            Ok(d) if dim.is_none() || dim == Some(d as i32) => Some(field.name()),
            _ => None,
        })
        .collect::<Vec<_>>();

@@ -133,20 +134,6 @@ pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result
    }
}

fn inf_vector_dim(field: &arrow_schema::Field) -> Option<i32> {
    match field.data_type() {
        arrow_schema::DataType::FixedSizeList(f, d) => {
            if f.data_type().is_floating() || f.data_type() == &DataType::UInt8 {
                Some(*d)
            } else {
                None
            }
        }
        arrow_schema::DataType::List(f) => inf_vector_dim(f),
        _ => None,
    }
}

pub fn supported_btree_data_type(dtype: &DataType) -> bool {
    dtype.is_integer()
        || dtype.is_floating()