Compare commits


22 Commits

Author SHA1 Message Date
Lance Release
e5f42a850e Bump version: 0.18.1-beta.0 → 0.18.1-beta.1 2025-01-23 23:01:13 +00:00
Will Jones
7920ecf66e ci(python): stop using deprecated 2_24 manylinux for arm (#2064)
Based on changes made in Lance:

* https://github.com/lancedb/lance/pull/3409
* https://github.com/lancedb/lance/pull/3411
2025-01-23 15:00:34 -08:00
Will Jones
28e1b70e4b fix(python): preserve original distance and score in hybrid queries (#2061)
Fixes #2031

When we do hybrid search, we normalize the scores. We do this
calculation in-place, because the Rerankers expect the `_distance` and
`_score` columns to be the normalized ones. So I've changed the logic so
that we restore the original distances and scores by matching on row ids.
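In essence, the restore step looks up each reranked row id among the pre-normalization results and takes the original value at that position. A minimal sketch with hypothetical toy data:

```python
import pyarrow as pa
import pyarrow.compute as pc

# Hypothetical reranked output and the pre-normalization values.
reranked_row_ids = pa.array([3, 1])
original_row_ids = pa.array([1, 2, 3])
original_distances = pa.array([0.25, 0.75, 0.5])

# Find each reranked row id among the originals, then take the
# original distance at that position.
indices = pc.index_in(reranked_row_ids, original_row_ids, skip_nulls=True)
restored = pc.take(original_distances, indices)  # -> [0.5, 0.25]
```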
2025-01-23 13:54:26 -08:00
Will Jones
52b79d2b1e feat: upgrade lance to v0.23.0-beta.2 (#2063)
Fixes https://github.com/lancedb/lancedb/issues/2043
2025-01-23 13:51:30 -08:00
Bert
c05d45150d docs: clarify the arguments for replace_field_metadata (#2053)
When calling `replace_field_metadata` we pass in an iter of tuples
`(u32, HashMap<String, String>)`.

That `u32` needs to be the field id from the lance schema

7f60aa0a87/rust/lance-core/src/datatypes/field.rs (L123)

This can sometimes differ from the index of the field in the arrow
schema (e.g. if fields have been dropped).

This PR adds docs clarifying what that argument should be, and
corrects the usage in the test (which was improperly passing the
index from the arrow schema).
2025-01-23 08:52:27 -05:00
BubbleCal
48ed3bb544 chore: replace the util to lance's (#2052)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-23 11:04:37 +08:00
Will Jones
bcfc93cc88 fix(python): various fixes for async query builders (#2048)
This includes several improvements and fixes to the Python Async query
builders:

1. The API reference docs show all the methods for each builder
2. The hybrid query builder now has all the same setter methods as the
vector search one, so you can now set things like `.distance_type()` on
a hybrid query.
3. Re-rankers are now properly hooked up and tested for FTS and vector
search. Previously the re-rankers were accidentally bypassed in unit
tests, because the builders overrode `.to_arrow()`, but the unit test
called `.to_batches()` which was only defined in the base class. Now all
builders implement `.to_batches()` and leave `.to_arrow()` to the base
class.
4. The `AsyncQueryBase` and `AsyncVectorQueryBase` setter methods now
return `Self`, which provides the appropriate subclass as the type-hint
return value. Previously, `AsyncQueryBase` had them all hard-coded to
`AsyncQuery`, which was unfortunate. (This required bringing in
`typing-extensions` for older Python versions, but I think it's worth
it.) A minimal sketch of the pattern is shown below.
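A minimal sketch of the `Self` pattern with toy classes (not the real builders):

```python
import sys

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self


class AsyncQueryBase:
    def limit(self, limit: int) -> Self:
        # Self resolves to the subclass, so chaining does not
        # "forget" that this is a vector query.
        self._limit = limit
        return self


class AsyncVectorQuery(AsyncQueryBase):
    def distance_type(self, distance_type: str) -> Self:
        self._distance_type = distance_type
        return self


# Type checkers now see AsyncVectorQuery here, not AsyncQueryBase:
q = AsyncVectorQuery().limit(10).distance_type("cosine")
```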
2025-01-20 16:14:34 -08:00
BubbleCal
214d0debf5 docs: claim LanceDB supports float16/float32/float64 for multivector (#2040) 2025-01-21 07:04:15 +08:00
Will Jones
f059372137 feat: add drop_index() method (#2039)
Closes #1665
2025-01-20 10:08:51 -08:00
Lance Release
3dc1803c07 Bump version: 0.18.0 → 0.18.1-beta.0 2025-01-17 04:37:23 +00:00
BubbleCal
d0501f65f1 fix: linear reranker applies wrong score to combine (#2035)
related to #2014 
this fixes:
- the linear reranker may lose some results if the merge consumes all
vector results before the fts results are exhausted
- the linear reranker inverts the fts score, when only the vector
distance should be inverted

A rough sketch of the corrected merge is shown below.
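A rough sketch of the dict-based merge (the weight, fill value, and helper name are hypothetical; as in the fix, only the vector distance is inverted before combining):

```python
def merge_by_rowid(vector_rows, fts_rows, weight=0.7, fill=1.0):
    # Keep every row from both result sets, keyed by _rowid, so
    # neither side can be dropped when one list runs out early.
    merged = {row["_rowid"]: dict(row) for row in vector_rows}
    for row in fts_rows:
        merged.setdefault(row["_rowid"], dict(row))["_score"] = row["_score"]

    combined = []
    for row in merged.values():
        # Invert only the vector distance; _score is already a relevance.
        vector_score = 1 - row.get("_distance", fill)
        fts_score = row.get("_score", fill)
        row["_relevance_score"] = 1 - (
            weight * vector_score + (1 - weight) * fts_score
        )
        combined.append(row)
    return combined
```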

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-17 11:33:48 +08:00
Bert
4703cc6894 chore: upgrade lance to v0.22.1-beta.3 (#2038) 2025-01-16 12:42:42 -05:00
BubbleCal
493f9ce467 fix: can't infer the vector column for multivector (#2026)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-16 14:08:04 +08:00
Weston Pace
5c759505b8 feat: upgrade lance 0.22.1b1 (#2029)
Now the version actually exists :)
2025-01-15 07:37:37 -08:00
BubbleCal
bb6a39727e fix: missing distance type for auto index on RemoteTable (#2027)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-15 20:28:55 +08:00
BubbleCal
d57bed90e5 docs: add missing example code (#2025) 2025-01-14 21:17:05 -08:00
BubbleCal
648327e90c docs: show how to pack bits for binary vector (#2020)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
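For reference, the packing itself is plain numpy; a 256-dimensional binary vector becomes 32 bytes:

```python
import numpy as np

bits = np.random.randint(0, 2, size=256)  # 256-dim binary vector
packed = np.packbits(bits)                # 32 uint8 values, 8 bits each
assert packed.shape == (256 // 8,)
```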
2025-01-14 09:00:57 -08:00
Lance Release
6c7e81ee57 Updating package-lock.json 2025-01-14 02:14:37 +00:00
Lance Release
905e9d4738 Updating package-lock.json 2025-01-14 01:03:49 +00:00
Lance Release
38642e349c Updating package-lock.json 2025-01-14 01:03:33 +00:00
Lance Release
6879861ea8 Bump version: 0.15.0-beta.1 → 0.15.0 2025-01-14 01:03:04 +00:00
Lance Release
88325e488e Bump version: 0.15.0-beta.0 → 0.15.0-beta.1 2025-01-14 01:02:59 +00:00
56 changed files with 966 additions and 315 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.15.0-beta.0"
current_version = "0.15.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -52,12 +52,7 @@ runs:
args: ${{ inputs.args }}
before-script-linux: |
set -e
apt install -y unzip
if [ $(uname -m) = "x86_64" ]; then
PROTOC_ARCH="x86_64"
else
PROTOC_ARCH="aarch_64"
fi
curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$PROTOC_ARCH.zip > /tmp/protoc.zip \
yum install -y openssl-devel clang \
&& curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-aarch_64.zip > /tmp/protoc.zip \
&& unzip /tmp/protoc.zip -d /usr/local \
&& rm /tmp/protoc.zip

View File

@@ -15,15 +15,21 @@ jobs:
- platform: x86_64
manylinux: "2_17"
extra_args: ""
runner: ubuntu-22.04
- platform: x86_64
manylinux: "2_28"
extra_args: "--features fp16kernels"
runner: ubuntu-22.04
- platform: aarch64
manylinux: "2_24"
manylinux: "2_17"
extra_args: ""
# We don't build fp16 kernels for aarch64, because it uses
# cross compilation image, which doesn't have a new enough compiler.
runs-on: "ubuntu-22.04"
# For successful fat LTO builds, we need a large runner to avoid OOM errors.
runner: ubuntu-2404-8x-arm64
- platform: aarch64
manylinux: "2_28"
extra_args: "--features fp16kernels"
runner: ubuntu-2404-8x-arm64
runs-on: ${{ matrix.config.runner }}
steps:
- uses: actions/checkout@v4
with:

View File

@@ -21,14 +21,16 @@ categories = ["database-implementations"]
rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.22.0", "features" = ["dynamodb"] }
lance-io = "=0.22.0"
lance-index = "=0.22.0"
lance-linalg = "=0.22.0"
lance-table = "=0.22.0"
lance-testing = "=0.22.0"
lance-datafusion = "=0.22.0"
lance-encoding = "=0.22.0"
lance = { "version" = "=0.23.0", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
lance-io = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
lance-index = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
lance-linalg = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
lance-table = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
lance-testing = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
lance-datafusion = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
lance-encoding = { version = "=0.23.0", git = "https://github.com/lancedb/lance.git", tag = "v0.23.0-beta.2" }
# Note that this one does not include pyarrow
arrow = { version = "53.2", optional = false }
arrow-array = "53.2"

View File

@@ -40,37 +40,4 @@ The [quickstart](../basic.md) contains a more complete example.
## Development
```sh
npm run build
npm run test
```
### Running lint / format
LanceDB uses [biome](https://biomejs.dev/) for linting and formatting. If you are using VSCode, you will need to install the official [Biome](https://marketplace.visualstudio.com/items?itemName=biomejs.biome) extension.
To manually lint your code you can run:
```sh
npm run lint
```
To automatically fix all fixable issues:
```sh
npm run lint-fix
```
If you do not have your workspace root set to the `nodejs` directory, unfortunately the extension will not work. You can still run the linting and formatting commands manually.
### Generating docs
```sh
npm run docs
cd ../docs
# Assume the virtual environment was created
# python3 -m venv venv
# pip install -r requirements.txt
. ./venv/bin/activate
mkdocs build
```
See [CONTRIBUTING.md](_media/CONTRIBUTING.md) for information on how to contribute to LanceDB.

View File

@@ -0,0 +1,76 @@
# Contributing to LanceDB Typescript
This document outlines the process for contributing to LanceDB Typescript.
For general contribution guidelines, see [CONTRIBUTING.md](../CONTRIBUTING.md).
## Project layout
The Typescript package is a wrapper around the Rust library, `lancedb`. We use
the [napi-rs](https://napi.rs/) library to create the bindings between Rust and
Typescript.
* `src/`: Rust bindings source code
* `lancedb/`: Typescript package source code
* `__test__/`: Unit tests
* `examples/`: An npm package with the examples shown in the documentation
## Development environment
To set up your development environment, you will need to install the following:
1. Node.js 14 or later
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
Initial setup:
```shell
npm install
```
### Commit Hooks
It is **highly recommended** to install the [pre-commit](https://pre-commit.com/) hooks to ensure that your
code is formatted correctly and passes basic checks before committing:
```shell
pre-commit install
```
## Development
Most common development commands can be run using the npm scripts.
Build the package:
```shell
npm install
npm run build
```
Lint:
```shell
npm run lint
```
Format and fix lints:
```shell
npm run lint-fix
```
Run tests:
```shell
npm test
```
To run a single test:
```shell
# Single file: table.test.ts
npm test -- table.test.ts
# Single test: 'merge insert' in table.test.ts
npm test -- table.test.ts --testNamePattern=merge\ insert
```

View File

@@ -317,6 +317,32 @@ then call ``cleanup_files`` to remove the old files.
***
### dropIndex()
```ts
abstract dropIndex(name): Promise<void>
```
Drop an index from the table.
#### Parameters
* **name**: `string`
The name of the index.
#### Returns
`Promise`&lt;`void`&gt;
#### Note
This does not delete the index from disk, it just removes it from the table.
To delete the index, run [Table#optimize](Table.md#optimize) after dropping the index.
Use [Table.listIndices](Table.md#listindices) to find the names of the indices.
***
### indexStats()
```ts
@@ -336,6 +362,8 @@ List all the stats of a specified index
The stats of the index. If the index does not exist, it will return undefined
Use [Table.listIndices](Table.md#listindices) to find the names of the indices.
***
### isOpen()

View File

@@ -128,6 +128,24 @@ whose data type is a fixed-size-list of floats.
***
### distanceRange()
```ts
distanceRange(lowerBound?, upperBound?): VectorQuery
```
#### Parameters
* **lowerBound?**: `number`
* **upperBound?**: `number`
#### Returns
[`VectorQuery`](VectorQuery.md)
***
### distanceType()
```ts
@@ -528,6 +546,22 @@ distance between the query vector and the actual uncompressed vector.
***
### rerank()
```ts
rerank(reranker): VectorQuery
```
#### Parameters
* **reranker**: [`Reranker`](../namespaces/rerankers/interfaces/Reranker.md)
#### Returns
[`VectorQuery`](VectorQuery.md)
***
### select()
```ts

View File

@@ -7,6 +7,7 @@
## Namespaces
- [embedding](namespaces/embedding/README.md)
- [rerankers](namespaces/rerankers/README.md)
## Enumerations

View File

@@ -68,6 +68,21 @@ The default value is 50.
***
### numBits?
```ts
optional numBits: number;
```
Number of bits per sub-vector.
This value controls how much each sub-vector is compressed. The more bits, the more
accurate the index will be, but the slower the search. The default is 8 bits.
The number of bits must be 4 or 8.
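As a rough size check (standard product-quantization arithmetic with hypothetical numbers, not library internals), each sub-vector stores one code of `numBits`:

```python
num_sub_vectors = 16
for num_bits in (4, 8):
    bytes_per_vector = num_sub_vectors * num_bits / 8
    print(f"{num_bits} bits -> {bytes_per_vector:.0f} bytes per vector")
# 4 bits -> 8 bytes per vector
# 8 bits -> 16 bytes per vector
```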
***
### numPartitions?
```ts

View File

@@ -0,0 +1,17 @@
[**@lancedb/lancedb**](../../README.md) • **Docs**
***
[@lancedb/lancedb](../../globals.md) / rerankers
# rerankers
## Index
### Classes
- [RRFReranker](classes/RRFReranker.md)
### Interfaces
- [Reranker](interfaces/Reranker.md)

View File

@@ -0,0 +1,66 @@
[**@lancedb/lancedb**](../../../README.md) • **Docs**
***
[@lancedb/lancedb](../../../globals.md) / [rerankers](../README.md) / RRFReranker
# Class: RRFReranker
Reranks the results using the Reciprocal Rank Fusion (RRF) algorithm.
Internally, this uses the Rust implementation.
## Constructors
### new RRFReranker()
```ts
new RRFReranker(inner): RRFReranker
```
#### Parameters
* **inner**: `RrfReranker`
#### Returns
[`RRFReranker`](RRFReranker.md)
## Methods
### rerankHybrid()
```ts
rerankHybrid(
query,
vecResults,
ftsResults): Promise<RecordBatch<any>>
```
#### Parameters
* **query**: `string`
* **vecResults**: `RecordBatch`&lt;`any`&gt;
* **ftsResults**: `RecordBatch`&lt;`any`&gt;
#### Returns
`Promise`&lt;`RecordBatch`&lt;`any`&gt;&gt;
***
### create()
```ts
static create(k): Promise<RRFReranker>
```
#### Parameters
* **k**: `number` = `60`
#### Returns
`Promise`&lt;[`RRFReranker`](RRFReranker.md)&gt;
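The RRF formula itself is simple; a generic sketch (not the Rust implementation) using the default `k = 60`:

```python
def rrf(rankings: list[list[int]], k: int = 60) -> dict[int, float]:
    # Each result list contributes 1 / (k + rank) for every row id it holds.
    scores: dict[int, float] = {}
    for ranking in rankings:
        for rank, row_id in enumerate(ranking, start=1):
            scores[row_id] = scores.get(row_id, 0.0) + 1.0 / (k + rank)
    return scores

# Row id 2 ranks high in both lists, so it scores highest.
print(rrf([[1, 2, 3], [2, 3, 4]]))
```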

View File

@@ -0,0 +1,30 @@
[**@lancedb/lancedb**](../../../README.md) • **Docs**
***
[@lancedb/lancedb](../../../globals.md) / [rerankers](../README.md) / Reranker
# Interface: Reranker
## Methods
### rerankHybrid()
```ts
rerankHybrid(
query,
vecResults,
ftsResults): Promise<RecordBatch<any>>
```
#### Parameters
* **query**: `string`
* **vecResults**: `RecordBatch`&lt;`any`&gt;
* **ftsResults**: `RecordBatch`&lt;`any`&gt;
#### Returns
`Promise`&lt;`RecordBatch`&lt;`any`&gt;&gt;

View File

@@ -147,8 +147,19 @@ to return the entire (typically filtered) table. Vector searches return the
rows nearest to a query vector and can be created with the
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search] method.
::: lancedb.query.AsyncQueryBase
::: lancedb.query.AsyncQuery
options:
inherited_members: true
::: lancedb.query.AsyncVectorQuery
options:
inherited_members: true
::: lancedb.query.AsyncFTSQuery
options:
inherited_members: true
::: lancedb.query.AsyncHybridQuery
options:
inherited_members: true

View File

@@ -149,6 +149,7 @@ You can index on a column with multivector type and search on it, the query can
where `sim` is the similarity function (e.g. cosine).
For now, only `cosine` metric is supported for multivector search.
The vector value type can be `float16`, `float32` or `float64`.
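Conceptually, this is the ColBERT-style MaxSim score; a sketch of one common formulation, assuming cosine similarity and hypothetical dimensions:

```python
import numpy as np

def maxsim(query: np.ndarray, doc: np.ndarray) -> float:
    # Cosine-normalize both sets of vectors, then for each query vector
    # take its best match among the document's vectors and sum.
    q = query / np.linalg.norm(query, axis=1, keepdims=True)
    d = doc / np.linalg.norm(doc, axis=1, keepdims=True)
    return float((q @ d.T).max(axis=1).sum())

score = maxsim(np.random.rand(2, 256), np.random.rand(4, 256))
```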
=== "Python"

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.15.0-beta.0</version>
<version>0.15.0-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.15.0-beta.0</version>
<version>0.15.0-final.0</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

node/package-lock.json (generated)
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"cpu": [
"x64",
"arm64"
@@ -52,14 +52,14 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.15.0-beta.0",
"@lancedb/vectordb-darwin-x64": "0.15.0-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.15.0-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.15.0-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.15.0-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.15.0-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.15.0",
"@lancedb/vectordb-darwin-x64": "0.15.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0",
"@lancedb/vectordb-linux-arm64-musl": "0.15.0",
"@lancedb/vectordb-linux-x64-gnu": "0.15.0",
"@lancedb/vectordb-linux-x64-musl": "0.15.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0",
"@lancedb/vectordb-win32-x64-msvc": "0.15.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -330,9 +330,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0-beta.0.tgz",
"integrity": "sha512-4sPAW4p1YFVfURyf0k017l6LRCz+VmN9fVUBy7W27b6EOQ3xuIb3t5xq3JAtslMPWBP3wxP8rKXXDmlbqDg3+g==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0.tgz",
"integrity": "sha512-FnBRsCrxvecjhkMQus9M9RQpXyhu1jxQjYGDaqqRIfcUd3ew7ahIR4qk9FyALHmjpPd72xJZgNLjliHtsIX4/w==",
"cpu": [
"arm64"
],
@@ -343,9 +343,9 @@
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0-beta.0.tgz",
"integrity": "sha512-uzGINrBBsZattB4/ZYxdGNkTxNh3MqE6Y4nF762qo0zWWSiu+QNHQ+ZyLAZ2lwrEvwxs8LUaJNmnpn3nocHc1A==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0.tgz",
"integrity": "sha512-zy+nt1WBCabVI16u2t3sqGUXBOmnF5ZXMsHa9TWYEXVnbw5112K7/1783DTNA/ZBI/WziUa5jqYQ0GOwkgruqA==",
"cpu": [
"x64"
],
@@ -356,9 +356,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0-beta.0.tgz",
"integrity": "sha512-bgphfea8h65vJ+bAL+vb+XEfmjskLZ+trZ3GN4n6SICU7XMGSFPl9xzPLGAj1WsoFCTJHe87DRYQpsWGlOI/LQ==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0.tgz",
"integrity": "sha512-2Pbw+z5Ij5QBvmBxmjaT5F2lNHftVWlarDM1bDc4JtgodJ3Js729qnVLQ0yehnlt+hM6aGFEyn8bH5vf6gEvpQ==",
"cpu": [
"arm64"
],
@@ -369,9 +369,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-musl": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0-beta.0.tgz",
"integrity": "sha512-GpmVgqMS9ztNX53z8v0JdZiG6K1cK+mJnGZd3Gzguiavrly4mkYZ8IKNwWP9RmewUMNsFWR0IzD4VR+ojVpjlQ==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0.tgz",
"integrity": "sha512-WIvgd2EY2maCdYNHPC0C9RprjNWL83FkQKtn591xixltFk3XKgvBQ2USZW2tXndH/WVdvFQvystmZ3dgUrh8DQ==",
"cpu": [
"arm64"
],
@@ -382,9 +382,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0-beta.0.tgz",
"integrity": "sha512-6Y/39TDv4UDVWnl8UpUJ8mqv9rUNc9Q5VR510I7w34c0ChdWvjqdcy+JFnGrraamE1DA8E6wGEz+5oG0zprkNg==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0.tgz",
"integrity": "sha512-Pet3aPE+yQT13Gm0+fh11pgHvImS4X8Uf0zRdzsx0eja7x8j15VrVcZTEVTT4QdBNiZrhXBuiq482NJBsqe6vw==",
"cpu": [
"x64"
],
@@ -395,9 +395,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-musl": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0-beta.0.tgz",
"integrity": "sha512-GRdW2dhf6DmynhRojjtQjs8DeARM1WpbZZKXukeofOSMv6JoRBSWKw2DzW5sF/285IMU81B0OXZE75QjLp+VJg==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0.tgz",
"integrity": "sha512-BC1RvIoEmyOr7ENp618vs9F05gdN7aKlToJNZnGIoi++hRZ25y39B1xxMXQHDnUL8G+Ur9kJObfQ43nVWqueTQ==",
"cpu": [
"x64"
],
@@ -408,9 +408,9 @@
]
},
"node_modules/@lancedb/vectordb-win32-arm64-msvc": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0-beta.0.tgz",
"integrity": "sha512-2EmRHuqqj8kC5ArUZztUWWTfNd774zL68btOlyhYL1CAiet5jIeGuFWJifdh+PXfQeLoa4GLW5LwyudIR4IHwA==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0.tgz",
"integrity": "sha512-H9BeryZl1aLxldtVP0XyiQJyzKStkuxS6SmIg+zaANr9Dns+LmVxYCz429JLC0DlvBWoYjTfK9WJTgMSZXr0Cg==",
"cpu": [
"arm64"
],
@@ -421,9 +421,9 @@
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0-beta.0.tgz",
"integrity": "sha512-lWq9b7LnWMGO0zDsp3rsLYyAzLooV7zQP77ph9Qv9fF0e4egD5l6SmMsAdQqLQnlhbQjkRjt3XRoDsqI809fcw==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0.tgz",
"integrity": "sha512-J8JICux2M82OR27i/4YAbEPlvszuE7EnGIU5jmm2+RTFaptKOCshH1C4D4jEXDAaHcUkVgsxyc9lGmGJCkGLhg==",
"cpu": [
"x64"
],

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"description": " Serverless, low-latency vector database for AI applications",
"private": false,
"main": "dist/index.js",
@@ -92,13 +92,13 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.15.0-beta.0",
"@lancedb/vectordb-darwin-arm64": "0.15.0-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.15.0-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.15.0-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.15.0-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.15.0-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0-beta.0"
"@lancedb/vectordb-darwin-x64": "0.15.0",
"@lancedb/vectordb-darwin-arm64": "0.15.0",
"@lancedb/vectordb-linux-x64-gnu": "0.15.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0",
"@lancedb/vectordb-linux-x64-musl": "0.15.0",
"@lancedb/vectordb-linux-arm64-musl": "0.15.0",
"@lancedb/vectordb-win32-x64-msvc": "0.15.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.15.0-beta.0"
version = "0.15.0"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -473,6 +473,10 @@ describe("When creating an index", () => {
// test offset
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
expect(rst.numRows).toBe(1);
await tbl.dropIndex("vec_idx");
const indices2 = await tbl.listIndices();
expect(indices2.length).toBe(0);
});
it("should search with distance range", async () => {

View File

@@ -226,6 +226,19 @@ export abstract class Table {
column: string,
options?: Partial<IndexOptions>,
): Promise<void>;
/**
* Drop an index from the table.
*
* @param name The name of the index.
*
* @note This does not delete the index from disk, it just removes it from the table.
* To delete the index, run {@link Table#optimize} after dropping the index.
*
* Use {@link Table.listIndices} to find the names of the indices.
*/
abstract dropIndex(name: string): Promise<void>;
/**
* Create a {@link Query} Builder.
*
@@ -426,6 +439,8 @@ export abstract class Table {
*
* @param {string} name The name of the index.
* @returns {IndexStatistics | undefined} The stats of the index. If the index does not exist, it will return undefined
*
* Use {@link Table.listIndices} to find the names of the indices.
*/
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
@@ -591,6 +606,10 @@ export class LocalTable extends Table {
await this.inner.createIndex(nativeIndex, column, options?.replace);
}
async dropIndex(name: string): Promise<void> {
await this.inner.dropIndex(name);
}
query(): Query {
return new Query(this.inner);
}

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.15.0-beta.0",
"version": "0.15.0",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.15.0-beta.0",
"version": "0.15.0",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -135,6 +135,14 @@ impl Table {
builder.execute().await.default_error()
}
#[napi(catch_unwind)]
pub async fn drop_index(&self, index_name: String) -> napi::Result<()> {
self.inner_ref()?
.drop_index(&index_name)
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn update(
&self,

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.18.0"
current_version = "0.18.1-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.18.0"
version = "0.18.1-beta.1"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -4,7 +4,7 @@ name = "lancedb"
dynamic = ["version"]
dependencies = [
"deprecation",
"pylance==0.22.0",
"pylance==0.23.0b2",
"tqdm>=4.27.0",
"pydantic>=1.10",
"packaging",
@@ -55,7 +55,7 @@ tests = [
"tantivy",
"pyarrow-stubs",
]
dev = ["ruff", "pre-commit", "pyright"]
dev = ["ruff", "pre-commit", "pyright", 'typing-extensions>=4.0.0; python_version < "3.11"']
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
clip = ["torch", "pillow", "open-clip"]
embeddings = [

View File

@@ -84,11 +84,15 @@ class RecordBatchStream:
class Query:
def where(self, filter: str): ...
def select(self, columns: Tuple[str, str]): ...
def select_columns(self, columns: List[str]): ...
def limit(self, limit: int): ...
def offset(self, offset: int): ...
def fast_search(self): ...
def with_row_id(self): ...
def postfilter(self): ...
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
def nearest_to_text(self, query: dict) -> FTSQuery: ...
async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...
async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
class FTSQuery:
def where(self, filter: str): ...
@@ -98,6 +102,8 @@ class FTSQuery:
def fast_search(self): ...
def with_row_id(self): ...
def postfilter(self): ...
def get_query(self) -> str: ...
def add_query_vector(self, query_vec: pa.Array) -> None: ...
def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
async def explain_plan(self) -> str: ...

View File

@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional, Union
import pyarrow as pa
@@ -12,17 +12,27 @@ class AsyncRecordBatchReader:
Also allows access to the schema of the stream
"""
def __init__(self, inner: RecordBatchStream):
self.inner_ = inner
@property
def schema(self) -> pa.Schema:
def __init__(
self,
inner: Union[RecordBatchStream, pa.Table],
max_batch_length: Optional[int] = None,
):
"""
Get the schema of the batches produced by the stream
Accessing the schema does not consume any data from the stream
Attributes
----------
schema : pa.Schema
The schema of the batches produced by the stream.
Accessing the schema does not consume any data from the stream
"""
return self.inner_.schema()
if isinstance(inner, pa.Table):
self._inner = self._async_iter_from_table(inner, max_batch_length)
self.schema: pa.Schema = inner.schema
elif isinstance(inner, RecordBatchStream):
self._inner = inner
self.schema: pa.Schema = inner.schema
else:
raise TypeError("inner must be a RecordBatchStream or a Table")
async def read_all(self) -> List[pa.RecordBatch]:
"""
@@ -38,7 +48,18 @@ class AsyncRecordBatchReader:
return self
async def __anext__(self) -> pa.RecordBatch:
next = await self.inner_.next()
if next is None:
raise StopAsyncIteration
return next
return await self._inner.__anext__()
@staticmethod
async def _async_iter_from_table(
table: pa.Table, max_batch_length: Optional[int] = None
):
"""
Create an AsyncRecordBatchReader from a Table
This is useful when you have a Table that you want to iterate
over asynchronously
"""
batches = table.to_batches(max_chunksize=max_batch_length)
for batch in batches:
yield batch
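Hypothetical usage of the new `pa.Table` branch (the import path is an assumption for illustration):

```python
import asyncio

import pyarrow as pa

from lancedb.arrow import AsyncRecordBatchReader  # assumed import path


async def main():
    table = pa.table({"x": list(range(10))})
    reader = AsyncRecordBatchReader(table, max_batch_length=4)
    async for batch in reader:
        print(batch.num_rows)  # 4, 4, 2

asyncio.run(main())
```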

View File

@@ -20,6 +20,7 @@ import asyncio
import deprecation
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.fs as pa_fs
import pydantic
@@ -31,6 +32,7 @@ from .rerankers.util import check_reranker_result
from .util import safe_import_pandas, flatten_columns
if TYPE_CHECKING:
import sys
import PIL
import polars as pl
@@ -42,6 +44,11 @@ if TYPE_CHECKING:
from .pydantic import LanceModel
from .table import Table
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
pd = safe_import_pandas()
@@ -1183,18 +1190,52 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")
# normalize the scores to be between 0 and 1, 0 being most relevant
vector_results = LanceHybridQueryBuilder._normalize_scores(
vector_results, "_distance"
)
# We check whether the results (vector and FTS) are empty, because when
# they are, they often are missing the _rowid column, which causes an error
if vector_results.num_rows > 0:
distance_i = vector_results.column_names.index("_distance")
original_distances = vector_results.column(distance_i)
original_distance_row_ids = vector_results.column("_rowid")
vector_results = vector_results.set_column(
distance_i,
vector_results.field(distance_i),
LanceHybridQueryBuilder._normalize_scores(original_distances),
)
# In fts higher scores represent relevance. Not inverting them here as
# rerankers might need to preserve this score to support `return_score="all"`
fts_results = LanceHybridQueryBuilder._normalize_scores(fts_results, "_score")
if fts_results.num_rows > 0:
score_i = fts_results.column_names.index("_score")
original_scores = fts_results.column(score_i)
original_score_row_ids = fts_results.column("_rowid")
fts_results = fts_results.set_column(
score_i,
fts_results.field(score_i),
LanceHybridQueryBuilder._normalize_scores(original_scores),
)
results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)
check_reranker_result(results)
if "_distance" in results.column_names:
# restore the original distances
indices = pc.index_in(
results["_rowid"], original_distance_row_ids, skip_nulls=True
)
original_distances = pc.take(original_distances, indices)
distance_i = results.column_names.index("_distance")
results = results.set_column(distance_i, "_distance", original_distances)
if "_score" in results.column_names:
# restore the original scores
indices = pc.index_in(
results["_rowid"], original_score_row_ids, skip_nulls=True
)
original_scores = pc.take(original_scores, indices)
score_i = results.column_names.index("_score")
results = results.set_column(score_i, "_score", original_scores)
results = results.slice(length=limit)
if not with_row_ids:
@@ -1224,28 +1265,23 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
return results
@staticmethod
def _normalize_scores(results: pa.Table, column: str, invert=False):
if len(results) == 0:
return results
# Get the _score column from results
scores = results.column(column).to_numpy()
def _normalize_scores(scores: pa.Array, invert=False) -> pa.Array:
if len(scores) == 0:
return scores
# normalize the scores by subtracting the min and dividing by the max
max, min = np.max(scores), np.min(scores)
if np.isclose(max, min):
rng = max
else:
rng = max - min
# If rng is 0 then min and max are both 0 and so we can leave the scores as is
if rng != 0:
scores = (scores - min) / rng
min, max = pc.min_max(scores).values()
rng = pc.subtract(max, min)
if not pc.equal(rng, pa.scalar(0.0)).as_py():
scores = pc.divide(pc.subtract(scores, min), rng)
elif not pc.equal(max, pa.scalar(0.0)).as_py():
# If rng is 0, then we at least want the scores to be 0
scores = pc.subtract(scores, min)
if invert:
scores = 1 - scores
# replace the _score column with the ranks
_score_idx = results.column_names.index(column)
results = results.set_column(
_score_idx, column, pa.array(scores, type=pa.float32())
)
return results
scores = pc.subtract(1, scores)
return scores
def rerank(
self,
@@ -1418,7 +1454,7 @@ class AsyncQueryBase(object):
"""
self._inner = inner
def where(self, predicate: str) -> AsyncQuery:
def where(self, predicate: str) -> Self:
"""
Only return rows matching the given predicate
@@ -1437,7 +1473,7 @@ class AsyncQueryBase(object):
self._inner.where(predicate)
return self
def select(self, columns: Union[List[str], dict[str, str]]) -> AsyncQuery:
def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
"""
Return only the specified columns.
@@ -1475,7 +1511,7 @@ class AsyncQueryBase(object):
raise TypeError("columns must be a list of column names or a dict")
return self
def limit(self, limit: int) -> AsyncQuery:
def limit(self, limit: int) -> Self:
"""
Set the maximum number of results to return.
@@ -1485,7 +1521,7 @@ class AsyncQueryBase(object):
self._inner.limit(limit)
return self
def offset(self, offset: int) -> AsyncQuery:
def offset(self, offset: int) -> Self:
"""
Set the offset for the results.
@@ -1497,7 +1533,7 @@ class AsyncQueryBase(object):
self._inner.offset(offset)
return self
def fast_search(self) -> AsyncQuery:
def fast_search(self) -> Self:
"""
Skip searching un-indexed data.
@@ -1511,14 +1547,14 @@ class AsyncQueryBase(object):
self._inner.fast_search()
return self
def with_row_id(self) -> AsyncQuery:
def with_row_id(self) -> Self:
"""
Include the _rowid column in the results.
"""
self._inner.with_row_id()
return self
def postfilter(self) -> AsyncQuery:
def postfilter(self) -> Self:
"""
If this is called then filtering will happen after the search instead of
before.
@@ -1754,7 +1790,7 @@ class AsyncQuery(AsyncQueryBase):
raise ValueError("query_vector can not be None")
if (
isinstance(query_vector, list)
isinstance(query_vector, (list, np.ndarray, pa.Array))
and len(query_vector) > 0
and isinstance(query_vector[0], (list, np.ndarray, pa.Array))
):
@@ -1807,8 +1843,8 @@ class AsyncFTSQuery(AsyncQueryBase):
self._inner = inner
self._reranker = None
def get_query(self):
self._inner.get_query()
def get_query(self) -> str:
return self._inner.get_query()
def rerank(
self,
@@ -1891,29 +1927,18 @@ class AsyncFTSQuery(AsyncQueryBase):
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
)
async def to_arrow(self) -> pa.Table:
results = await super().to_arrow()
async def to_batches(
self, *, max_batch_length: Optional[int] = None
) -> AsyncRecordBatchReader:
reader = await super().to_batches()
results = pa.Table.from_batches(await reader.read_all(), reader.schema)
if self._reranker:
results = self._reranker.rerank_fts(results)
return results
results = self._reranker.rerank_fts(self.get_query(), results)
return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
class AsyncVectorQuery(AsyncQueryBase):
def __init__(self, inner: LanceVectorQuery):
"""
Construct an AsyncVectorQuery
This method is not intended to be called directly. Instead, create
a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to]] to convert to
a vector query. Or you can use
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
"""
super().__init__(inner)
self._inner = inner
self._reranker = None
def column(self, column: str) -> AsyncVectorQuery:
class AsyncVectorQueryBase:
def column(self, column: str) -> Self:
"""
Set the vector column to query
@@ -1926,7 +1951,7 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.column(column)
return self
def nprobes(self, nprobes: int) -> AsyncVectorQuery:
def nprobes(self, nprobes: int) -> Self:
"""
Set the number of partitions to search (probe)
@@ -1954,7 +1979,7 @@ class AsyncVectorQuery(AsyncQueryBase):
def distance_range(
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
) -> AsyncVectorQuery:
) -> Self:
"""Set the distance range to use.
Only rows with distances within range [lower_bound, upper_bound)
@@ -1975,7 +2000,7 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.distance_range(lower_bound, upper_bound)
return self
def ef(self, ef: int) -> AsyncVectorQuery:
def ef(self, ef: int) -> Self:
"""
Set the number of candidates to consider during search
@@ -1990,7 +2015,7 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.ef(ef)
return self
def refine_factor(self, refine_factor: int) -> AsyncVectorQuery:
def refine_factor(self, refine_factor: int) -> Self:
"""
A multiplier to control how many additional rows are taken during the refine
step
@@ -2026,7 +2051,7 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.refine_factor(refine_factor)
return self
def distance_type(self, distance_type: str) -> AsyncVectorQuery:
def distance_type(self, distance_type: str) -> Self:
"""
Set the distance metric to use
@@ -2044,7 +2069,7 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.distance_type(distance_type)
return self
def bypass_vector_index(self) -> AsyncVectorQuery:
def bypass_vector_index(self) -> Self:
"""
If this is called then any vector index is skipped
@@ -2057,6 +2082,23 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.bypass_vector_index()
return self
class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
def __init__(self, inner: LanceVectorQuery):
"""
Construct an AsyncVectorQuery
This method is not intended to be called directly. Instead, create
a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to]] to convert to
a vector query. Or you can use
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
"""
super().__init__(inner)
self._inner = inner
self._reranker = None
self._query_string = None
def rerank(
self, reranker: Reranker = RRFReranker(), query_string: Optional[str] = None
) -> AsyncHybridQuery:
@@ -2065,6 +2107,11 @@ class AsyncVectorQuery(AsyncQueryBase):
self._reranker = reranker
if not self._query_string and not query_string:
raise ValueError("query_string must be provided to rerank the results.")
self._query_string = query_string
return self
def nearest_to_text(
@@ -2100,14 +2147,17 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.nearest_to_text({"query": query, "columns": columns})
)
async def to_arrow(self) -> pa.Table:
results = await super().to_arrow()
async def to_batches(
self, *, max_batch_length: Optional[int] = None
) -> AsyncRecordBatchReader:
reader = await super().to_batches()
results = pa.Table.from_batches(await reader.read_all(), reader.schema)
if self._reranker:
results = self._reranker.rerank_vector(results)
return results
results = self._reranker.rerank_vector(self._query_string, results)
return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
class AsyncHybridQuery(AsyncQueryBase):
class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
"""
A query builder that performs hybrid vector and full text search.
Results are combined and reranked based on the specified reranker.
@@ -2155,10 +2205,9 @@ class AsyncHybridQuery(AsyncQueryBase):
return self
async def to_batches(self):
raise NotImplementedError("to_batches not yet supported on a hybrid query")
async def to_arrow(self) -> pa.Table:
async def to_batches(
self, *, max_batch_length: Optional[int] = None
) -> AsyncRecordBatchReader:
fts_query = AsyncFTSQuery(self._inner.to_fts_query())
vec_query = AsyncVectorQuery(self._inner.to_vector_query())
@@ -2173,7 +2222,7 @@ class AsyncHybridQuery(AsyncQueryBase):
vec_query.to_arrow(),
)
return LanceHybridQueryBuilder._combine_hybrid_results(
result = LanceHybridQueryBuilder._combine_hybrid_results(
fts_results=fts_results,
vector_results=vector_results,
norm=self._norm,
@@ -2183,6 +2232,8 @@ class AsyncHybridQuery(AsyncQueryBase):
with_row_ids=with_row_ids,
)
return AsyncRecordBatchReader(result, max_batch_length=max_batch_length)
async def explain_plan(self, verbose: Optional[bool] = False):
"""Return the execution plan for this query.

View File

@@ -11,6 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from numpy import nan
import pyarrow as pa
@@ -95,43 +96,22 @@ class LinearCombinationReranker(Reranker):
pa.array([nan] * len(vector_results), type=pa.float32()),
)
return results
# sort both input tables on _rowid
combined_list = []
vector_list = vector_results.sort_by("_rowid").to_pylist()
fts_list = fts_results.sort_by("_rowid").to_pylist()
i, j = 0, 0
while i < len(vector_list):
if j >= len(fts_list):
for vi in vector_list[i:]:
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
combined_list.append(vi)
break
vi = vector_list[i]
fj = fts_list[j]
# invert the fts score from relevance to distance
inverted_fts_score = self._invert_score(fj["_score"])
if vi["_rowid"] == fj["_rowid"]:
vi["_relevance_score"] = self._combine_score(
vi["_distance"], inverted_fts_score
)
vi["_score"] = fj["_score"] # keep the original score
combined_list.append(vi)
i += 1
j += 1
elif vector_list[i]["_rowid"] < fts_list[j]["_rowid"]:
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
combined_list.append(vi)
i += 1
results = defaultdict()
for vector_result in vector_results.to_pylist():
results[vector_result["_rowid"]] = vector_result
for fts_result in fts_results.to_pylist():
row_id = fts_result["_rowid"]
if row_id in results:
results[row_id]["_score"] = fts_result["_score"]
else:
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
combined_list.append(fj)
j += 1
if j < len(fts_list) - 1:
for fj in fts_list[j:]:
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
combined_list.append(fj)
results[row_id] = fts_result
combined_list = []
for row_id, result in results.items():
vector_score = self._invert_score(result.get("_distance", fill))
fts_score = result.get("_score", fill)
result["_relevance_score"] = self._combine_score(vector_score, fts_score)
combined_list.append(result)
relevance_score_schema = pa.schema(
[
@@ -148,10 +128,10 @@ class LinearCombinationReranker(Reranker):
tbl = self._keep_relevance_score(tbl)
return tbl
def _combine_score(self, score1, score2):
def _combine_score(self, vector_score, fts_score):
# these scores represent distance
return 1 - (self.weight * score1 + (1 - self.weight) * score2)
return 1 - (self.weight * vector_score + (1 - self.weight) * fts_score)
def _invert_score(self, score: float):
def _invert_score(self, dist: float):
# Invert the score between relevance and distance
return 1 - score
return 1 - dist

View File

@@ -586,6 +586,26 @@ class Table(ABC):
"""
raise NotImplementedError
def drop_index(self, name: str) -> None:
"""
Drop an index from the table.
Parameters
----------
name: str
The name of the index to drop.
Notes
-----
This does not delete the index from disk, it just removes it from the table.
To delete the index, run [optimize][lancedb.table.Table.optimize]
after dropping the index.
Use [list_indices][lancedb.table.Table.list_indices] to find the names of
the indices.
"""
raise NotImplementedError
@abstractmethod
def create_scalar_index(
self,
@@ -1594,6 +1614,9 @@ class LanceTable(Table):
)
)
def drop_index(self, name: str) -> None:
return LOOP.run(self._table.drop_index(name))
def create_scalar_index(
self,
column: str,
@@ -2716,6 +2739,26 @@ class AsyncTable:
add_note(e, help_msg)
raise e
async def drop_index(self, name: str) -> None:
"""
Drop an index from the table.
Parameters
----------
name: str
The name of the index to drop.
Notes
-----
This does not delete the index from disk, it just removes it from the table.
To delete the index, run [optimize][lancedb.table.AsyncTable.optimize]
after dropping the index.
Use [list_indices][lancedb.table.AsyncTable.list_indices] to find the names
of the indices.
"""
await self._inner.drop_index(name)
async def add(
self,
data: DATA,

View File

@@ -3,6 +3,7 @@ import shutil
# --8<-- [start:imports]
import lancedb
import numpy as np
import pyarrow as pa
import pytest
# --8<-- [end:imports]
@@ -12,16 +13,32 @@ shutil.rmtree("data/binary_lancedb", ignore_errors=True)
def test_binary_vector():
# --8<-- [start:sync_binary_vector]
db = lancedb.connect("data/binary_lancedb")
data = [
{
"id": i,
"vector": np.random.randint(0, 256, size=16),
}
for i in range(1024)
]
tbl = db.create_table("my_binary_vectors", data=data)
query = np.random.randint(0, 256, size=16)
tbl.search(query).metric("hamming").to_arrow()
schema = pa.schema(
[
pa.field("id", pa.int64()),
# for dim=256, lance stores every 8 bits in a byte
# so the vector field should be a list of 256 / 8 = 32 bytes
pa.field("vector", pa.list_(pa.uint8(), 32)),
]
)
tbl = db.create_table("my_binary_vectors", schema=schema)
data = []
for i in range(1024):
vector = np.random.randint(0, 2, size=256)
# pack the binary vector into bytes to save space
packed_vector = np.packbits(vector)
data.append(
{
"id": i,
"vector": packed_vector,
}
)
tbl.add(data)
query = np.random.randint(0, 2, size=256)
packed_query = np.packbits(query)
tbl.search(packed_query).metric("hamming").to_arrow()
# --8<-- [end:sync_binary_vector]
db.drop_table("my_binary_vectors")
@@ -30,15 +47,31 @@ def test_binary_vector():
async def test_binary_vector_async():
# --8<-- [start:async_binary_vector]
db = await lancedb.connect_async("data/binary_lancedb")
data = [
{
"id": i,
"vector": np.random.randint(0, 256, size=16),
}
for i in range(1024)
]
tbl = await db.create_table("my_binary_vectors", data=data)
query = np.random.randint(0, 256, size=16)
await tbl.query().nearest_to(query).distance_type("hamming").to_arrow()
schema = pa.schema(
[
pa.field("id", pa.int64()),
# for dim=256, lance stores every 8 bits in a byte
# so the vector field should be a list of 256 / 8 = 32 bytes
pa.field("vector", pa.list_(pa.uint8(), 32)),
]
)
tbl = await db.create_table("my_binary_vectors", schema=schema)
data = []
for i in range(1024):
vector = np.random.randint(0, 2, size=256)
# pack the binary vector into bytes to save space
packed_vector = np.packbits(vector)
data.append(
{
"id": i,
"vector": packed_vector,
}
)
await tbl.add(data)
query = np.random.randint(0, 2, size=256)
packed_query = np.packbits(query)
await tbl.query().nearest_to(packed_query).distance_type("hamming").to_arrow()
# --8<-- [end:async_binary_vector]
await db.drop_table("my_binary_vectors")

View File

@@ -0,0 +1,80 @@
import shutil
from lancedb.index import IvfPq
import pytest
# --8<-- [start:imports]
import lancedb
import numpy as np
import pyarrow as pa
# --8<-- [end:imports]
shutil.rmtree("data/multivector_demo", ignore_errors=True)
def test_multivector():
# --8<-- [start:sync_multivector]
db = lancedb.connect("data/multivector_demo")
schema = pa.schema(
[
pa.field("id", pa.int64()),
# float16, float32, and float64 are supported
pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
]
)
data = [
{
"id": i,
"vector": np.random.random(size=(2, 256)).tolist(),
}
for i in range(1024)
]
tbl = db.create_table("my_table", data=data, schema=schema)
# only cosine similarity is supported for multi-vectors
tbl.create_index(metric="cosine")
# query with single vector
query = np.random.random(256).astype(np.float16)
tbl.search(query).to_arrow()
# query with multiple vectors
query = np.random.random(size=(2, 256))
tbl.search(query).to_arrow()
# --8<-- [end:sync_multivector]
db.drop_table("my_table")
@pytest.mark.asyncio
async def test_multivector_async():
# --8<-- [start:async_multivector]
db = await lancedb.connect_async("data/multivector_demo")
schema = pa.schema(
[
pa.field("id", pa.int64()),
# float16, float32, and float64 are supported
pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
]
)
data = [
{
"id": i,
"vector": np.random.random(size=(2, 256)).tolist(),
}
for i in range(1024)
]
tbl = await db.create_table("my_table", data=data, schema=schema)
# only cosine similarity is supported for multi-vectors
await tbl.create_index(column="vector", config=IvfPq(distance_type="cosine"))
# query with single vector
query = np.random.random(256)
await tbl.query().nearest_to(query).to_arrow()
# query with multiple vectors
query = np.random.random(size=(2, 256))
await tbl.query().nearest_to(query).to_arrow()
# --8<-- [end:async_multivector]
await db.drop_table("my_table")

View File

@@ -3,7 +3,9 @@
import lancedb
from lancedb.query import LanceHybridQueryBuilder
import pyarrow as pa
import pyarrow.compute as pc
import pytest
import pytest_asyncio
@@ -67,6 +69,7 @@ async def test_async_hybrid_query_filters(table: AsyncTable):
.where("text not in ('a', 'dog')")
.nearest_to([0.3, 0.3])
.nearest_to_text("*a*")
.distance_type("l2")
.limit(2)
.to_arrow()
)
@@ -109,3 +112,23 @@ async def test_explain_plan(table: AsyncTable):
assert "KNNVectorDistance" in plan
assert "FTS Search Plan" in plan
assert "LanceScan" in plan
def test_normalize_scores():
cases = [
(pa.array([0.1, 0.4]), pa.array([0.0, 1.0])),
(pa.array([2.0, 10.0, 20.0]), pa.array([0.0, 8.0 / 18.0, 1.0])),
(pa.array([0.0, 0.0, 0.0]), pa.array([0.0, 0.0, 0.0])),
(pa.array([10.0, 9.9999999999999]), pa.array([0.0, 0.0])),
]
for input, expected in cases:
for invert in [True, False]:
result = LanceHybridQueryBuilder._normalize_scores(input, invert)
if invert:
expected = pc.subtract(1.0, expected)
assert pc.equal(
result, expected
), f"Expected {expected} but got {result} for invert={invert}"

View File

@@ -80,6 +80,10 @@ async def test_create_scalar_index(some_table: AsyncTable):
# can also specify index type
await some_table.create_index("id", config=BTree())
await some_table.drop_index("id_idx")
indices = await some_table.list_indices()
assert len(indices) == 0
@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):

View File

@@ -7,6 +7,7 @@ from pathlib import Path
import lancedb
from lancedb.index import IvfPq, FTS
from lancedb.rerankers.cross_encoder import CrossEncoderReranker
import numpy as np
import pandas.testing as tm
import pyarrow as pa
@@ -69,7 +70,7 @@ async def table_struct_async(tmp_path) -> AsyncTable:
@pytest.fixture
def multivec_table() -> lancedb.table.Table:
def multivec_table(vector_value_type=pa.float32()) -> lancedb.table.Table:
db = lancedb.connect("memory://")
# Generate 256 rows of data
num_rows = 256
@@ -85,7 +86,7 @@ def multivec_table() -> lancedb.table.Table:
df = pa.table(
{
"vector": pa.array(
vector_data, type=pa.list_(pa.list_(pa.float32(), list_size=2))
vector_data, type=pa.list_(pa.list_(vector_value_type, list_size=2))
),
"id": pa.array(id_data),
"float_field": pa.array(float_field_data),
@@ -95,7 +96,7 @@ def multivec_table() -> lancedb.table.Table:
@pytest_asyncio.fixture
async def multivec_table_async(tmp_path) -> AsyncTable:
async def multivec_table_async(vector_value_type=pa.float32()) -> AsyncTable:
conn = await lancedb.connect_async(
"memory://", read_consistency_interval=timedelta(seconds=0)
)
@@ -113,7 +114,7 @@ async def multivec_table_async(tmp_path) -> AsyncTable:
df = pa.table(
{
"vector": pa.array(
vector_data, type=pa.list_(pa.list_(pa.float32(), list_size=2))
vector_data, type=pa.list_(pa.list_(vector_value_type, list_size=2))
),
"id": pa.array(id_data),
"float_field": pa.array(float_field_data),
@@ -231,6 +232,9 @@ async def test_distance_range_async(table_async: AsyncTable):
assert res["_distance"].to_pylist() == [min_dist, max_dist]
@pytest.mark.parametrize(
"multivec_table", [pa.float16(), pa.float32(), pa.float64()], indirect=True
)
def test_multivector(multivec_table: lancedb.table.Table):
# create index on multivector
multivec_table.create_index(
@@ -261,6 +265,9 @@ def test_multivector(multivec_table: lancedb.table.Table):
@pytest.mark.asyncio
@pytest.mark.parametrize(
"multivec_table_async", [pa.float16(), pa.float32(), pa.float64()], indirect=True
)
async def test_multivector_async(multivec_table_async: AsyncTable):
# create index on multivector
await multivec_table_async.create_index(
@@ -509,15 +516,24 @@ async def test_query_async(table_async: AsyncTable):
expected_columns=["id", "vector", "_rowid"],
)
@pytest.mark.asyncio
@pytest.mark.slow
async def test_query_reranked_async(table_async: AsyncTable):
# FTS with rerank
await table_async.create_index("text", config=FTS(with_position=False))
await check_query(
table_async.query().nearest_to_text("dog").rerank(),
table_async.query().nearest_to_text("dog").rerank(CrossEncoderReranker()),
expected_num_rows=1,
)
# Vector query with rerank
await check_query(table_async.vector_search([1, 2]).rerank(), expected_num_rows=2)
await check_query(
table_async.vector_search([1, 2]).rerank(
CrossEncoderReranker(), query_string="dog"
),
expected_num_rows=2,
)
@pytest.mark.asyncio

View File

@@ -3,6 +3,8 @@ import random
import lancedb
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from lancedb.conftest import MockTextEmbeddingFunction # noqa
from lancedb.embeddings import EmbeddingFunctionRegistry
@@ -281,6 +283,31 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_linear_combination(tmp_path, use_tantivy):
reranker = LinearCombinationReranker()
vector_results = pa.Table.from_pydict(
{
"_rowid": [0, 1, 2, 3, 4],
"_distance": [0.1, 0.2, 0.3, 0.4, 0.5],
"_text": ["a", "b", "c", "d", "e"],
}
)
fts_results = pa.Table.from_pydict(
{
"_rowid": [1, 2, 3, 4, 5],
"_score": [0.1, 0.2, 0.3, 0.4, 0.5],
"_text": ["b", "c", "d", "e", "f"],
}
)
combined_results = reranker.merge_results(vector_results, fts_results, 1.0)
assert len(combined_results) == 6
assert "_rowid" in combined_results.column_names
assert "_text" in combined_results.column_names
assert "_distance" not in combined_results.column_names
assert "_score" not in combined_results.column_names
assert "_relevance_score" in combined_results.column_names
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
@@ -290,6 +317,55 @@ def test_rrf_reranker(tmp_path, use_tantivy):
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
def test_rrf_reranker_distance():
data = pa.table(
{
"vector": pa.FixedSizeListArray.from_arrays(
pc.random(32 * 1024).cast(pa.float32()), 32
),
"text": pa.array(["hello"] * 1024),
}
)
db = lancedb.connect("memory://")
table = db.create_table("test", data)
table.create_index(num_partitions=1, num_sub_vectors=2)
table.create_fts_index("text", use_tantivy=False)
reranker = RRFReranker(return_score="all")
hybrid_results = (
table.search(query_type="hybrid")
.vector([0.0] * 32)
.text("hello")
.with_row_id(True)
.rerank(reranker)
.to_list()
)
hybrid_distances = {row["_rowid"]: row["_distance"] for row in hybrid_results}
hybrid_scores = {row["_rowid"]: row["_score"] for row in hybrid_results}
vector_results = table.search([0.0] * 32).with_row_id(True).to_list()
vector_distances = {row["_rowid"]: row["_distance"] for row in vector_results}
fts_results = table.search("hello", query_type="fts").with_row_id(True).to_list()
fts_scores = {row["_rowid"]: row["_score"] for row in fts_results}
found_match = False
for rowid, distance in hybrid_distances.items():
if rowid in vector_distances:
found_match = True
assert distance == vector_distances[rowid], "Distance mismatch"
assert found_match, "No results matched between hybrid and vector search"
found_match = False
for rowid, score in hybrid_scores.items():
if rowid in fts_scores and fts_scores[rowid] is not None:
found_match = True
assert score == fts_scores[rowid], "Score mismatch"
assert found_match, "No results matched between hybrid and fts search"
@pytest.mark.skipif(
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
)
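In user code, the behaviour `test_rrf_reranker_distance` pins down looks like this: with `return_score="all"`, hybrid results keep the original per-query `_distance` and `_score` next to the fused `_relevance_score` (a sketch against a table that already has vector and FTS indices):

```python
from lancedb.rerankers import RRFReranker

results = (
    table.search(query_type="hybrid")
    .vector([0.0] * 32)
    .text("hello")
    .with_row_id(True)
    .rerank(RRFReranker(return_score="all"))
    .to_list()
)
for row in results:
    # _distance/_score are the raw values from the vector and FTS legs, not the
    # normalized copies used internally for fusion; either may be missing for
    # rows that only appeared in one of the two result sets.
    print(row["_rowid"], row.get("_distance"), row.get("_score"), row["_relevance_score"])
```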

View File

@@ -1008,6 +1008,10 @@ def test_create_scalar_index(mem_db: DBConnection):
results = table.search([5, 5]).where("x != 'b'").to_arrow()
assert results["_distance"][0].as_py() > 0
table.drop_index(scalar_index.name)
indices = table.list_indices()
assert len(indices) == 0
def test_empty_query(mem_db: DBConnection):
table = mem_db.create_table(
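The new sync surface the added assertions cover, as a hedged standalone sketch:

```python
import lancedb

db = lancedb.connect("memory://")
table = db.create_table("demo", [{"x": "a", "vector": [1.0, 2.0]}])
table.create_scalar_index("x")

index_name = table.list_indices()[0].name
table.drop_index(index_name)           # unregister the index from the table
assert len(table.list_indices()) == 0  # it no longer shows up
```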

View File

@@ -9,7 +9,10 @@ use arrow::{
};
use futures::stream::StreamExt;
use lancedb::arrow::SendableRecordBatchStream;
use pyo3::{pyclass, pymethods, Bound, PyAny, PyObject, PyRef, PyResult, Python};
use pyo3::{
exceptions::PyStopAsyncIteration, pyclass, pymethods, Bound, PyAny, PyObject, PyRef, PyResult,
Python,
};
use pyo3_async_runtimes::tokio::future_into_py;
use crate::error::PythonErrorExt;
@@ -32,20 +35,25 @@ impl RecordBatchStream {
#[pymethods]
impl RecordBatchStream {
#[getter]
pub fn schema(&self, py: Python) -> PyResult<PyObject> {
(*self.schema).clone().into_pyarrow(py)
}
pub fn next(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
pub fn __aiter__(self_: PyRef<'_, Self>) -> PyRef<'_, Self> {
self_
}
pub fn __anext__(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner.clone();
future_into_py(self_.py(), async move {
let inner_next = inner.lock().await.next().await;
inner_next
.map(|item| {
let item = item.infer_error()?;
Python::with_gil(|py| item.to_pyarrow(py))
})
.transpose()
let inner_next = inner
.lock()
.await
.next()
.await
.ok_or_else(|| PyStopAsyncIteration::new_err(""))?;
Python::with_gil(|py| inner_next.infer_error()?.to_pyarrow(py))
})
}
}
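With `__aiter__`/`__anext__` in place, the stream is a native async iterator, so Python can consume it with a plain `async for`. A sketch, assuming the async query API hands back this stream from `to_batches()`:

```python
async def count_rows(table) -> int:
    total = 0
    async for batch in await table.query().to_batches():
        total += batch.num_rows  # each item is converted to a pyarrow.RecordBatch
    return total
```

Iteration ends when the Rust side raises `PyStopAsyncIteration`, which `async for` treats as exhaustion.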

View File

@@ -21,7 +21,7 @@ use pyo3::{
types::{PyModule, PyModuleMethods},
wrap_pyfunction, Bound, PyResult, Python,
};
use query::{Query, VectorQuery};
use query::{FTSQuery, HybridQuery, Query, VectorQuery};
use table::Table;
pub mod arrow;
@@ -42,6 +42,8 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<Table>()?;
m.add_class::<IndexConfig>()?;
m.add_class::<Query>()?;
m.add_class::<FTSQuery>()?;
m.add_class::<HybridQuery>()?;
m.add_class::<VectorQuery>()?;
m.add_class::<RecordBatchStream>()?;
m.add_function(wrap_pyfunction!(connect, m)?)?;
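Once registered, the new builder classes are exposed on the extension module alongside the existing ones, so this import should now work:

```python
from lancedb._lancedb import FTSQuery, HybridQuery, Query, VectorQuery
```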

View File

@@ -194,6 +194,14 @@ impl Table {
})
}
pub fn drop_index(self_: PyRef<'_, Self>, index_name: String) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
inner.drop_index(&index_name).await.infer_error()?;
Ok(())
})
}
pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
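From the async Python API this surfaces as an awaitable, mirroring the sync sketch earlier; `config.name` below assumes the `IndexConfig` objects returned by `list_indices()` carry the index name:

```python
async def drop_all_indices(table):
    for config in await table.list_indices():
        await table.drop_index(config.name)  # resolves once the index is unregistered
```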

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.15.0-beta.0"
version = "0.15.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.15.0-beta.0"
version = "0.15.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -6,7 +6,7 @@ use crate::index::IndexStatistics;
use crate::query::Select;
use crate::table::AddDataMode;
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
use crate::{Error, Table};
use crate::{DistanceType, Error, Table};
use arrow_array::RecordBatchReader;
use arrow_ipc::reader::FileReader;
use arrow_schema::{DataType, SchemaRef};
@@ -592,7 +592,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
message: format!("Column {} not found in schema", column),
})?;
if supported_vector_data_type(field.data_type()) {
("IVF_PQ", None)
("IVF_PQ", Some(DistanceType::L2))
} else if supported_btree_data_type(field.data_type()) {
("BTREE", None)
} else {
@@ -816,6 +816,14 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
Ok(Some(stats))
}
/// Not yet supported on LanceDB Cloud.
async fn drop_index(&self, _name: &str) -> Result<()> {
Err(Error::NotSupported {
message: "Drop index is not yet supported on LanceDB Cloud.".into(),
})
}
async fn table_definition(&self) -> Result<TableDefinition> {
Err(Error::NotSupported {
message: "table_definition is not supported on LanceDB cloud.".into(),

View File

@@ -41,6 +41,7 @@ use lance::dataset::{
WriteParams,
};
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
use lance::index::vector::utils::infer_vector_dim;
use lance::io::WrappingObjectStore;
use lance_datafusion::exec::execute_plan;
use lance_index::vector::hnsw::builder::HnswBuildParams;
@@ -410,6 +411,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
async fn update(&self, update: UpdateBuilder) -> Result<u64>;
async fn create_index(&self, index: IndexBuilder) -> Result<()>;
async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
async fn drop_index(&self, name: &str) -> Result<()>;
async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>>;
async fn merge_insert(
&self,
@@ -984,6 +986,18 @@ impl Table {
self.inner.index_stats(index_name.as_ref()).await
}
/// Drop an index from the table.
///
/// Note: This is not yet available in LanceDB Cloud.
///
/// This does not delete the index files from disk; it only removes the
/// index from the table's metadata. To delete the index files, run
/// [`Self::optimize()`] after dropping the index.
///
/// Use [`Self::list_indices()`] to find the names of the indices.
pub async fn drop_index(&self, name: &str) -> Result<()> {
self.inner.drop_index(name).await
}
// Take many execution plans and map them into a single plan that adds
// a query_index column and unions them.
pub(crate) fn multi_vector_plan(
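As the doc comment notes, dropping only unregisters the index; the files are reclaimed separately. A sketch of the full cleanup from Python, assuming the sync `optimize()` entry point and a hypothetical index name:

```python
table.drop_index("vector_idx")  # hypothetical name; see list_indices()
table.optimize()                # reclaims the files the dropped index left behind
```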
@@ -1370,14 +1384,8 @@ impl NativeTable {
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
n
} else {
match field.data_type() {
arrow_schema::DataType::FixedSizeList(_, n) => {
Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
}
_ => Err(Error::Schema {
message: format!("Column '{}' is not a FixedSizeList", field.name()),
}),
}?
let dim = infer_vector_dim(field.data_type())?;
suggested_num_sub_vectors(dim as u32)
};
let mut dataset = self.dataset.get_mut().await?;
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
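The practical effect of switching to `infer_vector_dim`: when `num_sub_vectors` is left unset, the default can now be derived for any column whose dimension Lance can infer (including multivector columns), not only plain `FixedSizeList`. A sketch, assuming the Python builder leaves `num_sub_vectors` unset by default:

```python
table.create_index(num_partitions=1)  # num_sub_vectors derived from the vector dimension
```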
@@ -1734,6 +1742,12 @@ impl NativeTable {
}
/// Update field metadata
///
/// # Arguments
/// * `new_values` - An iterator of tuples where the first element is the
///   field id from the Lance schema (not necessarily the field's positional
///   index in the Arrow schema) and the second element is a hashmap of
///   metadata key-value pairs.
///
pub async fn replace_field_metadata(
&self,
new_values: impl IntoIterator<Item = (u32, HashMap<String, String>)>,
@@ -1877,6 +1891,12 @@ impl TableInternal for NativeTable {
}
}
async fn drop_index(&self, index_name: &str) -> Result<()> {
let mut dataset = self.dataset.get_mut().await?;
dataset.drop_index(index_name).await?;
Ok(())
}
async fn update(&self, update: UpdateBuilder) -> Result<u64> {
let dataset = self.dataset.get().await?.clone();
let mut builder = LanceUpdateBuilder::new(Arc::new(dataset));
@@ -2903,6 +2923,9 @@ mod tests {
assert_eq!(stats.num_unindexed_rows, 0);
assert_eq!(stats.index_type, crate::index::IndexType::IvfPq);
assert_eq!(stats.distance_type, Some(crate::DistanceType::L2));
table.drop_index(index_name).await.unwrap();
assert_eq!(table.list_indices().await.unwrap().len(), 0);
}
#[tokio::test]
@@ -3513,11 +3536,10 @@ mod tests {
.unwrap();
let native_tbl = table.as_native().unwrap();
let schema = native_tbl.schema().await.unwrap();
let schema = native_tbl.manifest().await.unwrap().schema;
let (field_idx, field) = schema.column_with_name("i").unwrap();
let field_metadata = field.metadata();
assert_eq!(field_metadata.len(), 0);
let field = schema.field("i").unwrap();
assert_eq!(field.metadata.len(), 0);
native_tbl
.replace_schema_metadata(vec![(
@@ -3538,16 +3560,15 @@ mod tests {
let mut new_field_metadata = HashMap::<String, String>::new();
new_field_metadata.insert("test_field_key1".into(), "test_field_val1".into());
native_tbl
.replace_field_metadata(vec![(field_idx as u32, new_field_metadata)])
.replace_field_metadata(vec![(field.id as u32, new_field_metadata)])
.await
.unwrap();
let schema = native_tbl.schema().await.unwrap();
let (_field_idx, field) = schema.column_with_name("i").unwrap();
let field_metadata = field.metadata();
assert_eq!(field_metadata.len(), 1);
let schema = native_tbl.manifest().await.unwrap().schema;
let field = schema.field("i").unwrap();
assert_eq!(field.metadata.len(), 1);
assert_eq!(
field_metadata.get("test_field_key1"),
field.metadata.get("test_field_key1"),
Some(&"test_field_val1".to_string())
);
}

View File

@@ -17,6 +17,7 @@ use std::sync::Arc;
use arrow_schema::{DataType, Schema};
use lance::arrow::json::JsonDataType;
use lance::dataset::{ReadParams, WriteParams};
use lance::index::vector::utils::infer_vector_dim;
use lance::io::{ObjectStoreParams, WrappingObjectStore};
use lazy_static::lazy_static;
@@ -104,12 +105,12 @@ pub fn validate_table_name(name: &str) -> Result<()> {
/// Find the default column to use when creating an index or performing a vector query.
pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result<String> {
// Try to find one fixed size list array column.
// Try to find a vector column.
let candidates = schema
.fields()
.iter()
.filter_map(|field| match inf_vector_dim(field) {
Some(d) if dim.is_none() || dim == Some(d) => Some(field.name()),
.filter_map(|field| match infer_vector_dim(field.data_type()) {
Ok(d) if dim.is_none() || dim == Some(d as i32) => Some(field.name()),
_ => None,
})
.collect::<Vec<_>>();
@@ -133,20 +134,6 @@ pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result
}
}
fn inf_vector_dim(field: &arrow_schema::Field) -> Option<i32> {
match field.data_type() {
arrow_schema::DataType::FixedSizeList(f, d) => {
if f.data_type().is_floating() || f.data_type() == &DataType::UInt8 {
Some(*d)
} else {
None
}
}
arrow_schema::DataType::List(f) => inf_vector_dim(f),
_ => None,
}
}
pub fn supported_btree_data_type(dtype: &DataType) -> bool {
dtype.is_integer()
|| dtype.is_floating()
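`default_vector_column` is what lets queries omit the column name when the schema has exactly one recognizable vector column:

```python
import lancedb

db = lancedb.connect("memory://")
table = db.create_table("t", [{"text": "hi", "vector": [0.0] * 32}])

# No vector_column_name given: the lone 32-d vector column is picked
# automatically; with multiple candidates it would have to be explicit.
hits = table.search([0.1] * 32).limit(1).to_list()
```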