Mirror of https://github.com/lancedb/lancedb.git (synced 2026-01-04 02:42:57 +00:00)

Compare commits: rpgreen/no ... v0.20.0 (1 commit)

| Author | SHA1 | Date |
| --- | --- | --- |
|  | 87c95a8c94 |  |
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.21.2"
+current_version = "0.20.0"
 parse = """(?x)
 (?P<major>0|[1-9]\\d*)\\.
 (?P<minor>0|[1-9]\\d*)\\.
.github/workflows/cargo-publish.yml (vendored, 10 changed lines)
@@ -5,8 +5,8 @@ on:
     tags-ignore:
       # We don't publish pre-releases for Rust. Crates.io is just a source
      # distribution, so we don't need to publish pre-releases.
-      - "v*-beta*"
-      - "*-v*" # for example, python-vX.Y.Z
+      - 'v*-beta*'
+      - '*-v*' # for example, python-vX.Y.Z
 
 env:
   # This env var is used by Swatinem/rust-cache@v2 for the cache
@@ -19,8 +19,6 @@ env:
 jobs:
   build:
     runs-on: ubuntu-22.04
-    permissions:
-      id-token: write
     timeout-minutes: 30
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
@@ -33,8 +31,6 @@ jobs:
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
-      - uses: rust-lang/crates-io-auth-action@v1
-        id: auth
      - name: Publish the package
        run: |
-          cargo publish -p lancedb --all-features --token ${{ steps.auth.outputs.token }}
+          cargo publish -p lancedb --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }}
.github/workflows/npm-publish.yml (vendored, 12 changed lines)
@@ -539,20 +539,10 @@ jobs:
        # We need to deprecate the old package to avoid confusion.
        # Each time we publish a new version, it gets undeprecated.
        run: npm deprecate vectordb "Use @lancedb/lancedb instead."
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: main
      - name: Update package-lock.json
-        run: |
-          git config user.name 'Lance Release'
-          git config user.email 'lance-dev@lancedb.com'
-          bash ci/update_lockfiles.sh
+        run: bash ci/update_lockfiles.sh
      - name: Push new commit
        uses: ad-m/github-push-action@master
-        with:
-          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
-          branch: main
      - name: Notify Slack Action
        uses: ravsamhq/notify-slack-action@2.3.0
        if: ${{ always() }}
CLAUDE.md (24 changed lines)
@@ -1,24 +0,0 @@
-LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
-It is a wrapper around Lance. There are two backends: local (in-process like SQLite) and
-remote (against LanceDB Cloud).
-
-The core of LanceDB is written in Rust. There are bindings in Python, Typescript, and Java.
-
-Project layout:
-
-* `rust/lancedb`: The LanceDB core Rust implementation.
-* `python`: The Python bindings, using PyO3.
-* `nodejs`: The Typescript bindings, using napi-rs
-* `java`: The Java bindings
-
-(`rust/ffi` and `node/` are for a deprecated package. You can ignore them.)
-
-Common commands:
-
-* Check for compiler errors: `cargo check --features remote --tests --examples`
-* Run tests: `cargo test --features remote --tests`
-* Run specific test: `cargo test --features remote -p <package_name> --test <test_name>`
-* Lint: `cargo clippy --features remote --tests --examples`
-* Format: `cargo fmt --all`
-
-Before committing changes, run formatting.
Cargo.lock (generated, 1108 changed lines): file diff suppressed because it is too large.
Cargo.toml (34 changed lines)
@@ -21,16 +21,14 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"
 
 [workspace.dependencies]
-lance = { "version" = "=0.32.1", "features" = [
-    "dynamodb",
-], "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-io = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-index = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-linalg = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-table = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-testing = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-datafusion = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-encoding = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }
+lance-io = "=0.29.0"
+lance-index = "=0.29.0"
+lance-linalg = "=0.29.0"
+lance-table = "=0.29.0"
+lance-testing = "=0.29.0"
+lance-datafusion = "=0.29.0"
+lance-encoding = "=0.29.0"
 # Note that this one does not include pyarrow
 arrow = { version = "55.1", optional = false }
 arrow-array = "55.1"
@@ -41,20 +39,20 @@ arrow-schema = "55.1"
 arrow-arith = "55.1"
 arrow-cast = "55.1"
 async-trait = "0"
-datafusion = { version = "48.0", default-features = false }
-datafusion-catalog = "48.0"
-datafusion-common = { version = "48.0", default-features = false }
-datafusion-execution = "48.0"
-datafusion-expr = "48.0"
-datafusion-physical-plan = "48.0"
+datafusion = { version = "47.0", default-features = false }
+datafusion-catalog = "47.0"
+datafusion-common = { version = "47.0", default-features = false }
+datafusion-execution = "47.0"
+datafusion-expr = "47.0"
+datafusion-physical-plan = "47.0"
 env_logger = "0.11"
-half = { "version" = "2.6.0", default-features = false, features = [
+half = { "version" = "=2.5.0", default-features = false, features = [
     "num-traits",
 ] }
 futures = "0"
 log = "0.4"
 moka = { version = "0.12", features = ["future"] }
-object_store = "0.12.0"
+object_store = "0.11.0"
 pin-project = "1.0.7"
 snafu = "0.8"
 url = "2"
@@ -47,10 +47,10 @@ def extract_features(line: str) -> list:
     """
     import re
 
-    match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
+    match = re.search(r'"features"\s*=\s*\[(.*?)\]', line)
     if match:
         features_str = match.group(1)
-        return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
+        return [f.strip('"') for f in features_str.split(",")]
     return []
 
 
@@ -63,24 +63,10 @@ def update_cargo_toml(line_updater):
        lines = f.readlines()
 
    new_lines = []
-    lance_line = ""
-    is_parsing_lance_line = False
    for line in lines:
        if line.startswith("lance"):
            # Update the line using the provided function
-            if line.strip().endswith("}"):
-                new_lines.append(line_updater(line))
-            else:
-                lance_line = line
-                is_parsing_lance_line = True
-        elif is_parsing_lance_line:
-            lance_line += line
-            if line.strip().endswith("}"):
-                new_lines.append(line_updater(lance_line))
-                lance_line = ""
-                is_parsing_lance_line = False
-            else:
-                print("doesn't end with }:", line)
+            new_lines.append(line_updater(line))
        else:
            # Keep the line unchanged
            new_lines.append(line)
docs/package-lock.json (generated, 12 changed lines)
@@ -19,7 +19,7 @@
    },
    "../node": {
      "name": "vectordb",
-      "version": "0.21.2-beta.0",
+      "version": "0.12.0",
      "cpu": [
        "x64",
        "arm64"
@@ -65,11 +65,11 @@
      "uuid": "^9.0.0"
    },
    "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
-      "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
-      "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
-      "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
+      "@lancedb/vectordb-darwin-arm64": "0.12.0",
+      "@lancedb/vectordb-darwin-x64": "0.12.0",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
+      "@lancedb/vectordb-linux-x64-gnu": "0.12.0",
+      "@lancedb/vectordb-win32-x64-msvc": "0.12.0"
    },
    "peerDependencies": {
      "@apache-arrow/ts": "^14.0.2",
@@ -1,9 +1,7 @@
-# SQL Querying
-
 You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
 This guide will show how to query Lance tables them using both.
 
-We will re-use the dataset [created previously](./tables.md):
+We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
 
 ```python
 import lancedb
@@ -29,17 +27,21 @@ arrow_table = table.to_lance()
 duckdb.query("SELECT * FROM arrow_table")
 ```
 
-| vector | item | price |
-| ----------- | ---- | ----- |
-| [3.1, 4.1] | foo | 10.0 |
-| [5.9, 26.5] | bar | 20.0 |
+```
+┌─────────────┬─────────┬────────┐
+│ vector │ item │ price │
+│ float[] │ varchar │ double │
+├─────────────┼─────────┼────────┤
+│ [3.1, 4.1] │ foo │ 10.0 │
+│ [5.9, 26.5] │ bar │ 20.0 │
+└─────────────┴─────────┴────────┘
+```
 
 ## Querying a LanceDB Table with Apache Datafusion
 
 Have the required imports before doing any querying.
 
 === "Python"
 
    ```python
    --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb"
    --8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context"
@@ -49,12 +51,16 @@ Have the required imports before doing any querying.
 Register the table created with the Datafusion session context.
 
 === "Python"
 
    ```python
    --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
    ```
 
-| vector | item | price |
-| ----------- | ---- | ----- |
-| [3.1, 4.1] | foo | 10.0 |
-| [5.9, 26.5] | bar | 20.0 |
+```
+┌─────────────┬─────────┬────────┐
+│ vector │ item │ price │
+│ float[] │ varchar │ double │
+├─────────────┼─────────┼────────┤
+│ [3.1, 4.1] │ foo │ 10.0 │
+│ [5.9, 26.5] │ bar │ 20.0 │
+└─────────────┴─────────┴────────┘
+```
@@ -1,53 +0,0 @@
-[**@lancedb/lancedb**](../README.md) • **Docs**
-
-***
-
-[@lancedb/lancedb](../globals.md) / BooleanQuery
-
-# Class: BooleanQuery
-
-Represents a full-text query interface.
-This interface defines the structure and behavior for full-text queries,
-including methods to retrieve the query type and convert the query to a dictionary format.
-
-## Implements
-
-- [`FullTextQuery`](../interfaces/FullTextQuery.md)
-
-## Constructors
-
-### new BooleanQuery()
-
-```ts
-new BooleanQuery(queries): BooleanQuery
-```
-
-Creates an instance of BooleanQuery.
-
-#### Parameters
-
-* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][]
-An array of (Occur, FullTextQuery objects) to combine.
-Occur specifies whether the query must match, or should match.
-
-#### Returns
-
-[`BooleanQuery`](BooleanQuery.md)
-
-## Methods
-
-### queryType()
-
-```ts
-queryType(): FullTextQueryType
-```
-
-The type of the full-text query.
-
-#### Returns
-
-[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
-
-#### Implementation of
-
-[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
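For readability, here is a minimal TypeScript sketch of how the removed `BooleanQuery` page fits together, using only the signatures shown in this diff (`new BooleanQuery(queries)` over `[Occur, FullTextQuery]` pairs, plus the `MatchQuery` class documented below). The database path, table name, and column name are hypothetical.

```ts
import { connect, BooleanQuery, MatchQuery, Occur } from "@lancedb/lancedb";

// Hypothetical local database and a table with an FTS-indexed "text" column.
const db = await connect("/tmp/lancedb-example");
const tbl = await db.openTable("docs");

// "database" must match; "deprecated" must not.
const query = new BooleanQuery([
  [Occur.Must, new MatchQuery("database", "text")],
  [Occur.MustNot, new MatchQuery("deprecated", "text")],
]);

// Per the Table.search() parameters elsewhere in this diff, a FullTextQuery is accepted directly.
const results = await tbl.search(query).limit(10).toArray();
console.log(results);
```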
@@ -40,8 +40,6 @@ Creates an instance of MatchQuery.
   - `boost`: The boost factor for the query (default is 1.0).
   - `fuzziness`: The fuzziness level for the query (default is 0).
   - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
-  - `operator`: The logical operator to use for combining terms in the query (default is "OR").
-  - `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
 
 * **options.boost?**: `number`
 
@@ -49,10 +47,6 @@ Creates an instance of MatchQuery.
 
 * **options.maxExpansions?**: `number`
 
-* **options.operator?**: [`Operator`](../enumerations/Operator.md)
-
-* **options.prefixLength?**: `number`
-
 #### Returns
 
 [`MatchQuery`](MatchQuery.md)
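The options removed above (`operator`, `prefixLength`) combine with the ones that remain (`boost`, `fuzziness`, `maxExpansions`). A short illustrative sketch, assuming the constructor takes the query text, the column, and an options object as the Parameters list implies; the table and column names are hypothetical.

```ts
import { connect, MatchQuery, Operator } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb-example");
const tbl = await db.openTable("docs");

// Fuzzy match on "text": allow one edit per term, keep the first two
// characters fixed, and require every term to match (AND).
const query = new MatchQuery("vector databse", "text", {
  fuzziness: 1,
  prefixLength: 2,
  maxExpansions: 50,
  operator: Operator.And,
});

const hits = await tbl.search(query).limit(5).toArray();
```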
@@ -38,12 +38,9 @@ Creates an instance of MultiMatchQuery.
 * **options?**
 Optional parameters for the multi-match query.
   - `boosts`: An array of boost factors for each column (default is 1.0 for all).
-  - `operator`: The logical operator to use for combining terms in the query (default is "OR").
 
 * **options.boosts?**: `number`[]
 
-* **options.operator?**: [`Operator`](../enumerations/Operator.md)
-
 #### Returns
 
 [`MultiMatchQuery`](MultiMatchQuery.md)
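A companion sketch of a multi-column match using the `boosts` option documented above. The constructor shape (query text plus a list of columns) is an assumption, not shown in this diff, and the table and column names are hypothetical.

```ts
import { connect, MultiMatchQuery } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb-example");
const tbl = await db.openTable("docs");

// Search "title" and "body", weighting title matches twice as heavily.
const query = new MultiMatchQuery("lance format", ["title", "body"], {
  boosts: [2.0, 1.0],
});

const hits = await tbl.search(query).limit(5).toArray();
```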
@@ -19,10 +19,7 @@ including methods to retrieve the query type and convert the query to a dictiona
 ### new PhraseQuery()
 
 ```ts
-new PhraseQuery(
-  query,
-  column,
-  options?): PhraseQuery
+new PhraseQuery(query, column): PhraseQuery
 ```
 
 Creates an instance of `PhraseQuery`.
@@ -35,12 +32,6 @@ Creates an instance of `PhraseQuery`.
 * **column**: `string`
 The name of the column to search within.
 
-* **options?**
-Optional parameters for the phrase query.
-  - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
-
-* **options.slop?**: `number`
-
 #### Returns
 
 [`PhraseQuery`](PhraseQuery.md)
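A brief sketch of the `slop` option being removed here, assuming the three-argument constructor shown on the removed side of the hunk; names and values are hypothetical.

```ts
import { connect, PhraseQuery } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb-example");
const tbl = await db.openTable("docs");

// Match the words in order, allowing at most one intervening token.
const query = new PhraseQuery("vector search engine", "text", { slop: 1 });

const hits = await tbl.search(query).limit(5).toArray();
```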
@@ -1,84 +0,0 @@
-[**@lancedb/lancedb**](../README.md) • **Docs**
-
-***
-
-[@lancedb/lancedb](../globals.md) / Session
-
-# Class: Session
-
-A session for managing caches and object stores across LanceDB operations.
-
-Sessions allow you to configure cache sizes for index and metadata caches,
-which can significantly impact performance for large datasets.
-
-## Constructors
-
-### new Session()
-
-```ts
-new Session(indexCacheSizeBytes?, metadataCacheSizeBytes?): Session
-```
-
-Create a new session with custom cache sizes.
-
-# Parameters
-
-- `index_cache_size_bytes`: The size of the index cache in bytes.
-Defaults to 6GB if not specified.
-- `metadata_cache_size_bytes`: The size of the metadata cache in bytes.
-Defaults to 1GB if not specified.
-
-#### Parameters
-
-* **indexCacheSizeBytes?**: `null` \| `bigint`
-
-* **metadataCacheSizeBytes?**: `null` \| `bigint`
-
-#### Returns
-
-[`Session`](Session.md)
-
-## Methods
-
-### approxNumItems()
-
-```ts
-approxNumItems(): number
-```
-
-Get the approximate number of items cached in the session.
-
-#### Returns
-
-`number`
-
-***
-
-### sizeBytes()
-
-```ts
-sizeBytes(): bigint
-```
-
-Get the current size of the session caches in bytes.
-
-#### Returns
-
-`bigint`
-
-***
-
-### default()
-
-```ts
-static default(): Session
-```
-
-Create a session with default cache sizes.
-
-This is equivalent to creating a session with 6GB index cache
-and 1GB metadata cache.
-
-#### Returns
-
-[`Session`](Session.md)
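Since the whole `Session` page is removed in this diff, a compact sketch of the workflow it described may help: build a session with explicit cache budgets, pass it to `connect()` (the object form comes from the `connect()` example removed later in this diff), then inspect cache usage. The path and sizes are illustrative.

```ts
import { connect, Session } from "@lancedb/lancedb";

// 2 GiB index cache and 512 MiB metadata cache (defaults are 6 GB and 1 GB).
const session = new Session(2n * 1024n * 1024n * 1024n, 512n * 1024n * 1024n);

const db = await connect({ uri: "/path/to/database", session });

// ... run queries through `db` ...

// Both accessors are documented above as synchronous.
console.log(`cached items: ${session.approxNumItems()}`);
console.log(`cache size in bytes: ${session.sizeBytes()}`);
```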
@@ -612,7 +612,7 @@ of the given query
 
 #### Parameters
 
-* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
+* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
 the query, a vector or string
 
 * **queryType?**: `string`
@@ -799,7 +799,7 @@ by `query`.
 
 #### Parameters
 
-* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
+* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
 
 #### Returns
 
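Both hunks above drop `MultiVector` from the accepted query types. For reference, `MultiVector` is defined elsewhere in this diff as `IntoVector[]`, so a multivector search is simply a nearest-neighbour query given several vectors at once. A sketch with hypothetical data, assuming the table's vector column actually stores multiple vectors per row:

```ts
import { connect } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb-example");
const tbl = await db.openTable("docs");

// MultiVector is just IntoVector[]: several query vectors submitted together.
const queryVectors = [
  [0.12, 0.87, 0.33, 0.91],
  [0.05, 0.44, 0.78, 0.1],
];

const hits = await tbl
  .query()
  .nearestTo(queryVectors)
  .limit(10)
  .toArray();
```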
@@ -386,53 +386,6 @@ called then every valid row from the table will be returned.
 
 ***
 
-### maximumNprobes()
-
-```ts
-maximumNprobes(maximumNprobes): VectorQuery
-```
-
-Set the maximum number of probes used.
-
-This controls the maximum number of partitions that will be searched. If this
-number is greater than minimumNprobes then the excess partitions will _only_ be
-searched if we have not found enough results. This can be useful when there is
-a narrow filter to allow these queries to spend more time searching and avoid
-potential false negatives.
-
-#### Parameters
-
-* **maximumNprobes**: `number`
-
-#### Returns
-
-[`VectorQuery`](VectorQuery.md)
-
-***
-
-### minimumNprobes()
-
-```ts
-minimumNprobes(minimumNprobes): VectorQuery
-```
-
-Set the minimum number of probes used.
-
-This controls the minimum number of partitions that will be searched. This
-parameter will impact every query against a vector index, regardless of the
-filter. See `nprobes` for more details. Higher values will increase recall
-but will also increase latency.
-
-#### Parameters
-
-* **minimumNprobes**: `number`
-
-#### Returns
-
-[`VectorQuery`](VectorQuery.md)
-
-***
-
 ### nprobes()
 
 ```ts
@@ -460,10 +413,6 @@ For best results we recommend tuning this parameter with a benchmark against
 your actual data to find the smallest possible value that will still give
 you the desired recall.
 
-For more fine grained control over behavior when you have a very narrow filter
-you can use `minimumNprobes` and `maximumNprobes`. This method sets both
-the minimum and maximum to the same value.
-
 #### Parameters
 
 * **nprobes**: `number`
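The removed `minimumNprobes()`/`maximumNprobes()` text above describes a floor and a ceiling on how many IVF partitions are probed. A sketch of the intended usage with a narrow filter, using hypothetical names and vector values:

```ts
import { connect } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb-example");
const tbl = await db.openTable("docs");

// Always probe at least 20 partitions; widen up to 60 only if the
// filter leaves too few candidate results.
const hits = await tbl
  .query()
  .nearestTo([0.1, 0.2, 0.3, 0.4])
  .where("category = 'rare'")
  .minimumNprobes(20)
  .maximumNprobes(60)
  .limit(10)
  .toArray();
```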
@@ -15,14 +15,6 @@ Enum representing the types of full-text queries supported.
 
 ## Enumeration Members
 
-### Boolean
-
-```ts
-Boolean: "boolean";
-```
-
-***
-
 ### Boost
 
 ```ts
@@ -1,37 +0,0 @@
-[**@lancedb/lancedb**](../README.md) • **Docs**
-
-***
-
-[@lancedb/lancedb](../globals.md) / Occur
-
-# Enumeration: Occur
-
-Enum representing the occurrence of terms in full-text queries.
-
-- `Must`: The term must be present in the document.
-- `Should`: The term should contribute to the document score, but is not required.
-- `MustNot`: The term must not be present in the document.
-
-## Enumeration Members
-
-### Must
-
-```ts
-Must: "MUST";
-```
-
-***
-
-### MustNot
-
-```ts
-MustNot: "MUST_NOT";
-```
-
-***
-
-### Should
-
-```ts
-Should: "SHOULD";
-```
@@ -1,28 +0,0 @@
-[**@lancedb/lancedb**](../README.md) • **Docs**
-
-***
-
-[@lancedb/lancedb](../globals.md) / Operator
-
-# Enumeration: Operator
-
-Enum representing the logical operators used in full-text queries.
-
-- `And`: All terms must match.
-- `Or`: At least one term must match.
-
-## Enumeration Members
-
-### And
-
-```ts
-And: "AND";
-```
-
-***
-
-### Or
-
-```ts
-Or: "OR";
-```
@@ -6,13 +6,10 @@
 
 # Function: connect()
 
-## connect(uri, options, session)
+## connect(uri, options)
 
 ```ts
-function connect(
-  uri,
-  options?,
-  session?): Promise<Connection>
+function connect(uri, options?): Promise<Connection>
 ```
 
 Connect to a LanceDB instance at the given URI.
@@ -32,8 +29,6 @@ Accepted formats:
 * **options?**: `Partial`<[`ConnectionOptions`](../interfaces/ConnectionOptions.md)>
 The options to use when connecting to the database
 
-* **session?**: [`Session`](../classes/Session.md)
-
 ### Returns
 
 `Promise`<[`Connection`](../classes/Connection.md)>
@@ -82,7 +77,7 @@ Accepted formats:
 
 [ConnectionOptions](../interfaces/ConnectionOptions.md) for more details on the URI format.
 
-### Examples
+### Example
 
 ```ts
 const conn = await connect({
@@ -90,11 +85,3 @@ const conn = await connect({
   storageOptions: {timeout: "60s"}
 });
 ```
-
-```ts
-const session = Session.default();
-const conn = await connect({
-  uri: "/path/to/database",
-  session: session
-});
-```
@@ -12,12 +12,9 @@
 ## Enumerations
 
 - [FullTextQueryType](enumerations/FullTextQueryType.md)
-- [Occur](enumerations/Occur.md)
-- [Operator](enumerations/Operator.md)
 
 ## Classes
 
-- [BooleanQuery](classes/BooleanQuery.md)
 - [BoostQuery](classes/BoostQuery.md)
 - [Connection](classes/Connection.md)
 - [Index](classes/Index.md)
@@ -29,7 +26,6 @@
 - [Query](classes/Query.md)
 - [QueryBase](classes/QueryBase.md)
 - [RecordBatchIterator](classes/RecordBatchIterator.md)
-- [Session](classes/Session.md)
 - [Table](classes/Table.md)
 - [TagContents](classes/TagContents.md)
 - [Tags](classes/Tags.md)
@@ -85,7 +81,6 @@
 - [FieldLike](type-aliases/FieldLike.md)
 - [IntoSql](type-aliases/IntoSql.md)
 - [IntoVector](type-aliases/IntoVector.md)
-- [MultiVector](type-aliases/MultiVector.md)
 - [RecordBatchLike](type-aliases/RecordBatchLike.md)
 - [SchemaLike](type-aliases/SchemaLike.md)
 - [TableLike](type-aliases/TableLike.md)
@@ -70,17 +70,6 @@ Defaults to 'us-east-1'.
 
 ***
 
-### session?
-
-```ts
-optional session: Session;
-```
-
-(For LanceDB OSS only): the session to use for this connection. Holds
-shared caches and other session-specific state.
-
-***
-
 ### storageOptions?
 
 ```ts
@@ -23,7 +23,7 @@ whether to remove punctuation
 ### baseTokenizer?
 
 ```ts
-optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
+optional baseTokenizer: "raw" | "simple" | "whitespace";
 ```
 
 The tokenizer to use when building the index.
@@ -71,36 +71,6 @@ tokens longer than this length will be ignored
 
 ***
 
-### ngramMaxLength?
-
-```ts
-optional ngramMaxLength: number;
-```
-
-ngram max length
-
-***
-
-### ngramMinLength?
-
-```ts
-optional ngramMinLength: number;
-```
-
-ngram min length
-
-***
-
-### prefixOnly?
-
-```ts
-optional prefixOnly: boolean;
-```
-
-whether to only index the prefix of the token for ngram tokenizer
-
-***
-
 ### removeStopWords?
 
 ```ts
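The ngram-related options removed above belong to the FTS index configuration. A sketch of how they would be supplied when building a full-text index, assuming the `Index.fts()`/`createIndex()` call shape of the TypeScript bindings (not shown in this diff); the table, column, and lengths are illustrative.

```ts
import { connect, Index } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb-example");
const tbl = await db.openTable("docs");

// Full-text index over "text" using the ngram base tokenizer.
await tbl.createIndex("text", {
  config: Index.fts({
    baseTokenizer: "ngram",
    ngramMinLength: 3, // shortest ngram to index
    ngramMaxLength: 5, // longest ngram to index
    prefixOnly: false, // index all ngrams, not only token prefixes
  }),
});
```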
@@ -8,7 +8,7 @@
 
 ## Properties
 
-### ~~indexCacheSize?~~
+### indexCacheSize?
 
 ```ts
 optional indexCacheSize: number;
@@ -16,11 +16,6 @@ optional indexCacheSize: number;
 
 Set the size of the index cache, specified as a number of entries
 
-#### Deprecated
-
-Use session-level cache configuration instead.
-Create a Session with custom cache sizes and pass it to the connect() function.
-
 The exact meaning of an "entry" will depend on the type of index:
 - IVF: there is one entry for each IVF partition
 - BTREE: there is one entry for the entire index
@@ -24,10 +24,10 @@ The default is 7 days
 // Delete all versions older than 1 day
 const olderThan = new Date();
 olderThan.setDate(olderThan.getDate() - 1));
-tbl.optimize({cleanupOlderThan: olderThan});
+tbl.cleanupOlderVersions(olderThan);
 
 // Delete all versions except the current version
-tbl.optimize({cleanupOlderThan: new Date()});
+tbl.cleanupOlderVersions(new Date());
 ```
 
 ***
@@ -1,11 +0,0 @@
-[**@lancedb/lancedb**](../README.md) • **Docs**
-
-***
-
-[@lancedb/lancedb](../globals.md) / MultiVector
-
-# Type Alias: MultiVector
-
-```ts
-type MultiVector: IntoVector[];
-```
@@ -428,7 +428,7 @@
 "\n",
 "**Why?** \n",
 "Embedding the UFO dataset and ingesting it into LanceDB takes **~2 hours on a T4 GPU**. To save time: \n",
-"- **Use the pre-prepared table with index created** (provided below) to proceed directly to **Step 7**: search. \n",
+"- **Use the pre-prepared table with index created ** (provided below) to proceed directly to step7: search. \n",
 "- **Step 5a** contains the full ingestion code for reference (run it only if necessary). \n",
 "- **Step 6** contains the details on creating the index on the multivector column"
 ]
@@ -30,8 +30,7 @@ excluded_globs = [
     "../src/rag/advanced_techniques/*.md",
     "../src/guides/scalar_index.md",
     "../src/guides/storage.md",
-    "../src/search.md",
-    "../src/guides/sql_querying.md",
+    "../src/search.md"
 ]
 
 python_prefix = "py"
@@ -7,4 +7,3 @@ tantivy==0.20.1
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch
 polars>=0.19, <=1.3.0
-datafusion
java/.mvn/wrapper/maven-wrapper.properties (vendored, 19 changed lines)
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-wrapperVersion=3.3.2
-distributionType=only-script
-distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.9/apache-maven-3.9.9-bin.zip
@@ -1,37 +0,0 @@
-# LanceDB Java SDK
-
-## Configuration and Initialization
-
-### LanceDB Cloud
-
-For LanceDB Cloud, use the simplified builder API:
-
-```java
-import com.lancedb.lance.namespace.LanceRestNamespace;
-
-// If your DB url is db://example-db, then your database here is example-db
-LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
-    .apiKey("your_lancedb_cloud_api_key")
-    .database("your_database_name")
-    .build();
-```
-
-### LanceDB Enterprise
-
-For Enterprise deployments, use your VPC endpoint:
-
-```java
-LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
-    .apiKey("your_lancedb_enterprise_api_key")
-    .database("your-top-dir") // Your top level folder under your cloud bucket, e.g. s3://your-bucket/your-top-dir/
-    .hostOverride("http://<vpc_endpoint_dns_name>:80")
-    .build();
-```
-
-## Development
-
-Build:
-
-```shell
-./mvnw install
-```
@@ -19,7 +19,7 @@ lancedb = { path = "../../../rust/lancedb" }
 lance = { workspace = true }
 arrow = { workspace = true, features = ["ffi"] }
 arrow-schema.workspace = true
-tokio = "1.46"
+tokio = "1.23"
 jni = "0.21.1"
 snafu.workspace = true
 lazy_static.workspace = true
@@ -8,24 +8,18 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.21.2-final.0</version>
+    <version>0.20.0-final.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
   <artifactId>lancedb-core</artifactId>
-  <name>${project.artifactId}</name>
-  <description>LanceDB Core</description>
+  <name>LanceDB Core</name>
   <packaging>jar</packaging>
   <properties>
     <rust.release.build>false</rust.release.build>
   </properties>
 
   <dependencies>
-    <dependency>
-      <groupId>com.lancedb</groupId>
-      <artifactId>lance-namespace-core</artifactId>
-      <version>0.0.1</version>
-    </dependency>
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-vector</artifactId>
@@ -1,26 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>com.lancedb</groupId>
-    <artifactId>lancedb-parent</artifactId>
-    <version>0.21.2-final.0</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>lancedb-lance-namespace</artifactId>
-  <name>${project.artifactId}</name>
-  <description>LanceDB Java Integration with Lance Namespace</description>
-  <packaging>jar</packaging>
-
-  <dependencies>
-    <dependency>
-      <groupId>com.lancedb</groupId>
-      <artifactId>lance-namespace-core</artifactId>
-    </dependency>
-  </dependencies>
-</project>
@@ -1,146 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.lancedb.lancedb;
-
-import com.lancedb.lance.namespace.LanceRestNamespace;
-import com.lancedb.lance.namespace.client.apache.ApiClient;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Optional;
-
-/** Util class to help construct a {@link LanceRestNamespace} for LanceDB. */
-public class LanceDbRestNamespaces {
-  private static final String DEFAULT_REGION = "us-east-1";
-  private static final String CLOUD_URL_PATTERN = "https://%s.%s.api.lancedb.com";
-
-  private String apiKey;
-  private String database;
-  private Optional<String> hostOverride = Optional.empty();
-  private Optional<String> region = Optional.empty();
-  private Map<String, String> additionalConfig = new HashMap<>();
-
-  private LanceDbRestNamespaces() {}
-
-  /**
-   * Create a new builder instance.
-   *
-   * @return A new LanceRestNamespaceBuilder
-   */
-  public static LanceDbRestNamespaces builder() {
-    return new LanceDbRestNamespaces();
-  }
-
-  /**
-   * Set the API key (required).
-   *
-   * @param apiKey The LanceDB API key
-   * @return This builder
-   */
-  public LanceDbRestNamespaces apiKey(String apiKey) {
-    if (apiKey == null || apiKey.trim().isEmpty()) {
-      throw new IllegalArgumentException("API key cannot be null or empty");
-    }
-    this.apiKey = apiKey;
-    return this;
-  }
-
-  /**
-   * Set the database name (required).
-   *
-   * @param database The database name
-   * @return This builder
-   */
-  public LanceDbRestNamespaces database(String database) {
-    if (database == null || database.trim().isEmpty()) {
-      throw new IllegalArgumentException("Database cannot be null or empty");
-    }
-    this.database = database;
-    return this;
-  }
-
-  /**
-   * Set a custom host override (optional). When set, this overrides the default LanceDB Cloud URL
-   * construction. Use this for LanceDB Enterprise deployments.
-   *
-   * @param hostOverride The complete base URL (e.g., "http://your-vpc-endpoint:80")
-   * @return This builder
-   */
-  public LanceDbRestNamespaces hostOverride(String hostOverride) {
-    this.hostOverride = Optional.ofNullable(hostOverride);
-    return this;
-  }
-
-  /**
-   * Set the region for LanceDB Cloud (optional). Defaults to "us-east-1" if not specified. This is
-   * ignored when hostOverride is set.
-   *
-   * @param region The AWS region (e.g., "us-east-1", "eu-west-1")
-   * @return This builder
-   */
-  public LanceDbRestNamespaces region(String region) {
-    this.region = Optional.ofNullable(region);
-    return this;
-  }
-
-  /**
-   * Add additional configuration parameters.
-   *
-   * @param key The configuration key
-   * @param value The configuration value
-   * @return This builder
-   */
-  public LanceDbRestNamespaces config(String key, String value) {
-    this.additionalConfig.put(key, value);
-    return this;
-  }
-
-  /**
-   * Build the LanceRestNamespace instance.
-   *
-   * @return A configured LanceRestNamespace
-   * @throws IllegalStateException if required parameters are missing
-   */
-  public LanceRestNamespace build() {
-    // Validate required fields
-    if (apiKey == null) {
-      throw new IllegalStateException("API key is required");
-    }
-    if (database == null) {
-      throw new IllegalStateException("Database is required");
-    }
-
-    // Build configuration map
-    Map<String, String> config = new HashMap<>(additionalConfig);
-    config.put("headers.x-lancedb-database", database);
-    config.put("headers.x-api-key", apiKey);
-
-    // Determine base URL
-    String baseUrl;
-    if (hostOverride.isPresent()) {
-      baseUrl = hostOverride.get();
-      config.put("host_override", hostOverride.get());
-    } else {
-      String effectiveRegion = region.orElse(DEFAULT_REGION);
-      baseUrl = String.format(CLOUD_URL_PATTERN, database, effectiveRegion);
-      config.put("region", effectiveRegion);
-    }
-
-    // Create and configure ApiClient
-    ApiClient apiClient = new ApiClient();
-    apiClient.setBasePath(baseUrl);
-
-    return new LanceRestNamespace(apiClient, config);
-  }
-}
259
java/mvnw
vendored
259
java/mvnw
vendored
@@ -1,259 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
# or more contributor license agreements. See the NOTICE file
|
|
||||||
# distributed with this work for additional information
|
|
||||||
# regarding copyright ownership. The ASF licenses this file
|
|
||||||
# to you under the Apache License, Version 2.0 (the
|
|
||||||
# "License"); you may not use this file except in compliance
|
|
||||||
# with the License. You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
# Apache Maven Wrapper startup batch script, version 3.3.2
|
|
||||||
#
|
|
||||||
# Optional ENV vars
|
|
||||||
# -----------------
|
|
||||||
# JAVA_HOME - location of a JDK home dir, required when download maven via java source
|
|
||||||
# MVNW_REPOURL - repo url base for downloading maven distribution
|
|
||||||
# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
|
|
||||||
# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
set -euf
|
|
||||||
[ "${MVNW_VERBOSE-}" != debug ] || set -x
|
|
||||||
|
|
||||||
# OS specific support.
|
|
||||||
native_path() { printf %s\\n "$1"; }
|
|
||||||
case "$(uname)" in
|
|
||||||
CYGWIN* | MINGW*)
|
|
||||||
[ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")"
|
|
||||||
native_path() { cygpath --path --windows "$1"; }
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# set JAVACMD and JAVACCMD
|
|
||||||
set_java_home() {
|
|
||||||
# For Cygwin and MinGW, ensure paths are in Unix format before anything is touched
|
|
||||||
if [ -n "${JAVA_HOME-}" ]; then
|
|
||||||
if [ -x "$JAVA_HOME/jre/sh/java" ]; then
|
|
||||||
# IBM's JDK on AIX uses strange locations for the executables
|
|
||||||
JAVACMD="$JAVA_HOME/jre/sh/java"
|
|
||||||
JAVACCMD="$JAVA_HOME/jre/sh/javac"
|
|
||||||
else
|
|
||||||
JAVACMD="$JAVA_HOME/bin/java"
|
|
||||||
JAVACCMD="$JAVA_HOME/bin/javac"
|
|
||||||
|
|
||||||
if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then
|
|
||||||
echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2
|
|
||||||
echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
JAVACMD="$(
|
|
||||||
'set' +e
|
|
||||||
'unset' -f command 2>/dev/null
|
|
||||||
'command' -v java
|
|
||||||
)" || :
|
|
||||||
JAVACCMD="$(
|
|
||||||
'set' +e
|
|
||||||
'unset' -f command 2>/dev/null
|
|
||||||
'command' -v javac
|
|
||||||
)" || :
|
|
||||||
|
|
||||||
if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then
|
|
||||||
echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# hash string like Java String::hashCode
|
|
||||||
hash_string() {
|
|
||||||
str="${1:-}" h=0
|
|
||||||
while [ -n "$str" ]; do
|
|
||||||
char="${str%"${str#?}"}"
|
|
||||||
h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296))
|
|
||||||
str="${str#?}"
|
|
||||||
done
|
|
||||||
printf %x\\n $h
|
|
||||||
}
|
|
||||||
|
|
||||||
verbose() { :; }
|
|
||||||
[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; }
|
|
||||||
|
|
||||||
die() {
|
|
||||||
printf %s\\n "$1" >&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
trim() {
|
|
||||||
# MWRAPPER-139:
|
|
||||||
# Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds.
|
|
||||||
# Needed for removing poorly interpreted newline sequences when running in more
|
|
||||||
# exotic environments such as mingw bash on Windows.
|
|
||||||
printf "%s" "${1}" | tr -d '[:space:]'
|
|
||||||
}
|
|
||||||
|
|
||||||
# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties
|
|
||||||
while IFS="=" read -r key value; do
|
|
||||||
case "${key-}" in
|
|
||||||
distributionUrl) distributionUrl=$(trim "${value-}") ;;
|
|
||||||
distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;;
|
|
||||||
esac
|
|
||||||
done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties"
|
|
||||||
[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties"
|
|
||||||
|
|
||||||
case "${distributionUrl##*/}" in
|
|
||||||
maven-mvnd-*bin.*)
|
|
||||||
MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/
|
|
||||||
case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in
|
|
||||||
*AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;;
|
|
||||||
:Darwin*x86_64) distributionPlatform=darwin-amd64 ;;
|
|
||||||
:Darwin*arm64) distributionPlatform=darwin-aarch64 ;;
|
|
||||||
:Linux*x86_64*) distributionPlatform=linux-amd64 ;;
|
|
||||||
*)
|
|
||||||
echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2
|
|
||||||
distributionPlatform=linux-amd64
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip"
|
|
||||||
;;
|
|
||||||
maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;;
|
|
||||||
*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# apply MVNW_REPOURL and calculate MAVEN_HOME
# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}"
distributionUrlName="${distributionUrl##*/}"
distributionUrlNameMain="${distributionUrlName%.*}"
distributionUrlNameMain="${distributionUrlNameMain%-bin}"
MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}"
MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")"

exec_maven() {
  unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || :
  exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD"
}

if [ -d "$MAVEN_HOME" ]; then
  verbose "found existing MAVEN_HOME at $MAVEN_HOME"
  exec_maven "$@"
fi

case "${distributionUrl-}" in
*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;;
*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;;
esac

# prepare tmp dir
if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then
  clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; }
  trap clean HUP INT TERM EXIT
else
  die "cannot create temp dir"
fi

mkdir -p -- "${MAVEN_HOME%/*}"

# Download and Install Apache Maven
verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
verbose "Downloading from: $distributionUrl"
verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"

# select .zip or .tar.gz
if ! command -v unzip >/dev/null; then
  distributionUrl="${distributionUrl%.zip}.tar.gz"
  distributionUrlName="${distributionUrl##*/}"
fi

# verbose opt
__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR=''
[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v

# normalize http auth
case "${MVNW_PASSWORD:+has-password}" in
'') MVNW_USERNAME='' MVNW_PASSWORD='' ;;
has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;;
esac

if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then
  verbose "Found wget ... using wget"
  wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl"
elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then
  verbose "Found curl ... using curl"
  curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl"
elif set_java_home; then
  verbose "Falling back to use Java to download"
  javaSource="$TMP_DOWNLOAD_DIR/Downloader.java"
  targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName"
  cat >"$javaSource" <<-END
public class Downloader extends java.net.Authenticator
{
protected java.net.PasswordAuthentication getPasswordAuthentication()
{
return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() );
}
public static void main( String[] args ) throws Exception
{
setDefault( new Downloader() );
java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() );
}
}
END
  # For Cygwin/MinGW, switch paths to Windows format before running javac and java
  verbose " - Compiling Downloader.java ..."
  "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java"
  verbose " - Running Downloader.java ..."
  "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")"
fi

# If specified, validate the SHA-256 sum of the Maven distribution zip file
if [ -n "${distributionSha256Sum-}" ]; then
  distributionSha256Result=false
  if [ "$MVN_CMD" = mvnd.sh ]; then
    echo "Checksum validation is not supported for maven-mvnd." >&2
    echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
    exit 1
  elif command -v sha256sum >/dev/null; then
    if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then
      distributionSha256Result=true
    fi
  elif command -v shasum >/dev/null; then
    if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then
      distributionSha256Result=true
    fi
  else
    echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
    echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
    exit 1
  fi
  if [ $distributionSha256Result = false ]; then
    echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2
    echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2
    exit 1
  fi
fi

# unzip and move
if command -v unzip >/dev/null; then
  unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip"
else
  tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar"
fi
printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url"
mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME"

clean || :
exec_maven "$@"
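For context, the wrapper above is normally driven through the mvnw entry point; a minimal invocation sketch (assuming the script lives at java/mvnw next to java/pom.xml, and using the MVNW_VERBOSE switch the script reads) looks like:

  cd java
  MVNW_VERBOSE=true ./mvnw -v    # first run downloads the distribution into ~/.m2/wrapper/dists
  ./mvnw clean package           # later runs reuse the cached MAVEN_HOME and exec the real mvn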
java/pom.xml (14 changed lines)
@@ -6,10 +6,11 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.2-final.0</version>
<version>0.20.0-final.0</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
<name>LanceDB Parent</name>
<description>LanceDB vector database Java API</description>
<url>http://lancedb.com/</url>

<developers>
@@ -28,7 +29,6 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-namespace.verison>0.0.1</lance-namespace.verison>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
@@ -52,7 +52,6 @@

<modules>
<module>core</module>
<module>lance-namespace</module>
</modules>

<scm>
@@ -63,11 +62,6 @@

<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lance-namespace-core</artifactId>
<version>${lance-namespace.verison}</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
node/package-lock.json (generated, 49 changed lines)
@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.21.2",
"version": "0.20.0-beta.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.21.2",
"version": "0.20.0-beta.2",
"cpu": [
"x64",
"arm64"
@@ -52,11 +52,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.21.2",
"@lancedb/vectordb-darwin-arm64": "0.20.0-beta.2",
"@lancedb/vectordb-darwin-x64": "0.21.2",
"@lancedb/vectordb-darwin-x64": "0.20.0-beta.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.20.0-beta.2",
"@lancedb/vectordb-linux-x64-gnu": "0.21.2",
"@lancedb/vectordb-linux-x64-gnu": "0.20.0-beta.2",
"@lancedb/vectordb-win32-x64-msvc": "0.21.2"
"@lancedb/vectordb-win32-x64-msvc": "0.20.0-beta.2"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -327,65 +327,60 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.21.2",
"version": "0.20.0-beta.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2.tgz",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.20.0-beta.2.tgz",
"integrity": "sha512-NAQnIKLw9K33KMODNXBEW0qC8/safWzZtqbVC7j1GcE7PSk0Uc6x7w5nrH5gvleZggjaxY9jaRVTqmtg7PNmqw==",
"integrity": "sha512-H9PmJ/5KSvstVzR8Q7T22+eHRjJZ2ef3aA3gdFxXvoMi3xQ0MGIxz23HuKHGTRT4tfl1nNnpOPb2W7Na8etK9w==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.21.2",
"version": "0.20.0-beta.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2.tgz",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.20.0-beta.2.tgz",
"integrity": "sha512-PudbltlbRiXvBf/bkAaDPL8+RqcI4TG69u00rQHxwkhH7PgPYRTUjfzfaQfiDXZuLXuZHQq703RyoHOqzsHN0Q==",
"integrity": "sha512-9AQkv4tIys+vg0cplZtSE48o61jd7EnmuMkUht+vLORL5/HAma84eAoU9lXHT7zAtPAQmL+98Bfvcsx7fJ6mVw==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.21.2",
"version": "0.20.0-beta.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2.tgz",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.20.0-beta.2.tgz",
"integrity": "sha512-3lJ8lootlwLmhqabCdg0DKftv0Ujep6NTWAoLWK/6VQe2IgHmu/ZPRNQkOSZ5tnYlmRyDiMDMB2tlAzo45sV8Q==",
"integrity": "sha512-eQWoJz2ePml7NyEInTBeakWx56+5c6r2p3F+iHC5tsLuznn6eFX90koXJunRxH1WXHDN48ECUlEmKypgfEmn4w==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.21.2",
"version": "0.20.0-beta.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2.tgz",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.20.0-beta.2.tgz",
"integrity": "sha512-5I2drMOIyRODlAHPsipQBTrRRgcOZ45N5GsuhqcKnz3Tg8GAdc1MQKyK3BrdJzKHLPdRtIyRJ6QTLB3wZvDsQQ==",
"integrity": "sha512-/+84U+Dt07m8Jk0b8h+SvOzlrynITPP3SDBOlB+OonwmGSxirXhc8gkfNZctgXOJYKMyRIRSsMHP/QNjOp2ajA==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.21.2",
"version": "0.20.0-beta.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2.tgz",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.20.0-beta.2.tgz",
"integrity": "sha512-gjpFukq0NTQSRpWPNIpq4XFtaudjSNBT6DMsagC61D2nx9ZLEdSAdU0wdkeluQwhoMvNnXEPdP9HxDSFUXk+Ww==",
"integrity": "sha512-bgdunAPnknBh/5oO+vr6RXMr6wb3hHugNPXcIidxYMQvgFa8uhaAKtgYkAKuoyUReOYo8DGtVkZxNUUpZbF7/A==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"win32"
@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.21.2",
"version": "0.20.0",
"description": " Serverless, low-latency vector database for AI applications",
"private": false,
"main": "dist/index.js",
@@ -89,10 +89,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.21.2",
"@lancedb/vectordb-darwin-x64": "0.20.0",
"@lancedb/vectordb-darwin-arm64": "0.21.2",
"@lancedb/vectordb-darwin-arm64": "0.20.0",
"@lancedb/vectordb-linux-x64-gnu": "0.21.2",
"@lancedb/vectordb-linux-x64-gnu": "0.20.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.20.0",
"@lancedb/vectordb-win32-x64-msvc": "0.21.2"
"@lancedb/vectordb-win32-x64-msvc": "0.20.0"
}
}
@@ -49,7 +49,7 @@ describe('LanceDB Mirrored Store Integration test', function () {
it('s3://...?mirroredStore=... param is processed correctly', async function () {
this.timeout(600000)

const dir = await fs.promises.mkdtemp(path.join(tmpdir(), 'lancedb-mirror-'))
const dir = tmpdir()
console.log(dir)
const conn = await lancedb.connect({ uri: `s3://lancedb-integtest?mirroredStore=${dir}`, storageOptions: { allowHttp: 'true' } })
const data = Array(200).fill({ vector: Array(128).fill(1.0), id: 0 })
@@ -63,93 +63,118 @@ describe('LanceDB Mirrored Store Integration test', function () {
const t = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })

const mirroredPath = path.join(dir, `${tableName}.lance`)
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
if (err != null) throw err
// there should be three dirs
assert.equal(files.length, 3)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[1].isDirectory())

const files = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
// there should be three dirs
if (err != null) throw err
assert.equal(files.length, 3, 'files after table creation')
assert.equal(files.length, 1)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[0].name.endsWith('.txn'))
assert.isTrue(files[1].isDirectory())
})

const transactionFiles = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true }, (err, files) => {
assert.equal(transactionFiles.length, 1, 'transactionFiles after table creation')
if (err != null) throw err
assert.isTrue(transactionFiles[0].name.endsWith('.txn'))
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.manifest'))
})

const versionFiles = await fs.promises.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
assert.equal(versionFiles.length, 1, 'versionFiles after table creation')
if (err != null) throw err
assert.isTrue(versionFiles[0].name.endsWith('.manifest'))
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.lance'))
const dataFiles = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
})
assert.equal(dataFiles.length, 1, 'dataFiles after table creation')
})
assert.isTrue(dataFiles[0].name.endsWith('.lance'))

// try create index and check if it's mirrored
await t.createIndex({ column: 'vector', type: 'ivf_pq' })

const filesAfterIndex = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
// there should be four dirs
if (err != null) throw err
assert.equal(filesAfterIndex.length, 4, 'filesAfterIndex')
// there should be four dirs
assert.isTrue(filesAfterIndex[0].isDirectory())
assert.equal(files.length, 4)
assert.isTrue(filesAfterIndex[1].isDirectory())
assert.isTrue(files[0].isDirectory())
assert.isTrue(filesAfterIndex[2].isDirectory())
assert.isTrue(files[1].isDirectory())
assert.isTrue(files[2].isDirectory())

// Two TXs now
const transactionFilesAfterIndex = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
assert.equal(transactionFilesAfterIndex.length, 2, 'transactionFilesAfterIndex')
if (err != null) throw err
assert.isTrue(transactionFilesAfterIndex[0].name.endsWith('.txn'))
assert.equal(files.length, 2)
assert.isTrue(transactionFilesAfterIndex[1].name.endsWith('.txn'))
assert.isTrue(files[0].name.endsWith('.txn'))
assert.isTrue(files[1].name.endsWith('.txn'))
})

const dataFilesAfterIndex = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
assert.equal(dataFilesAfterIndex.length, 1, 'dataFilesAfterIndex')
if (err != null) throw err
assert.isTrue(dataFilesAfterIndex[0].name.endsWith('.lance'))
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.lance'))
})

const indicesFiles = await fs.promises.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
assert.equal(indicesFiles.length, 1, 'indicesFiles')
if (err != null) throw err
assert.isTrue(indicesFiles[0].isDirectory())
assert.equal(files.length, 1)
assert.isTrue(files[0].isDirectory())

const indexFiles = await fs.promises.readdir(path.join(mirroredPath, '_indices', indicesFiles[0].name), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
console.log(`DEBUG indexFiles in ${indicesFiles[0].name}:`, indexFiles.map(f => `${f.name} (${f.isFile() ? 'file' : 'dir'})`))
if (err != null) throw err
assert.equal(indexFiles.length, 2, 'indexFiles')
const fileNames = indexFiles.map(f => f.name).sort()
assert.equal(files.length, 1)
assert.isTrue(fileNames.includes('auxiliary.idx'), 'auxiliary.idx should be present')
assert.isTrue(files[0].isFile())
assert.isTrue(fileNames.includes('index.idx'), 'index.idx should be present')
assert.isTrue(files[0].name.endsWith('.idx'))
assert.isTrue(indexFiles.every(f => f.isFile()), 'all index files should be files')
})
})
})

// try delete and check if it's mirrored
await t.delete('id = 0')

const filesAfterDelete = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
// there should be five dirs
if (err != null) throw err
assert.equal(filesAfterDelete.length, 5, 'filesAfterDelete')
// there should be five dirs
assert.isTrue(filesAfterDelete[0].isDirectory())
assert.equal(files.length, 5)
assert.isTrue(filesAfterDelete[1].isDirectory())
assert.isTrue(files[0].isDirectory())
assert.isTrue(filesAfterDelete[2].isDirectory())
assert.isTrue(files[1].isDirectory())
assert.isTrue(filesAfterDelete[3].isDirectory())
assert.isTrue(files[2].isDirectory())
assert.isTrue(filesAfterDelete[4].isDirectory())
assert.isTrue(files[3].isDirectory())
assert.isTrue(files[4].isDirectory())

// Three TXs now
const transactionFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
assert.equal(transactionFilesAfterDelete.length, 3, 'transactionFilesAfterDelete')
if (err != null) throw err
assert.isTrue(transactionFilesAfterDelete[0].name.endsWith('.txn'))
assert.equal(files.length, 3)
assert.isTrue(transactionFilesAfterDelete[1].name.endsWith('.txn'))
assert.isTrue(files[0].name.endsWith('.txn'))
assert.isTrue(files[1].name.endsWith('.txn'))
})

const dataFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
assert.equal(dataFilesAfterDelete.length, 1, 'dataFilesAfterDelete')
if (err != null) throw err
assert.isTrue(dataFilesAfterDelete[0].name.endsWith('.lance'))
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.lance'))
})

const indicesFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
assert.equal(indicesFilesAfterDelete.length, 1, 'indicesFilesAfterDelete')
if (err != null) throw err
assert.isTrue(indicesFilesAfterDelete[0].isDirectory())
assert.equal(files.length, 1)
assert.isTrue(files[0].isDirectory())

const indexFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_indices', indicesFilesAfterDelete[0].name), { withFileTypes: true })
fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
console.log(`DEBUG indexFilesAfterDelete in ${indicesFilesAfterDelete[0].name}:`, indexFilesAfterDelete.map(f => `${f.name} (${f.isFile() ? 'file' : 'dir'})`))
if (err != null) throw err
assert.equal(indexFilesAfterDelete.length, 2, 'indexFilesAfterDelete')
const fileNamesAfterDelete = indexFilesAfterDelete.map(f => f.name).sort()
assert.isTrue(fileNamesAfterDelete.includes('auxiliary.idx'), 'auxiliary.idx should be present after delete')
assert.isTrue(fileNamesAfterDelete.includes('index.idx'), 'index.idx should be present after delete')
assert.isTrue(indexFilesAfterDelete.every(f => f.isFile()), 'all index files should be files after delete')

const deletionFiles = await fs.promises.readdir(path.join(mirroredPath, '_deletions'), { withFileTypes: true })
assert.equal(files.length, 1)
assert.equal(deletionFiles.length, 1, 'deletionFiles')
assert.isTrue(files[0].isFile())
assert.isTrue(deletionFiles[0].name.endsWith('.arrow'))
assert.isTrue(files[0].name.endsWith('.idx'))
})
})

fs.readdir(path.join(mirroredPath, '_deletions'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.arrow'))
})
})
})
})
@@ -1,13 +0,0 @@
These are the typescript bindings of LanceDB.
The core Rust library is in the `../rust/lancedb` directory, the rust binding
code is in the `src/` directory and the typescript bindings are in
the `lancedb/` directory.

Whenever you change the Rust code, you will need to recompile: `npm run build`.

Common commands:
* Build: `npm run build`
* Lint: `npm run lint`
* Fix lints: `npm run lint-fix`
* Test: `npm test`
* Run single test file: `npm test __test__/arrow.test.ts`
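Put together, the deleted doc amounts to this edit-build-test loop (a sketch, using only the commands it lists):

  npm run build                     # recompile after any Rust change
  npm run lint                      # check lints (npm run lint-fix to auto-fix)
  npm test __test__/arrow.test.ts   # run a single test file instead of the full suite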
@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.21.2"
version = "0.20.0"
license.workspace = true
description.workspace = true
repository.workspace = true
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import { Bool, Field, Int32, List, Schema, Struct, Utf8 } from "apache-arrow";
import { Schema } from "apache-arrow";

import * as arrow15 from "apache-arrow-15";
import * as arrow16 from "apache-arrow-16";
@@ -11,12 +11,10 @@ import * as arrow18 from "apache-arrow-18";
import {
convertToTable,
fromBufferToRecordBatch,
fromDataToBuffer,
fromRecordBatchToBuffer,
fromTableToBuffer,
makeArrowTable,
makeEmptyTable,
tableFromIPC,
} from "../lancedb/arrow";
import {
EmbeddingFunction,
@@ -377,221 +375,8 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
expect(table2.schema).toEqual(schema);
|
expect(table2.schema).toEqual(schema);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("will handle missing columns in schema alignment when using embeddings", async function () {
|
|
||||||
const schema = new Schema(
|
|
||||||
[
|
|
||||||
new Field("domain", new Utf8(), true),
|
|
||||||
new Field("name", new Utf8(), true),
|
|
||||||
new Field("description", new Utf8(), true),
|
|
||||||
],
|
|
||||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
|
||||||
);
|
|
||||||
|
|
||||||
const data = [
|
|
||||||
{ domain: "google.com", name: "Google" },
|
|
||||||
{ domain: "facebook.com", name: "Facebook" },
|
|
||||||
];
|
|
||||||
|
|
||||||
const table = await convertToTable(data, undefined, { schema });
|
|
||||||
|
|
||||||
expect(table.numCols).toBe(3);
|
|
||||||
expect(table.numRows).toBe(2);
|
|
||||||
|
|
||||||
const descriptionColumn = table.getChild("description");
|
|
||||||
expect(descriptionColumn).toBeDefined();
|
|
||||||
expect(descriptionColumn?.nullCount).toBe(2);
|
|
||||||
expect(descriptionColumn?.toArray()).toEqual([null, null]);
|
|
||||||
|
|
||||||
expect(table.getChild("domain")?.toArray()).toEqual([
|
|
||||||
"google.com",
|
|
||||||
"facebook.com",
|
|
||||||
]);
|
|
||||||
expect(table.getChild("name")?.toArray()).toEqual([
|
|
||||||
"Google",
|
|
||||||
"Facebook",
|
|
||||||
]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("will handle completely missing nested struct columns", async function () {
|
|
||||||
const schema = new Schema(
|
|
||||||
[
|
|
||||||
new Field("id", new Utf8(), true),
|
|
||||||
new Field("name", new Utf8(), true),
|
|
||||||
new Field(
|
|
||||||
"metadata",
|
|
||||||
new Struct([
|
|
||||||
new Field("version", new Int32(), true),
|
|
||||||
new Field("author", new Utf8(), true),
|
|
||||||
new Field(
|
|
||||||
"tags",
|
|
||||||
new List(new Field("item", new Utf8(), true)),
|
|
||||||
true,
|
|
||||||
),
|
|
||||||
]),
|
|
||||||
true,
|
|
||||||
),
|
|
||||||
],
|
|
||||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
|
||||||
);
|
|
||||||
|
|
||||||
const data = [
|
|
||||||
{ id: "doc1", name: "Document 1" },
|
|
||||||
{ id: "doc2", name: "Document 2" },
|
|
||||||
];
|
|
||||||
|
|
||||||
const table = await convertToTable(data, undefined, { schema });
|
|
||||||
|
|
||||||
expect(table.numCols).toBe(3);
|
|
||||||
expect(table.numRows).toBe(2);
|
|
||||||
|
|
||||||
const buf = await fromTableToBuffer(table);
|
|
||||||
const retrievedTable = tableFromIPC(buf);
|
|
||||||
|
|
||||||
const rows = [];
|
|
||||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
|
||||||
rows.push(retrievedTable.get(i));
|
|
||||||
}
|
|
||||||
|
|
||||||
expect(rows[0].metadata.version).toBe(null);
|
|
||||||
expect(rows[0].metadata.author).toBe(null);
|
|
||||||
expect(rows[0].metadata.tags).toBe(null);
|
|
||||||
expect(rows[0].id).toBe("doc1");
|
|
||||||
expect(rows[0].name).toBe("Document 1");
|
|
||||||
});
|
|
||||||
|
|
||||||
it("will handle partially missing nested struct fields", async function () {
|
|
||||||
const schema = new Schema(
|
|
||||||
[
|
|
||||||
new Field("id", new Utf8(), true),
|
|
||||||
new Field(
|
|
||||||
"metadata",
|
|
||||||
new Struct([
|
|
||||||
new Field("version", new Int32(), true),
|
|
||||||
new Field("author", new Utf8(), true),
|
|
||||||
new Field("created_at", new Utf8(), true),
|
|
||||||
]),
|
|
||||||
true,
|
|
||||||
),
|
|
||||||
],
|
|
||||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
|
||||||
);
|
|
||||||
|
|
||||||
const data = [
|
|
||||||
{ id: "doc1", metadata: { version: 1, author: "Alice" } },
|
|
||||||
{ id: "doc2", metadata: { version: 2 } },
|
|
||||||
];
|
|
||||||
|
|
||||||
const table = await convertToTable(data, undefined, { schema });
|
|
||||||
|
|
||||||
expect(table.numCols).toBe(2);
|
|
||||||
expect(table.numRows).toBe(2);
|
|
||||||
|
|
||||||
const metadataColumn = table.getChild("metadata");
|
|
||||||
expect(metadataColumn).toBeDefined();
|
|
||||||
expect(metadataColumn?.type.toString()).toBe(
|
|
||||||
"Struct<{version:Int32, author:Utf8, created_at:Utf8}>",
|
|
||||||
);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("will handle multiple levels of nested structures", async function () {
|
|
||||||
const schema = new Schema(
|
|
||||||
[
|
|
||||||
new Field("id", new Utf8(), true),
|
|
||||||
new Field(
|
|
||||||
"config",
|
|
||||||
new Struct([
|
|
||||||
new Field("database", new Utf8(), true),
|
|
||||||
new Field(
|
|
||||||
"connection",
|
|
||||||
new Struct([
|
|
||||||
new Field("host", new Utf8(), true),
|
|
||||||
new Field("port", new Int32(), true),
|
|
||||||
new Field(
|
|
||||||
"ssl",
|
|
||||||
new Struct([
|
|
||||||
new Field("enabled", new Bool(), true),
|
|
||||||
new Field("cert_path", new Utf8(), true),
|
|
||||||
]),
|
|
||||||
true,
|
|
||||||
),
|
|
||||||
]),
|
|
||||||
true,
|
|
||||||
),
|
|
||||||
]),
|
|
||||||
true,
|
|
||||||
),
|
|
||||||
],
|
|
||||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
|
||||||
);
|
|
||||||
|
|
||||||
const data = [
|
|
||||||
{
|
|
||||||
id: "config1",
|
|
||||||
config: {
|
|
||||||
database: "postgres",
|
|
||||||
connection: { host: "localhost" },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "config2",
|
|
||||||
config: { database: "mysql" },
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "config3",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const table = await convertToTable(data, undefined, { schema });
|
|
||||||
|
|
||||||
expect(table.numCols).toBe(2);
|
|
||||||
expect(table.numRows).toBe(3);
|
|
||||||
|
|
||||||
const configColumn = table.getChild("config");
|
|
||||||
expect(configColumn).toBeDefined();
|
|
||||||
expect(configColumn?.type.toString()).toBe(
|
|
||||||
"Struct<{database:Utf8, connection:Struct<{host:Utf8, port:Int32, ssl:Struct<{enabled:Bool, cert_path:Utf8}>}>}>",
|
|
||||||
);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("will handle missing columns in Arrow table input when using embeddings", async function () {
|
|
||||||
const incompleteTable = makeArrowTable([
|
|
||||||
{ domain: "google.com", name: "Google" },
|
|
||||||
{ domain: "facebook.com", name: "Facebook" },
|
|
||||||
]);
|
|
||||||
|
|
||||||
const schema = new Schema(
|
|
||||||
[
|
|
||||||
new Field("domain", new Utf8(), true),
|
|
||||||
new Field("name", new Utf8(), true),
|
|
||||||
new Field("description", new Utf8(), true),
|
|
||||||
],
|
|
||||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
|
||||||
);
|
|
||||||
|
|
||||||
const buf = await fromDataToBuffer(incompleteTable, undefined, schema);
|
|
||||||
|
|
||||||
expect(buf.byteLength).toBeGreaterThan(0);
|
|
||||||
|
|
||||||
const retrievedTable = tableFromIPC(buf);
|
|
||||||
expect(retrievedTable.numCols).toBe(3);
|
|
||||||
expect(retrievedTable.numRows).toBe(2);
|
|
||||||
|
|
||||||
const descriptionColumn = retrievedTable.getChild("description");
|
|
||||||
expect(descriptionColumn).toBeDefined();
|
|
||||||
expect(descriptionColumn?.nullCount).toBe(2);
|
|
||||||
expect(descriptionColumn?.toArray()).toEqual([null, null]);
|
|
||||||
|
|
||||||
expect(retrievedTable.getChild("domain")?.toArray()).toEqual([
|
|
||||||
"google.com",
|
|
||||||
"facebook.com",
|
|
||||||
]);
|
|
||||||
expect(retrievedTable.getChild("name")?.toArray()).toEqual([
|
|
||||||
"Google",
|
|
||||||
"Facebook",
|
|
||||||
]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should correctly retain values in nested struct fields", async function () {
|
it("should correctly retain values in nested struct fields", async function () {
|
||||||
|
// Define test data with nested struct
|
||||||
const testData = [
|
const testData = [
|
||||||
{
|
{
|
||||||
id: "doc1",
|
id: "doc1",
|
||||||
@@ -615,8 +400,10 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// Create Arrow table from the data
|
||||||
const table = makeArrowTable(testData);
|
const table = makeArrowTable(testData);
|
||||||
|
|
||||||
|
// Verify schema has the nested struct fields
|
||||||
const metadataField = table.schema.fields.find(
|
const metadataField = table.schema.fields.find(
|
||||||
(f) => f.name === "metadata",
|
(f) => f.name === "metadata",
|
||||||
);
|
);
|
||||||
@@ -630,17 +417,23 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
"text",
|
"text",
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
// Convert to buffer and back (simulating storage and retrieval)
|
||||||
const buf = await fromTableToBuffer(table);
|
const buf = await fromTableToBuffer(table);
|
||||||
const retrievedTable = tableFromIPC(buf);
|
const retrievedTable = tableFromIPC(buf);
|
||||||
|
|
||||||
|
// Verify the retrieved table has the same structure
|
||||||
const rows = [];
|
const rows = [];
|
||||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
for (let i = 0; i < retrievedTable.numRows; i++) {
|
||||||
rows.push(retrievedTable.get(i));
|
rows.push(retrievedTable.get(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check values in the first row
|
||||||
const firstRow = rows[0];
|
const firstRow = rows[0];
|
||||||
expect(firstRow.id).toBe("doc1");
|
expect(firstRow.id).toBe("doc1");
|
||||||
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
|
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
|
||||||
|
|
||||||
|
// Verify metadata values are preserved (this is where the bug is)
|
||||||
|
expect(firstRow.metadata).toBeDefined();
|
||||||
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
|
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
|
||||||
expect(firstRow.metadata.startLine).toBe(10);
|
expect(firstRow.metadata.startLine).toBe(10);
|
||||||
expect(firstRow.metadata.endLine).toBe(20);
|
expect(firstRow.metadata.endLine).toBe(20);
|
||||||
@@ -799,14 +592,14 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
).rejects.toThrow("column vector was missing");
|
).rejects.toThrow("column vector was missing");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("will skip embedding application if already applied", async function () {
|
it("will provide a nice error if run twice", async function () {
|
||||||
const records = sampleRecords();
|
const records = sampleRecords();
|
||||||
const table = await convertToTable(records, dummyEmbeddingConfig);
|
const table = await convertToTable(records, dummyEmbeddingConfig);
|
||||||
|
|
||||||
// fromTableToBuffer will try and apply the embeddings again
|
// fromTableToBuffer will try and apply the embeddings again
|
||||||
// but should skip since the column already has non-null values
|
await expect(
|
||||||
const result = await fromTableToBuffer(table, dummyEmbeddingConfig);
|
fromTableToBuffer(table, dummyEmbeddingConfig),
|
||||||
expect(result.byteLength).toBeGreaterThan(0);
|
).rejects.toThrow("already existed");
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -108,10 +108,7 @@ describe("remote connection", () => {
it("should pass on requested extra headers", async () => {
await withMockDatabase(
(req, res) => {
expect(req.headers["foo"]).toEqual("1");
expect(req.headers["x-my-header"]).toEqual("my-value");
expect(req.headers["bar"]).toEqual("2");
expect(req.headers["baz"]).toEqual("3");
expect(req.headers["x-log-attrs"]).toEqual("foo, bar, baz");

const body = JSON.stringify({ tables: [] });
res.writeHead(200, { "Content-Type": "application/json" }).end(body);
@@ -122,12 +119,9 @@ describe("remote connection", () => {
},
{
clientConfig: {
extraHeaders: {
"x-log-attrs": "foo, bar, baz",
"x-my-header": "my-value",
foo: "1",
},
bar: "2",
baz: "3",
},
},
},
);
@@ -1,46 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import * as tmp from "tmp";
import { Session, connect } from "../lancedb";

describe("Session", () => {
let tmpDir: tmp.DirResult;
beforeEach(() => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
});
afterEach(() => tmpDir.removeCallback());

it("should configure cache sizes and work with database operations", async () => {
// Create session with small cache limits for testing
const indexCacheSize = BigInt(1024 * 1024); // 1MB
const metadataCacheSize = BigInt(512 * 1024); // 512KB

const session = new Session(indexCacheSize, metadataCacheSize);

// Record initial cache state
const initialCacheSize = session.sizeBytes();
const initialCacheItems = session.approxNumItems();

// Test session works with database connection
const db = await connect({ uri: tmpDir.name, session: session });

// Create and use a table to exercise the session
const data = Array.from({ length: 100 }, (_, i) => ({
id: i,
text: `item ${i}`,
}));
const table = await db.createTable("test", data);
const results = await table.query().limit(5).toArray();

expect(results).toHaveLength(5);

// Verify cache usage increased after operations
const finalCacheSize = session.sizeBytes();
const finalCacheItems = session.approxNumItems();

expect(finalCacheSize).toBeGreaterThan(initialCacheSize); // Cache should have grown
expect(finalCacheItems).toBeGreaterThanOrEqual(initialCacheItems); // Items should not decrease
expect(initialCacheSize).toBeLessThan(indexCacheSize + metadataCacheSize); // Within limits
});
});
@@ -33,12 +33,7 @@ import {
register,
} from "../lancedb/embedding";
import { Index } from "../lancedb/indices";
import {
import { instanceOfFullTextQuery } from "../lancedb/query";
BooleanQuery,
Occur,
Operator,
instanceOfFullTextQuery,
} from "../lancedb/query";
import exp = require("constants");

describe.each([arrow15, arrow16, arrow17, arrow18])(
@@ -368,9 +363,9 @@ describe("merge insert", () => {
{ a: 4, b: "z" },
];

const result = (await table.toArrow()).toArray().sort((a, b) => a.a - b.a);
expect(
JSON.parse(JSON.stringify((await table.toArrow()).toArray())),
expect(result.map((row) => ({ ...row }))).toEqual(expected);
).toEqual(expected);
});
test("conditional update", async () => {
const newData = [
@@ -559,32 +554,6 @@ describe("When creating an index", () => {
|
|||||||
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
|
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
|
||||||
expect(rst.numRows).toBe(1);
|
expect(rst.numRows).toBe(1);
|
||||||
|
|
||||||
// test nprobes
|
|
||||||
rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
|
|
||||||
expect(rst.numRows).toBe(2);
|
|
||||||
rst = await tbl
|
|
||||||
.query()
|
|
||||||
.nearestTo(queryVec)
|
|
||||||
.limit(2)
|
|
||||||
.minimumNprobes(15)
|
|
||||||
.toArrow();
|
|
||||||
expect(rst.numRows).toBe(2);
|
|
||||||
rst = await tbl
|
|
||||||
.query()
|
|
||||||
.nearestTo(queryVec)
|
|
||||||
.limit(2)
|
|
||||||
.minimumNprobes(10)
|
|
||||||
.maximumNprobes(20)
|
|
||||||
.toArrow();
|
|
||||||
expect(rst.numRows).toBe(2);
|
|
||||||
|
|
||||||
expect(() => tbl.query().nearestTo(queryVec).minimumNprobes(0)).toThrow(
|
|
||||||
"Invalid input, minimum_nprobes must be greater than 0",
|
|
||||||
);
|
|
||||||
expect(() => tbl.query().nearestTo(queryVec).maximumNprobes(5)).toThrow(
|
|
||||||
"Invalid input, maximum_nprobes must be greater than or equal to minimum_nprobes",
|
|
||||||
);
|
|
||||||
|
|
||||||
await tbl.dropIndex("vec_idx");
|
await tbl.dropIndex("vec_idx");
|
||||||
const indices2 = await tbl.listIndices();
|
const indices2 = await tbl.listIndices();
|
||||||
expect(indices2.length).toBe(0);
|
expect(indices2.length).toBe(0);
|
||||||
@@ -1562,18 +1531,6 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
|
|
||||||
const results = await table.search("hello").toArray();
|
const results = await table.search("hello").toArray();
|
||||||
expect(results[0].text).toBe(data[0].text);
|
expect(results[0].text).toBe(data[0].text);
|
||||||
|
|
||||||
const results2 = await table
|
|
||||||
.search(new MatchQuery("hello world", "text"))
|
|
||||||
.toArray();
|
|
||||||
expect(results2.length).toBe(2);
|
|
||||||
|
|
||||||
const results3 = await table
|
|
||||||
.search(
|
|
||||||
new MatchQuery("hello world", "text", { operator: Operator.And }),
|
|
||||||
)
|
|
||||||
.toArray();
|
|
||||||
expect(results3.length).toBe(1);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test("full text search without lowercase", async () => {
|
test("full text search without lowercase", async () => {
|
||||||
@@ -1650,114 +1607,6 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
expect(resultSet.has("fob")).toBe(true);
|
expect(resultSet.has("fob")).toBe(true);
|
||||||
expect(resultSet.has("fo")).toBe(true);
|
expect(resultSet.has("fo")).toBe(true);
|
||||||
expect(resultSet.has("food")).toBe(true);
|
expect(resultSet.has("food")).toBe(true);
|
||||||
|
|
||||||
const prefixResults = await table
|
|
||||||
.search(
|
|
||||||
new MatchQuery("foo", "text", { fuzziness: 3, prefixLength: 3 }),
|
|
||||||
)
|
|
||||||
.toArray();
|
|
||||||
expect(prefixResults.length).toBe(2);
|
|
||||||
const resultSet2 = new Set(prefixResults.map((r) => r.text));
|
|
||||||
expect(resultSet2.has("foo")).toBe(true);
|
|
||||||
expect(resultSet2.has("food")).toBe(true);
|
|
||||||
});
|
|
||||||
|
|
||||||
test("full text search boolean query", async () => {
|
|
||||||
const db = await connect(tmpDir.name);
|
|
||||||
const data = [
|
|
||||||
{ text: "The cat and dog are playing" },
|
|
||||||
{ text: "The cat is sleeping" },
|
|
||||||
{ text: "The dog is barking" },
|
|
||||||
{ text: "The dog chases the cat" },
|
|
||||||
];
|
|
||||||
const table = await db.createTable("test", data);
|
|
||||||
await table.createIndex("text", {
|
|
||||||
config: Index.fts({ withPosition: false }),
|
|
||||||
});
|
|
||||||
|
|
||||||
const shouldResults = await table
|
|
||||||
.search(
|
|
||||||
new BooleanQuery([
|
|
||||||
[Occur.Should, new MatchQuery("cat", "text")],
|
|
||||||
[Occur.Should, new MatchQuery("dog", "text")],
|
|
||||||
]),
|
|
||||||
)
|
|
||||||
.toArray();
|
|
||||||
expect(shouldResults.length).toBe(4);
|
|
||||||
|
|
||||||
const mustResults = await table
|
|
||||||
.search(
|
|
||||||
new BooleanQuery([
|
|
||||||
[Occur.Must, new MatchQuery("cat", "text")],
|
|
||||||
[Occur.Must, new MatchQuery("dog", "text")],
|
|
||||||
]),
|
|
||||||
)
|
|
||||||
.toArray();
|
|
||||||
expect(mustResults.length).toBe(2);
|
|
||||||
|
|
||||||
const mustNotResults = await table
|
|
||||||
.search(
|
|
||||||
new BooleanQuery([
|
|
||||||
[Occur.Must, new MatchQuery("cat", "text")],
|
|
||||||
[Occur.MustNot, new MatchQuery("dog", "text")],
|
|
||||||
]),
|
|
||||||
)
|
|
||||||
.toArray();
|
|
||||||
expect(mustNotResults.length).toBe(1);
|
|
||||||
});
|
|
||||||
|
|
||||||
test("full text search ngram", async () => {
|
|
||||||
const db = await connect(tmpDir.name);
|
|
||||||
const data = [
|
|
||||||
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
||||||
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
|
|
||||||
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
|
|
||||||
];
|
|
||||||
const table = await db.createTable("test", data);
|
|
||||||
await table.createIndex("text", {
|
|
||||||
config: Index.fts({ baseTokenizer: "ngram" }),
|
|
||||||
});
|
|
||||||
|
|
||||||
const results = await table.search("lan").toArray();
|
|
||||||
expect(results.length).toBe(2);
|
|
||||||
const resultSet = new Set(results.map((r) => r.text));
|
|
||||||
expect(resultSet.has("lance database")).toBe(true);
|
|
||||||
expect(resultSet.has("lance is cool")).toBe(true);
|
|
||||||
|
|
||||||
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
|
|
||||||
expect(results2.length).toBe(2);
|
|
||||||
const resultSet2 = new Set(results2.map((r) => r.text));
|
|
||||||
expect(resultSet2.has("lance database")).toBe(true);
|
|
||||||
expect(resultSet2.has("lance is cool")).toBe(true);
|
|
||||||
|
|
||||||
// the default min_ngram_length is 3, so "la" should not match
|
|
||||||
const results3 = await table.search("la").toArray();
|
|
||||||
expect(results3.length).toBe(0);
|
|
||||||
|
|
||||||
// test setting min_ngram_length and prefix_only
|
|
||||||
await table.createIndex("text", {
|
|
||||||
config: Index.fts({
|
|
||||||
baseTokenizer: "ngram",
|
|
||||||
ngramMinLength: 2,
|
|
||||||
prefixOnly: true,
|
|
||||||
}),
|
|
||||||
replace: true,
|
|
||||||
});
|
|
||||||
|
|
||||||
const results4 = await table.search("lan").toArray();
|
|
||||||
expect(results4.length).toBe(2);
|
|
||||||
const resultSet4 = new Set(results4.map((r) => r.text));
|
|
||||||
expect(resultSet4.has("lance database")).toBe(true);
|
|
||||||
expect(resultSet4.has("lance is cool")).toBe(true);
|
|
||||||
|
|
||||||
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
|
|
||||||
expect(results5.length).toBe(0);
|
|
||||||
|
|
||||||
const results6 = await table.search("la").toArray();
|
|
||||||
expect(results6.length).toBe(2);
|
|
||||||
const resultSet6 = new Set(results6.map((r) => r.text));
|
|
||||||
expect(resultSet6.has("lance database")).toBe(true);
|
|
||||||
expect(resultSet6.has("lance is cool")).toBe(true);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
@@ -1863,43 +1712,4 @@ describe("column name options", () => {
|
|||||||
expect(results[0].query_index).toBe(0);
|
expect(results[0].query_index).toBe(0);
|
||||||
expect(results[1].query_index).toBe(1);
|
expect(results[1].query_index).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("index and search multivectors", async () => {
|
|
||||||
const db = await connect(tmpDir.name);
|
|
||||||
const data = [];
|
|
||||||
// generate 512 random multivectors
|
|
||||||
for (let i = 0; i < 256; i++) {
|
|
||||||
data.push({
|
|
||||||
multivector: Array.from({ length: 10 }, () =>
|
|
||||||
Array(2).fill(Math.random()),
|
|
||||||
),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
const table = await db.createTable("multivectors", data, {
|
|
||||||
schema: new Schema([
|
|
||||||
new Field(
|
|
||||||
"multivector",
|
|
||||||
new List(
|
|
||||||
new Field(
|
|
||||||
"item",
|
|
||||||
new FixedSizeList(2, new Field("item", new Float32())),
|
|
||||||
),
|
|
||||||
),
|
|
||||||
),
|
|
||||||
]),
|
|
||||||
});
|
|
||||||
|
|
||||||
const results = await table.search(data[0].multivector).limit(10).toArray();
|
|
||||||
expect(results.length).toBe(10);
|
|
||||||
|
|
||||||
await table.createIndex("multivector", {
|
|
||||||
config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
|
|
||||||
});
|
|
||||||
|
|
||||||
const results2 = await table
|
|
||||||
.search(data[0].multivector)
|
|
||||||
.limit(10)
|
|
||||||
.toArray();
|
|
||||||
expect(results2.length).toBe(10);
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -107,20 +107,6 @@ export type IntoVector =
|
|||||||
| number[]
|
| number[]
|
||||||
| Promise<Float32Array | Float64Array | number[]>;
|
| Promise<Float32Array | Float64Array | number[]>;
|
||||||
|
|
||||||
export type MultiVector = IntoVector[];
|
|
||||||
|
|
||||||
export function isMultiVector(value: unknown): value is MultiVector {
|
|
||||||
return Array.isArray(value) && isIntoVector(value[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
export function isIntoVector(value: unknown): value is IntoVector {
|
|
||||||
return (
|
|
||||||
value instanceof Float32Array ||
|
|
||||||
value instanceof Float64Array ||
|
|
||||||
(Array.isArray(value) && !Array.isArray(value[0]))
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
export function isArrowTable(value: object): value is TableLike {
|
export function isArrowTable(value: object): value is TableLike {
|
||||||
if (value instanceof ArrowTable) return true;
|
if (value instanceof ArrowTable) return true;
|
||||||
return "schema" in value && "batches" in value;
|
return "schema" in value && "batches" in value;
|
||||||
@@ -431,9 +417,7 @@ function inferSchema(
|
|||||||
} else {
|
} else {
|
||||||
const inferredType = inferType(value, path, opts);
|
const inferredType = inferType(value, path, opts);
|
||||||
if (inferredType === undefined) {
|
if (inferredType === undefined) {
|
||||||
throw new Error(`Failed to infer data type for field ${path.join(
|
throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
|
||||||
".",
|
|
||||||
)} at row ${rowI}. \
|
|
||||||
Consider providing an explicit schema.`);
|
Consider providing an explicit schema.`);
|
||||||
}
|
}
|
||||||
pathTree.set(path, inferredType);
|
pathTree.set(path, inferredType);
|
||||||
@@ -815,17 +799,11 @@ async function applyEmbeddingsFromMetadata(
|
|||||||
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
|
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if destination column exists and handle accordingly
|
|
||||||
if (columns[destColumn] !== undefined) {
|
if (columns[destColumn] !== undefined) {
|
||||||
const existingColumn = columns[destColumn];
|
throw new Error(
|
||||||
// If the column exists but is all null, we can fill it with embeddings
|
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
|
||||||
if (existingColumn.nullCount !== existingColumn.length) {
|
);
|
||||||
// Column has non-null values, skip embedding application
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
if (table.batches.length > 1) {
throw new Error(
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",

@@ -853,15 +831,6 @@ async function applyEmbeddingsFromMetadata(
const vector = makeVector(vectors, destType);
columns[destColumn] = vector;
}
-
- // Add any missing columns from the schema as null vectors
- for (const field of schema.fields) {
-   if (!(field.name in columns)) {
-     const nullValues = new Array(table.numRows).fill(null);
-     columns[field.name] = makeVector(nullValues, field.type);
-   }
- }
-
const newTable = new ArrowTable(columns);
return alignTable(newTable, schema);
}

@@ -934,23 +903,11 @@ async function applyEmbeddings<T>(
);
}
} else {
- // Check if destination column exists and handle accordingly
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
-   const existingColumn = newColumns[destColumn];
-   // If the column exists but is all null, we can fill it with embeddings
-   if (existingColumn.nullCount !== existingColumn.length) {
-     // Column has non-null values, skip embedding application and return table as-is
-     let newTable = new ArrowTable(newColumns);
-     if (schema != null) {
-       newTable = alignTable(newTable, schema as Schema);
-     }
-     return new ArrowTable(
-       new Schema(newTable.schema.fields, schemaMetadata),
-       newTable.batches,
-     );
-   }
+   throw new Error(
+     `Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
+   );
}

if (table.batches.length > 1) {
throw new Error(
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",

@@ -1010,21 +967,7 @@ export async function convertToTable(
embeddings?: EmbeddingFunctionConfig,
makeTableOptions?: Partial<MakeArrowTableOptions>,
): Promise<ArrowTable> {
- let processedData = data;
-
- // If we have a schema with embedding metadata, we need to preprocess the data
- // to ensure all nested fields are present
- if (
-   makeTableOptions?.schema &&
-   makeTableOptions.schema.metadata?.has("embedding_functions")
- ) {
-   processedData = ensureNestedFieldsExist(
-     data,
-     makeTableOptions.schema as Schema,
-   );
- }
-
- const table = makeArrowTable(processedData, makeTableOptions);
+ const table = makeArrowTable(data, makeTableOptions);
return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
}

@@ -1117,16 +1060,7 @@ export async function fromDataToBuffer(
schema = sanitizeSchema(schema);
}
if (isArrowTable(data)) {
- const table = sanitizeTable(data);
- // If we have a schema with embedding functions, we need to ensure all columns exist
- // before applying embeddings, since applyEmbeddingsFromMetadata expects all columns
- // to be present in the table
- if (schema && schema.metadata?.has("embedding_functions")) {
-   const alignedTable = alignTableToSchema(table, schema);
-   return fromTableToBuffer(alignedTable, embeddings, schema);
- } else {
-   return fromTableToBuffer(table, embeddings, schema);
- }
+ return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
} else {
const table = await convertToTable(data, embeddings, { schema });
return fromTableToBuffer(table);

@@ -1195,7 +1129,7 @@ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
type: new Struct(schema.fields),
length: batch.numRows,
nullCount: batch.nullCount,
- children: alignedChildren as unknown as ArrowData<DataType>[],
+ children: alignedChildren,
});
return new RecordBatch(schema, newData);
}

@@ -1267,79 +1201,6 @@ function validateSchemaEmbeddings(
return new Schema(fields, schema.metadata);
}
-
-/**
- * Ensures that all nested fields defined in the schema exist in the data,
- * filling missing fields with null values.
- */
-export function ensureNestedFieldsExist(
-  data: Array<Record<string, unknown>>,
-  schema: Schema,
-): Array<Record<string, unknown>> {
-  return data.map((row) => {
-    const completeRow: Record<string, unknown> = {};
-
-    for (const field of schema.fields) {
-      if (field.name in row) {
-        if (
-          field.type.constructor.name === "Struct" &&
-          row[field.name] !== null &&
-          row[field.name] !== undefined
-        ) {
-          // Handle nested struct
-          const nestedValue = row[field.name] as Record<string, unknown>;
-          completeRow[field.name] = ensureStructFieldsExist(
-            nestedValue,
-            field.type,
-          );
-        } else {
-          // Non-struct field or null struct value
-          completeRow[field.name] = row[field.name];
-        }
-      } else {
-        // Field is missing from the data - set to null
-        completeRow[field.name] = null;
-      }
-    }
-
-    return completeRow;
-  });
-}
-
-/**
- * Recursively ensures that all fields in a struct type exist in the data,
- * filling missing fields with null values.
- */
-function ensureStructFieldsExist(
-  data: Record<string, unknown>,
-  structType: Struct,
-): Record<string, unknown> {
-  const completeStruct: Record<string, unknown> = {};
-
-  for (const childField of structType.children) {
-    if (childField.name in data) {
-      if (
-        childField.type.constructor.name === "Struct" &&
-        data[childField.name] !== null &&
-        data[childField.name] !== undefined
-      ) {
-        // Recursively handle nested struct
-        completeStruct[childField.name] = ensureStructFieldsExist(
-          data[childField.name] as Record<string, unknown>,
-          childField.type,
-        );
-      } else {
-        // Non-struct field or null struct value
-        completeStruct[childField.name] = data[childField.name];
-      }
-    } else {
-      // Field is missing - set to null
-      completeStruct[childField.name] = null;
-    }
-  }
-
-  return completeStruct;
-}
-
interface JsonDataType {
type: string;
fields?: JsonField[];

@@ -1473,64 +1334,3 @@ function fieldToJson(field: Field): JsonField {
metadata: field.metadata,
};
}
-
-function alignTableToSchema(
-  table: ArrowTable,
-  targetSchema: Schema,
-): ArrowTable {
-  const existingColumns = new Map<string, Vector>();
-
-  // Map existing columns
-  for (const field of table.schema.fields) {
-    existingColumns.set(field.name, table.getChild(field.name)!);
-  }
-
-  // Create vectors for all fields in target schema
-  const alignedColumns: Record<string, Vector> = {};
-
-  for (const field of targetSchema.fields) {
-    if (existingColumns.has(field.name)) {
-      // Column exists, use it
-      alignedColumns[field.name] = existingColumns.get(field.name)!;
-    } else {
-      // Column missing, create null vector
-      alignedColumns[field.name] = createNullVector(field, table.numRows);
-    }
-  }
-
-  // Create new table with aligned schema and columns
-  return new ArrowTable(targetSchema, alignedColumns);
-}
-
-function createNullVector(field: Field, numRows: number): Vector {
-  if (field.type.constructor.name === "Struct") {
-    // For struct types, create a struct with null fields
-    const structType = field.type as Struct;
-    const childVectors = structType.children.map((childField) =>
-      createNullVector(childField, numRows),
-    );
-
-    // Create struct data
-    const structData = makeData({
-      type: structType,
-      length: numRows,
-      nullCount: 0,
-      children: childVectors.map((v) => v.data[0]),
-    });
-
-    return arrowMakeVector(structData);
-  } else {
-    // For other types, create a vector of nulls
-    const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));
-    // All bits are 0, meaning all values are null
-
-    const data = makeData({
-      type: field.type,
-      length: numRows,
-      nullCount: numRows,
-      nullBitmap,
-    });
-
-    return arrowMakeVector(data);
-  }
-}
@@ -85,9 +85,6 @@ export interface OpenTableOptions {
/**
* Set the size of the index cache, specified as a number of entries
*
- * @deprecated Use session-level cache configuration instead.
- * Create a Session with custom cache sizes and pass it to the connect() function.
- *
* The exact meaning of an "entry" will depend on the type of index:
* - IVF: there is one entry for each IVF partition
* - BTREE: there is one entry for the entire index

@@ -10,7 +10,6 @@ import {
import {
ConnectionOptions,
Connection as LanceDbConnection,
- Session,
} from "./native.js";

export {

@@ -52,8 +51,6 @@ export {
OpenTableOptions,
} from "./connection";

- export { Session } from "./native.js";
-
export {
ExecutableQuery,
Query,

@@ -67,10 +64,7 @@ export {
PhraseQuery,
BoostQuery,
MultiMatchQuery,
- BooleanQuery,
FullTextQueryType,
- Operator,
- Occur,
} from "./query";

export {

@@ -103,7 +97,6 @@ export {
RecordBatchLike,
DataLike,
IntoVector,
- MultiVector,
} from "./arrow";
export { IntoSql, packBits } from "./util";

@@ -134,7 +127,6 @@ export { IntoSql, packBits } from "./util";
export async function connect(
uri: string,
options?: Partial<ConnectionOptions>,
- session?: Session,
): Promise<Connection>;
/**
* Connect to a LanceDB instance at the given URI.

@@ -153,43 +145,31 @@ export async function connect(
* storageOptions: {timeout: "60s"}
* });
* ```
- *
- * @example
- * ```ts
- * const session = Session.default();
- * const conn = await connect({
- *   uri: "/path/to/database",
- *   session: session
- * });
- * ```
*/
export async function connect(
options: Partial<ConnectionOptions> & { uri: string },
): Promise<Connection>;
export async function connect(
uriOrOptions: string | (Partial<ConnectionOptions> & { uri: string }),
- options?: Partial<ConnectionOptions>,
+ options: Partial<ConnectionOptions> = {},
): Promise<Connection> {
let uri: string | undefined;
- let finalOptions: Partial<ConnectionOptions> = {};

if (typeof uriOrOptions !== "string") {
const { uri: uri_, ...opts } = uriOrOptions;
uri = uri_;
- finalOptions = opts;
+ options = opts;
} else {
uri = uriOrOptions;
- finalOptions = options || {};
}

if (!uri) {
throw new Error("uri is required");
}

- finalOptions = (finalOptions as ConnectionOptions) ?? {};
+ options = (options as ConnectionOptions) ?? {};
- (<ConnectionOptions>finalOptions).storageOptions = cleanseStorageOptions(
+ (<ConnectionOptions>options).storageOptions = cleanseStorageOptions(
- (<ConnectionOptions>finalOptions).storageOptions,
+ (<ConnectionOptions>options).storageOptions,
);
- const nativeConn = await LanceDbConnection.new(uri, finalOptions);
+ const nativeConn = await LanceDbConnection.new(uri, options);
return new LocalConnection(nativeConn);
}
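The `@example` block removed above is the only place this compare shows the session-aware `connect` overload in use. A minimal TypeScript sketch of that pattern, assuming the `Session` export and the `session` connection option from the 0.21.x line of `@lancedb/lancedb` (the symbols being deleted in this diff):

```ts
import { connect, Session } from "@lancedb/lancedb";

// One session holds the index and metadata caches; reusing it across
// connections lets them share cache state instead of each building their own.
const session = Session.default();

const db = await connect({
  uri: "/path/to/database",
  session: session,
});
```

On the v0.20.0 side of this comparison the `session` option does not exist, so `connect` is called with only the URI and storage options.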
@@ -439,7 +439,7 @@ export interface FtsOptions {
*
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
*/
- baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
+ baseTokenizer?: "simple" | "whitespace" | "raw";

/**
* language for stemming and stop words

@@ -472,21 +472,6 @@ export interface FtsOptions {
* whether to remove punctuation
*/
asciiFolding?: boolean;
-
- /**
-  * ngram min length
-  */
- ngramMinLength?: number;
-
- /**
-  * ngram max length
-  */
- ngramMaxLength?: number;
-
- /**
-  * whether to only index the prefix of the token for ngram tokenizer
-  */
- prefixOnly?: boolean;
}

export class Index {

@@ -623,9 +608,6 @@ export class Index {
options?.stem,
options?.removeStopWords,
options?.asciiFolding,
- options?.ngramMinLength,
- options?.ngramMaxLength,
- options?.prefixOnly,
),
);
}
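For context on the ngram fields removed from `FtsOptions` above, here is a hedged sketch of how they are passed when building a full-text index on the newer branch. The `createIndex`/`Index.fts` call shape is assumed from the public TypeScript API, and `table` is assumed to be an open table:

```ts
import { Index } from "@lancedb/lancedb";

// Sketch: index the "text" column with an ngram tokenizer so partial-word
// matches can hit the index. Option names mirror FtsOptions above.
await table.createIndex("text", {
  config: Index.fts({
    baseTokenizer: "ngram",
    ngramMinLength: 3, // shortest ngram that gets indexed
    ngramMaxLength: 3, // longest ngram that gets indexed
    prefixOnly: false, // index ngrams from the whole token, not just its prefix
  }),
});
```

On v0.20.0 only the "simple", "whitespace", and "raw" base tokenizers remain.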
@@ -448,10 +448,6 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
* For best results we recommend tuning this parameter with a benchmark against
* your actual data to find the smallest possible value that will still give
* you the desired recall.
- *
- * For more fine grained control over behavior when you have a very narrow filter
- * you can use `minimumNprobes` and `maximumNprobes`. This method sets both
- * the minimum and maximum to the same value.
*/
nprobes(nprobes: number): VectorQuery {
super.doCall((inner) => inner.nprobes(nprobes));

@@ -459,33 +455,6 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
return this;
}
-
- /**
-  * Set the minimum number of probes used.
-  *
-  * This controls the minimum number of partitions that will be searched. This
-  * parameter will impact every query against a vector index, regardless of the
-  * filter. See `nprobes` for more details. Higher values will increase recall
-  * but will also increase latency.
-  */
- minimumNprobes(minimumNprobes: number): VectorQuery {
-   super.doCall((inner) => inner.minimumNprobes(minimumNprobes));
-   return this;
- }
-
- /**
-  * Set the maximum number of probes used.
-  *
-  * This controls the maximum number of partitions that will be searched. If this
-  * number is greater than minimumNprobes then the excess partitions will _only_ be
-  * searched if we have not found enough results. This can be useful when there is
-  * a narrow filter to allow these queries to spend more time searching and avoid
-  * potential false negatives.
-  */
- maximumNprobes(maximumNprobes: number): VectorQuery {
-   super.doCall((inner) => inner.maximumNprobes(maximumNprobes));
-   return this;
- }
-
/*
* Set the distance range to use
*
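The doc comments removed above describe how `minimumNprobes`/`maximumNprobes` refine the single `nprobes` knob. A hedged sketch of both styles on a vector query, assuming an open `table`, a query vector `vec`, and the usual builder methods (`where`, `limit`, `toArray`):

```ts
// Fixed probe count: minimum and maximum are effectively the same value.
const fixed = await table.vectorSearch(vec).nprobes(20).limit(10).toArray();

// Adaptive probe count: always search at least 20 partitions, and search up
// to 100 only if a narrow filter leaves too few results after the first pass.
const adaptive = await table
  .vectorSearch(vec)
  .minimumNprobes(20)
  .maximumNprobes(100)
  .where("category = 'rare'")
  .limit(10)
  .toArray();
```

The adaptive pair only exists on the newer branch; v0.20.0 exposes `nprobes` alone.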
@@ -793,31 +762,6 @@ export enum FullTextQueryType {
MatchPhrase = "match_phrase",
Boost = "boost",
MultiMatch = "multi_match",
- Boolean = "boolean",
-}
-
-/**
- * Enum representing the logical operators used in full-text queries.
- *
- * - `And`: All terms must match.
- * - `Or`: At least one term must match.
- */
-export enum Operator {
-  And = "AND",
-  Or = "OR",
-}
-
-/**
- * Enum representing the occurrence of terms in full-text queries.
- *
- * - `Must`: The term must be present in the document.
- * - `Should`: The term should contribute to the document score, but is not required.
- * - `MustNot`: The term must not be present in the document.
- */
-export enum Occur {
-  Should = "SHOULD",
-  Must = "MUST",
-  MustNot = "MUST_NOT",
}

/**

@@ -847,7 +791,6 @@ export function instanceOfFullTextQuery(obj: any): obj is FullTextQuery {
export class MatchQuery implements FullTextQuery {
/** @ignore */
public readonly inner: JsFullTextQuery;

/**
* Creates an instance of MatchQuery.
*

@@ -857,8 +800,6 @@ export class MatchQuery implements FullTextQuery {
* - `boost`: The boost factor for the query (default is 1.0).
* - `fuzziness`: The fuzziness level for the query (default is 0).
* - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
- * - `operator`: The logical operator to use for combining terms in the query (default is "OR").
- * - `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
*/
constructor(
query: string,

@@ -867,8 +808,6 @@ export class MatchQuery implements FullTextQuery {
boost?: number;
fuzziness?: number;
maxExpansions?: number;
- operator?: Operator;
- prefixLength?: number;
},
) {
let fuzziness = options?.fuzziness;

@@ -881,8 +820,6 @@ export class MatchQuery implements FullTextQuery {
options?.boost ?? 1.0,
fuzziness,
options?.maxExpansions ?? 50,
- options?.operator ?? Operator.Or,
- options?.prefixLength ?? 0,
);
}

@@ -899,11 +836,9 @@ export class PhraseQuery implements FullTextQuery {
*
* @param query - The phrase to search for in the specified column.
* @param column - The name of the column to search within.
- * @param options - Optional parameters for the phrase query.
- *   - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
*/
- constructor(query: string, column: string, options?: { slop?: number }) {
+ constructor(query: string, column: string) {
-   this.inner = JsFullTextQuery.phraseQuery(query, column, options?.slop ?? 0);
+   this.inner = JsFullTextQuery.phraseQuery(query, column);
}

queryType(): FullTextQueryType {

@@ -954,21 +889,18 @@ export class MultiMatchQuery implements FullTextQuery {
* @param columns - An array of column names to search within.
* @param options - Optional parameters for the multi-match query.
*   - `boosts`: An array of boost factors for each column (default is 1.0 for all).
- *   - `operator`: The logical operator to use for combining terms in the query (default is "OR").
*/
constructor(
query: string,
columns: string[],
options?: {
boosts?: number[];
- operator?: Operator;
},
) {
this.inner = JsFullTextQuery.multiMatchQuery(
query,
columns,
options?.boosts,
- options?.operator ?? Operator.Or,
);
}

@@ -976,23 +908,3 @@ export class MultiMatchQuery implements FullTextQuery {
return FullTextQueryType.MultiMatch;
}
}
-
-export class BooleanQuery implements FullTextQuery {
-  /** @ignore */
-  public readonly inner: JsFullTextQuery;
-  /**
-   * Creates an instance of BooleanQuery.
-   *
-   * @param queries - An array of (Occur, FullTextQuery objects) to combine.
-   *   Occur specifies whether the query must match, or should match.
-   */
-  constructor(queries: [Occur, FullTextQuery][]) {
-    this.inner = JsFullTextQuery.booleanQuery(
-      queries.map(([occur, query]) => [occur, query.inner]),
-    );
-  }
-
-  queryType(): FullTextQueryType {
-    return FullTextQueryType.Boolean;
-  }
-}
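The `BooleanQuery` class removed above combines sub-queries under `Occur` flags. A hedged sketch of the call pattern on the branch that still ships it, assuming a table with a full-text index on a `text` column:

```ts
import { BooleanQuery, MatchQuery, Occur } from "@lancedb/lancedb";

// "wireless" must match, "refurbished" should only boost the score,
// and documents containing "broken" are excluded.
const query = new BooleanQuery([
  [Occur.Must, new MatchQuery("wireless", "text")],
  [Occur.Should, new MatchQuery("refurbished", "text")],
  [Occur.MustNot, new MatchQuery("broken", "text")],
]);

const hits = await table.search(query).limit(10).toArray();
```

After this change, only match, phrase, boost, and multi-match queries remain in the TypeScript bindings.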
@@ -6,11 +6,9 @@ import {
Data,
DataType,
IntoVector,
- MultiVector,
Schema,
dataTypeToJson,
fromDataToBuffer,
- isMultiVector,
tableFromIPC,
} from "./arrow";

@@ -77,10 +75,10 @@ export interface OptimizeOptions {
* // Delete all versions older than 1 day
* const olderThan = new Date();
* olderThan.setDate(olderThan.getDate() - 1));
- * tbl.optimize({cleanupOlderThan: olderThan});
+ * tbl.cleanupOlderVersions(olderThan);
*
* // Delete all versions except the current version
- * tbl.optimize({cleanupOlderThan: new Date()});
+ * tbl.cleanupOlderVersions(new Date());
*/
cleanupOlderThan: Date;
deleteUnverified: boolean;
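A short sketch of the `optimize` call that the corrected doc comment above refers to (assumes an open table `tbl`):

```ts
// Compact data and then drop all table versions older than one day so the
// underlying storage can be reclaimed.
const olderThan = new Date();
olderThan.setDate(olderThan.getDate() - 1);
await tbl.optimize({ cleanupOlderThan: olderThan });

// Keep only the current version.
await tbl.optimize({ cleanupOlderThan: new Date() });
```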
@@ -348,7 +346,7 @@
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
*/
abstract search(
- query: string | IntoVector | MultiVector | FullTextQuery,
+ query: string | IntoVector | FullTextQuery,
queryType?: string,
ftsColumns?: string | string[],
): VectorQuery | Query;

@@ -359,7 +357,7 @@
* is the same thing as calling `nearestTo` on the builder returned
* by `query`. @see {@link Query#nearestTo} for more details.
*/
- abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
+ abstract vectorSearch(vector: IntoVector): VectorQuery;
/**
* Add new columns with defined values.
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and

@@ -670,7 +668,7 @@ export class LocalTable extends Table {
}

search(
- query: string | IntoVector | MultiVector | FullTextQuery,
+ query: string | IntoVector | FullTextQuery,
queryType: string = "auto",
ftsColumns?: string | string[],
): VectorQuery | Query {

@@ -717,15 +715,7 @@ export class LocalTable extends Table {
return this.query().nearestTo(queryPromise);
}

- vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
+ vectorSearch(vector: IntoVector): VectorQuery {
-   if (isMultiVector(vector)) {
-     const query = this.query().nearestTo(vector[0]);
-     for (const v of vector.slice(1)) {
-       query.addQueryVector(v);
-     }
-     return query;
-   }
-
return this.query().nearestTo(vector);
}
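The removed `isMultiVector` branch above expands a multi-vector argument into a single query with several query vectors. A hedged sketch of the equivalent explicit call, assuming two embedding vectors for the same request:

```ts
// Search with two query vectors at once; the engine merges the result sets.
// This is what the removed MultiVector overload did internally.
const results = await table
  .query()
  .nearestTo(vectorA) // first query vector
  .addQueryVector(vectorB) // additional query vector
  .limit(10)
  .toArray();
```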
@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": [
"win32"
],

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
- "version": "0.21.2",
+ "version": "0.20.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

4 nodejs/package-lock.json generated
@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
- "version": "0.21.2",
+ "version": "0.20.0-beta.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
- "version": "0.21.2",
+ "version": "0.20.0-beta.2",
"cpu": [
"x64",
"arm64"

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
- "version": "0.21.2",
+ "version": "0.20.0",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",
@@ -74,10 +74,6 @@ impl Connection {
builder = builder.host_override(&host_override);
}

- if let Some(session) = options.session {
-     builder = builder.session(session.inner.clone());
- }
-
Ok(Self::inner_new(builder.execute().await.default_error()?))
}

@@ -123,9 +123,6 @@ impl Index {
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
- ngram_min_length: Option<u32>,
- ngram_max_length: Option<u32>,
- prefix_only: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {

@@ -152,15 +149,6 @@ impl Index {
if let Some(ascii_folding) = ascii_folding {
opts = opts.ascii_folding(ascii_folding);
}
- if let Some(ngram_min_length) = ngram_min_length {
-     opts = opts.ngram_min_length(ngram_min_length);
- }
- if let Some(ngram_max_length) = ngram_max_length {
-     opts = opts.ngram_max_length(ngram_max_length);
- }
- if let Some(prefix_only) = prefix_only {
-     opts = opts.ngram_prefix_only(prefix_only);
- }

Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),

@@ -14,7 +14,6 @@ pub mod merge;
mod query;
pub mod remote;
mod rerankers;
- mod session;
mod table;
mod util;

@@ -35,9 +34,6 @@ pub struct ConnectionOptions {
///
/// The available options are described at https://lancedb.github.io/lancedb/guides/storage/
pub storage_options: Option<HashMap<String, String>>,
- /// (For LanceDB OSS only): the session to use for this connection. Holds
- /// shared caches and other session-specific state.
- pub session: Option<session::Session>,
-
/// (For LanceDB cloud only): configuration for the remote HTTP client.
pub client_config: Option<remote::ClientConfig>,
@@ -4,8 +4,7 @@
use std::sync::Arc;

use lancedb::index::scalar::{
- BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
- Operator, PhraseQuery,
+ BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, PhraseQuery,
};
use lancedb::query::ExecutableQuery;
use lancedb::query::Query as LanceDbQuery;

@@ -178,31 +177,6 @@ impl VectorQuery {
self.inner = self.inner.clone().nprobes(nprobe as usize);
}
-
- #[napi]
- pub fn minimum_nprobes(&mut self, minimum_nprobe: u32) -> napi::Result<()> {
-     self.inner = self
-         .inner
-         .clone()
-         .minimum_nprobes(minimum_nprobe as usize)
-         .default_error()?;
-     Ok(())
- }
-
- #[napi]
- pub fn maximum_nprobes(&mut self, maximum_nprobes: u32) -> napi::Result<()> {
-     let maximum_nprobes = if maximum_nprobes == 0 {
-         None
-     } else {
-         Some(maximum_nprobes as usize)
-     };
-     self.inner = self
-         .inner
-         .clone()
-         .maximum_nprobes(maximum_nprobes)
-         .default_error()?;
-     Ok(())
- }

#[napi]
pub fn distance_range(&mut self, lower_bound: Option<f64>, upper_bound: Option<f64>) {
// napi doesn't support f32, so we have to convert to f32

@@ -334,8 +308,6 @@ impl JsFullTextQuery {
boost: f64,
fuzziness: Option<u32>,
max_expansions: u32,
- operator: String,
- prefix_length: u32,
) -> napi::Result<Self> {
Ok(Self {
inner: MatchQuery::new(query)

@@ -343,23 +315,14 @@ impl JsFullTextQuery {
.with_boost(boost as f32)
.with_fuzziness(fuzziness)
.with_max_expansions(max_expansions as usize)
- .with_operator(
-     Operator::try_from(operator.as_str()).map_err(|e| {
-         napi::Error::from_reason(format!("Invalid operator: {}", e))
-     })?,
- )
- .with_prefix_length(prefix_length)
.into(),
})
}

#[napi(factory)]
- pub fn phrase_query(query: String, column: String, slop: u32) -> napi::Result<Self> {
+ pub fn phrase_query(query: String, column: String) -> napi::Result<Self> {
Ok(Self {
- inner: PhraseQuery::new(query)
-     .with_column(Some(column))
-     .with_slop(slop)
-     .into(),
+ inner: PhraseQuery::new(query).with_column(Some(column)).into(),
})
}

@@ -385,7 +348,6 @@ impl JsFullTextQuery {
query: String,
columns: Vec<String>,
boosts: Option<Vec<f64>>,
- operator: String,
) -> napi::Result<Self> {
let q = match boosts {
Some(boosts) => MultiMatchQuery::try_new(query, columns)

@@ -396,37 +358,7 @@ impl JsFullTextQuery {
napi::Error::from_reason(format!("Failed to create multi match query: {}", e))
})?;

- let operator = Operator::try_from(operator.as_str()).map_err(|e| {
-     napi::Error::from_reason(format!("Invalid operator for multi match query: {}", e))
- })?;
-
- Ok(Self {
-     inner: q.with_operator(operator).into(),
- })
- }
-
- #[napi(factory)]
- pub fn boolean_query(queries: Vec<(String, &JsFullTextQuery)>) -> napi::Result<Self> {
-     let mut sub_queries = Vec::with_capacity(queries.len());
-     for (occur, q) in queries {
-         let occur = Occur::try_from(occur.as_str())
-             .map_err(|e| napi::Error::from_reason(e.to_string()))?;
-         sub_queries.push((occur, q.inner.clone()));
-     }
-     Ok(Self {
-         inner: BooleanQuery::new(sub_queries).into(),
-     })
- }
-
- #[napi(getter)]
- pub fn query_type(&self) -> String {
-     match self.inner {
-         FtsQuery::Match(_) => "match".to_string(),
-         FtsQuery::Phrase(_) => "phrase".to_string(),
-         FtsQuery::Boost(_) => "boost".to_string(),
-         FtsQuery::MultiMatch(_) => "multi_match".to_string(),
-         FtsQuery::Boolean(_) => "boolean".to_string(),
-     }
- }
+ Ok(Self { inner: q.into() })
}
}
@@ -1,102 +0,0 @@
- // SPDX-License-Identifier: Apache-2.0
- // SPDX-FileCopyrightText: Copyright The LanceDB Authors
-
- use std::sync::Arc;
-
- use lancedb::{ObjectStoreRegistry, Session as LanceSession};
- use napi::bindgen_prelude::*;
- use napi_derive::*;
-
- /// A session for managing caches and object stores across LanceDB operations.
- ///
- /// Sessions allow you to configure cache sizes for index and metadata caches,
- /// which can significantly impact memory use and performance. They can
- /// also be re-used across multiple connections to share the same cache state.
- #[napi]
- #[derive(Clone)]
- pub struct Session {
-     pub(crate) inner: Arc<LanceSession>,
- }
-
- impl std::fmt::Debug for Session {
-     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-         f.debug_struct("Session")
-             .field("size_bytes", &self.inner.size_bytes())
-             .field("approx_num_items", &self.inner.approx_num_items())
-             .finish()
-     }
- }
-
- #[napi]
- impl Session {
-     /// Create a new session with custom cache sizes.
-     ///
-     /// # Parameters
-     ///
-     /// - `index_cache_size_bytes`: The size of the index cache in bytes.
-     ///   Index data is stored in memory in this cache to speed up queries.
-     ///   Defaults to 6GB if not specified.
-     /// - `metadata_cache_size_bytes`: The size of the metadata cache in bytes.
-     ///   The metadata cache stores file metadata and schema information in memory.
-     ///   This cache improves scan and write performance.
-     ///   Defaults to 1GB if not specified.
-     #[napi(constructor)]
-     pub fn new(
-         index_cache_size_bytes: Option<BigInt>,
-         metadata_cache_size_bytes: Option<BigInt>,
-     ) -> napi::Result<Self> {
-         let index_cache_size = index_cache_size_bytes
-             .map(|size| size.get_u64().1 as usize)
-             .unwrap_or(6 * 1024 * 1024 * 1024); // 6GB default
-
-         let metadata_cache_size = metadata_cache_size_bytes
-             .map(|size| size.get_u64().1 as usize)
-             .unwrap_or(1024 * 1024 * 1024); // 1GB default
-
-         let session = LanceSession::new(
-             index_cache_size,
-             metadata_cache_size,
-             Arc::new(ObjectStoreRegistry::default()),
-         );
-
-         Ok(Self {
-             inner: Arc::new(session),
-         })
-     }
-
-     /// Create a session with default cache sizes.
-     ///
-     /// This is equivalent to creating a session with 6GB index cache
-     /// and 1GB metadata cache.
-     #[napi(factory)]
-     pub fn default() -> Self {
-         Self {
-             inner: Arc::new(LanceSession::default()),
-         }
-     }
-
-     /// Get the current size of the session caches in bytes.
-     #[napi]
-     pub fn size_bytes(&self) -> BigInt {
-         BigInt::from(self.inner.size_bytes())
-     }
-
-     /// Get the approximate number of items cached in the session.
-     #[napi]
-     pub fn approx_num_items(&self) -> u32 {
-         self.inner.approx_num_items() as u32
-     }
- }
-
- // Implement FromNapiValue for Session to work with napi(object)
- impl napi::bindgen_prelude::FromNapiValue for Session {
-     unsafe fn from_napi_value(
-         env: napi::sys::napi_env,
-         napi_val: napi::sys::napi_value,
-     ) -> napi::Result<Self> {
-         let object: napi::bindgen_prelude::ClassInstance<Session> =
-             napi::bindgen_prelude::ClassInstance::from_napi_value(env, napi_val)?;
-         let copy = object.clone();
-         Ok(copy)
-     }
- }
@@ -1,5 +1,5 @@
[tool.bumpversion]
- current_version = "0.24.2"
+ current_version = "0.23.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

@@ -1,19 +0,0 @@
- These are the Python bindings of LanceDB.
- The core Rust library is in the `../rust/lancedb` directory, the rust binding
- code is in the `src/` directory and the Python bindings are in the `lancedb/` directory.
-
- Common commands:
-
- * Build: `make develop`
- * Format: `make format`
- * Lint: `make check`
- * Fix lints: `make fix`
- * Test: `make test`
- * Doc test: `make doctest`
-
- Before committing changes, run lints and then formatting.
-
- When you change the Rust code, you will need to recompile the Python bindings: `make develop`.
-
- When you export new types from Rust to Python, you must manually update `python/lancedb/_lancedb.pyi`
- with the corresponding type hints. You can run `pyright` to check for type errors in the Python code.

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
- version = "0.24.2"
+ version = "0.23.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

@@ -85,8 +85,8 @@ embeddings = [
"boto3>=1.28.57",
"awscli>=1.29.57",
"botocore>=1.31.57",
- 'ibm-watsonx-ai>=1.1.2; python_version >= "3.10"',
- "ollama>=0.3.0",
+ "ollama",
+ "ibm-watsonx-ai>=1.1.2",
]
azure = ["adlfs>=2024.2.0"]
@@ -18,7 +18,6 @@ from .remote import ClientConfig
from .remote.db import RemoteDBConnection
from .schema import vector
from .table import AsyncTable
- from ._lancedb import Session


def connect(

@@ -31,7 +30,6 @@ def connect(
request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
storage_options: Optional[Dict[str, str]] = None,
- session: Optional[Session] = None,
**kwargs: Any,
) -> DBConnection:
"""Connect to a LanceDB database.

@@ -66,12 +64,6 @@ def connect(
storage_options: dict, optional
Additional options for the storage backend. See available options at
<https://lancedb.github.io/lancedb/guides/storage/>
- session: Session, optional
-     (For LanceDB OSS only)
-     A session to use for this connection. Sessions allow you to configure
-     cache sizes for index and metadata caches, which can significantly
-     impact memory use and performance. They can also be re-used across
-     multiple connections to share the same cache state.

Examples
--------

@@ -100,7 +92,7 @@ def connect(
if api_key is None:
api_key = os.environ.get("LANCEDB_API_KEY")
if api_key is None:
- raise ValueError(f"api_key is required to connect to LanceDB cloud: {uri}")
+ raise ValueError(f"api_key is required to connected LanceDB cloud: {uri}")
if isinstance(request_thread_pool, int):
request_thread_pool = ThreadPoolExecutor(request_thread_pool)
return RemoteDBConnection(

@@ -121,7 +113,6 @@ def connect(
uri,
read_consistency_interval=read_consistency_interval,
storage_options=storage_options,
- session=session,
)


@@ -134,7 +125,6 @@ async def connect_async(
read_consistency_interval: Optional[timedelta] = None,
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
storage_options: Optional[Dict[str, str]] = None,
- session: Optional[Session] = None,
) -> AsyncConnection:
"""Connect to a LanceDB database.

@@ -168,12 +158,6 @@ async def connect_async(
storage_options: dict, optional
Additional options for the storage backend. See available options at
<https://lancedb.github.io/lancedb/guides/storage/>
- session: Session, optional
-     (For LanceDB OSS only)
-     A session to use for this connection. Sessions allow you to configure
-     cache sizes for index and metadata caches, which can significantly
-     impact memory use and performance. They can also be re-used across
-     multiple connections to share the same cache state.

Examples
--------

@@ -213,7 +197,6 @@ async def connect_async(
read_consistency_interval_secs,
client_config,
storage_options,
- session,
)
)

@@ -229,7 +212,6 @@ __all__ = [
"DBConnection",
"LanceDBConnection",
"RemoteDBConnection",
- "Session",
"__version__",
]
@@ -6,19 +6,6 @@ import pyarrow as pa
from .index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
from .remote import ClientConfig

- class Session:
-     def __init__(
-         self,
-         index_cache_size_bytes: Optional[int] = None,
-         metadata_cache_size_bytes: Optional[int] = None,
-     ): ...
-     @staticmethod
-     def default() -> "Session": ...
-     @property
-     def size_bytes(self) -> int: ...
-     @property
-     def approx_num_items(self) -> int: ...
-
class Connection(object):
uri: str
async def table_names(

@@ -102,7 +89,6 @@ async def connect(
read_consistency_interval: Optional[float],
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
storage_options: Optional[Dict[str, str]],
- session: Optional[Session],
) -> Connection: ...

class RecordBatchStream:

@@ -157,8 +143,6 @@ class VectorQuery:
def postfilter(self): ...
def refine_factor(self, refine_factor: int): ...
def nprobes(self, nprobes: int): ...
- def minimum_nprobes(self, minimum_nprobes: int): ...
- def maximum_nprobes(self, maximum_nprobes: int): ...
def bypass_vector_index(self): ...
def nearest_to_text(self, query: dict) -> HybridQuery: ...
def to_query_request(self) -> PyQueryRequest: ...

@@ -174,8 +158,6 @@ class HybridQuery:
def distance_type(self, distance_type: str): ...
def refine_factor(self, refine_factor: int): ...
def nprobes(self, nprobes: int): ...
- def minimum_nprobes(self, minimum_nprobes: int): ...
- def maximum_nprobes(self, maximum_nprobes: int): ...
def bypass_vector_index(self): ...
def to_vector_query(self) -> VectorQuery: ...
def to_fts_query(self) -> FTSQuery: ...

@@ -183,21 +165,23 @@ class HybridQuery:
def get_with_row_id(self) -> bool: ...
def to_query_request(self) -> PyQueryRequest: ...

- class FullTextQuery:
-     pass
+ class PyFullTextSearchQuery:
+     columns: Optional[List[str]]
+     query: str
+     limit: Optional[int]
+     wand_factor: Optional[float]

class PyQueryRequest:
limit: Optional[int]
offset: Optional[int]
filter: Optional[Union[str, bytes]]
- full_text_search: Optional[FullTextQuery]
+ full_text_search: Optional[PyFullTextSearchQuery]
select: Optional[Union[str, List[str]]]
fast_search: Optional[bool]
with_row_id: Optional[bool]
column: Optional[str]
query_vector: Optional[List[pa.Array]]
- minimum_nprobes: Optional[int]
- maximum_nprobes: Optional[int]
+ nprobes: Optional[int]
lower_bound: Optional[float]
upper_bound: Optional[float]
ef: Optional[int]
@@ -94,9 +94,9 @@ def data_to_reader(
else:
raise TypeError(
f"Unknown data type {type(data)}. "
- "Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
- "pyarrow Table/RecordBatch, or Pydantic models. "
- "See https://lancedb.github.io/lancedb/guides/tables/ for examples."
+ "Please check "
+ "https://lancedb.github.io/lance/read_and_write.html "
+ "to see supported types."
)


@@ -37,7 +37,6 @@ if TYPE_CHECKING:
from ._lancedb import Connection as LanceDbConnection
from .common import DATA, URI
from .embeddings import EmbeddingFunctionConfig
- from ._lancedb import Session


class DBConnection(EnforceOverrides):

@@ -248,9 +247,6 @@ class DBConnection(EnforceOverrides):
name: str
The name of the table.
index_cache_size: int, default 256
- **Deprecated**: Use session-level cache configuration instead.
- Create a Session with custom cache sizes and pass it to lancedb.connect().
-
Set the size of the index cache, specified as a number of entries

The exact meaning of an "entry" will depend on the type of index:

@@ -358,7 +354,6 @@ class LanceDBConnection(DBConnection):
*,
read_consistency_interval: Optional[timedelta] = None,
storage_options: Optional[Dict[str, str]] = None,
- session: Optional[Session] = None,
):
if not isinstance(uri, Path):
scheme = get_uri_scheme(uri)

@@ -372,7 +367,6 @@ class LanceDBConnection(DBConnection):
self._entered = False
self.read_consistency_interval = read_consistency_interval
self.storage_options = storage_options
- self.session = session

if read_consistency_interval is not None:
read_consistency_interval_secs = read_consistency_interval.total_seconds()

@@ -388,7 +382,6 @@ class LanceDBConnection(DBConnection):
read_consistency_interval_secs,
None,
storage_options,
- session,
)

self._conn = AsyncConnection(LOOP.run(do_connect()))

@@ -482,17 +475,6 @@
-------
A LanceTable object representing the table.
"""
- if index_cache_size is not None:
-     import warnings
-
-     warnings.warn(
-         "index_cache_size is deprecated. Use session-level cache "
-         "configuration instead. Create a Session with custom cache sizes "
-         "and pass it to lancedb.connect().",
-         DeprecationWarning,
-         stacklevel=2,
-     )
-
return LanceTable.open(
self,
name,

@@ -838,9 +820,6 @@ class AsyncConnection(object):
See available options at
<https://lancedb.github.io/lancedb/guides/storage/>
index_cache_size: int, default 256
- **Deprecated**: Use session-level cache configuration instead.
- Create a Session with custom cache sizes and pass it to lancedb.connect().
-
Set the size of the index cache, specified as a number of entries

The exact meaning of an "entry" will depend on the type of index:

@@ -11,7 +11,7 @@ from .instructor import InstructorEmbeddingFunction
from .ollama import OllamaEmbeddings
from .open_clip import OpenClipEmbeddings
from .openai import OpenAIEmbeddings
- from .registry import EmbeddingFunctionRegistry, get_registry, register
+ from .registry import EmbeddingFunctionRegistry, get_registry
from .sentence_transformers import SentenceTransformerEmbeddings
from .gte import GteEmbeddings
from .transformers import TransformersEmbeddingFunction, ColbertEmbeddings

@@ -9,14 +9,11 @@ from huggingface_hub import snapshot_download
from pydantic import BaseModel
from transformers import BertTokenizer

- from .utils import create_import_stub
-
try:
import mlx.core as mx
import mlx.nn as nn
except ImportError:
mx = create_import_stub("mlx.core", "mlx")
|
raise ImportError("You need to install MLX to use this model use - pip install mlx")
|
||||||
nn = create_import_stub("mlx.nn", "mlx")
|
|
||||||
|
|
||||||
|
|
||||||
def average_pool(last_hidden_state: mx.array, attention_mask: mx.array) -> mx.array:
|
def average_pool(last_hidden_state: mx.array, attention_mask: mx.array) -> mx.array:
|
||||||
@@ -75,7 +72,7 @@ class TransformerEncoder(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.layers = [
|
self.layers = [
|
||||||
TransformerEncoderLayer(dims, num_heads, mlp_dims)
|
TransformerEncoderLayer(dims, num_heads, mlp_dims)
|
||||||
for _ in range(num_layers)
|
for i in range(num_layers)
|
||||||
]
|
]
|
||||||
|
|
||||||
def __call__(self, x, mask):
|
def __call__(self, x, mask):
|
||||||
|
|||||||
@@ -2,15 +2,14 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import TYPE_CHECKING, List, Optional, Sequence, Union
|
from typing import TYPE_CHECKING, List, Optional, Union
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from ..util import attempt_import_or_raise
|
from ..util import attempt_import_or_raise
|
||||||
from .base import TextEmbeddingFunction
|
from .base import TextEmbeddingFunction
|
||||||
from .registry import register
|
from .registry import register
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
import numpy as np
|
||||||
import ollama
|
import ollama
|
||||||
|
|
||||||
|
|
||||||
@@ -29,21 +28,23 @@ class OllamaEmbeddings(TextEmbeddingFunction):
|
|||||||
keep_alive: Optional[Union[float, str]] = None
|
keep_alive: Optional[Union[float, str]] = None
|
||||||
ollama_client_kwargs: Optional[dict] = {}
|
ollama_client_kwargs: Optional[dict] = {}
|
||||||
|
|
||||||
def ndims(self) -> int:
|
def ndims(self):
|
||||||
return len(self.generate_embeddings(["foo"])[0])
|
return len(self.generate_embeddings(["foo"])[0])
|
||||||
|
|
||||||
def _compute_embedding(self, text: Sequence[str]) -> Sequence[Sequence[float]]:
|
def _compute_embedding(self, text) -> Union["np.array", None]:
|
||||||
response = self._ollama_client.embed(
|
return (
|
||||||
model=self.name,
|
self._ollama_client.embeddings(
|
||||||
input=text,
|
model=self.name,
|
||||||
options=self.options,
|
prompt=text,
|
||||||
keep_alive=self.keep_alive,
|
options=self.options,
|
||||||
|
keep_alive=self.keep_alive,
|
||||||
|
)["embedding"]
|
||||||
|
or None
|
||||||
)
|
)
|
||||||
return response.embeddings
|
|
||||||
|
|
||||||
def generate_embeddings(
|
def generate_embeddings(
|
||||||
self, texts: Union[List[str], np.ndarray]
|
self, texts: Union[List[str], "np.ndarray"]
|
||||||
) -> list[Union[np.array, None]]:
|
) -> list[Union["np.array", None]]:
|
||||||
"""
|
"""
|
||||||
Get the embeddings for the given texts
|
Get the embeddings for the given texts
|
||||||
|
|
||||||
@@ -53,8 +54,8 @@ class OllamaEmbeddings(TextEmbeddingFunction):
|
|||||||
The texts to embed
|
The texts to embed
|
||||||
"""
|
"""
|
||||||
# TODO retry, rate limit, token limit
|
# TODO retry, rate limit, token limit
|
||||||
embeddings = self._compute_embedding(texts)
|
embeddings = [self._compute_embedding(text) for text in texts]
|
||||||
return list(embeddings)
|
return embeddings
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def _ollama_client(self) -> "ollama.Client":
|
def _ollama_client(self) -> "ollama.Client":
|
||||||
|
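A hedged usage sketch for the Ollama embedding function above, assuming a local Ollama server with the named model already pulled (model name and schema are illustrative):

    from lancedb.embeddings import get_registry
    from lancedb.pydantic import LanceModel, Vector

    ollama = get_registry().get("ollama").create(name="nomic-embed-text")

    class Doc(LanceModel):
        text: str = ollama.SourceField()                      # embedded automatically on ingest
        vector: Vector(ollama.ndims()) = ollama.VectorField()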
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from typing import Dict, Optional, Type
|
from typing import Dict, Optional
|
||||||
|
|
||||||
from .base import EmbeddingFunction, EmbeddingFunctionConfig
|
from .base import EmbeddingFunction, EmbeddingFunctionConfig
|
||||||
|
|
||||||
@@ -43,7 +43,7 @@ class EmbeddingFunctionRegistry:
|
|||||||
self._functions = {}
|
self._functions = {}
|
||||||
self._variables = {}
|
self._variables = {}
|
||||||
|
|
||||||
def register(self, alias: Optional[str] = None):
|
def register(self, alias: str = None):
|
||||||
"""
|
"""
|
||||||
This creates a decorator that can be used to register
|
This creates a decorator that can be used to register
|
||||||
an EmbeddingFunction.
|
an EmbeddingFunction.
|
||||||
@@ -75,7 +75,7 @@ class EmbeddingFunctionRegistry:
|
|||||||
"""
|
"""
|
||||||
self._functions = {}
|
self._functions = {}
|
||||||
|
|
||||||
def get(self, name: str) -> Type[EmbeddingFunction]:
|
def get(self, name: str):
|
||||||
"""
|
"""
|
||||||
Fetch an embedding function class by name
|
Fetch an embedding function class by name
|
||||||
|
|
||||||
|
|||||||
@@ -21,36 +21,6 @@ from ..dependencies import pandas as pd
|
|||||||
from ..util import attempt_import_or_raise
|
from ..util import attempt_import_or_raise
|
||||||
|
|
||||||
|
|
||||||
def create_import_stub(module_name: str, package_name: str = None):
|
|
||||||
"""
|
|
||||||
Create a stub module that allows class definition but fails when used.
|
|
||||||
This allows modules to be imported for doctest collection even when
|
|
||||||
optional dependencies are not available.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
module_name : str
|
|
||||||
The name of the module to create a stub for
|
|
||||||
package_name : str, optional
|
|
||||||
The package name to suggest in the error message
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
object
|
|
||||||
A stub object that can be used in place of the module
|
|
||||||
"""
|
|
||||||
|
|
||||||
class _ImportStub:
|
|
||||||
def __getattr__(self, name):
|
|
||||||
return _ImportStub # Return stub for chained access like nn.Module
|
|
||||||
|
|
||||||
def __call__(self, *args, **kwargs):
|
|
||||||
pkg = package_name or module_name
|
|
||||||
raise ImportError(f"You need to install {pkg} to use this functionality")
|
|
||||||
|
|
||||||
return _ImportStub()
|
|
||||||
|
|
||||||
|
|
||||||
# ruff: noqa: PERF203
|
# ruff: noqa: PERF203
|
||||||
def retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1):
|
def retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1):
|
||||||
def wrapper(fn):
|
def wrapper(fn):
|
||||||
|
|||||||
@@ -137,9 +137,6 @@ class FTS:
|
|||||||
stem: bool = True
|
stem: bool = True
|
||||||
remove_stop_words: bool = True
|
remove_stop_words: bool = True
|
||||||
ascii_folding: bool = True
|
ascii_folding: bool = True
|
||||||
ngram_min_length: int = 3
|
|
||||||
ngram_max_length: int = 3
|
|
||||||
prefix_only: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
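The ngram_min_length, ngram_max_length and prefix_only fields above are tokenizer options for FTS indexes; a sketch of passing them through create_fts_index, assuming an open table `tbl` with a "text" column and that base_tokenizer accepts "ngram":

    tbl.create_fts_index(
        "text",
        use_tantivy=False,
        base_tokenizer="ngram",   # assumption: ngram tokenizer selected here
        ngram_min_length=3,
        ngram_max_length=3,
        prefix_only=False,        # index all ngrams, not just term prefixes
    )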
|||||||
@@ -4,6 +4,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
import abc
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
@@ -14,7 +15,7 @@ from typing import (
|
|||||||
Literal,
|
Literal,
|
||||||
Optional,
|
Optional,
|
||||||
Tuple,
|
Tuple,
|
||||||
TypeVar,
|
Type,
|
||||||
Union,
|
Union,
|
||||||
Any,
|
Any,
|
||||||
)
|
)
|
||||||
@@ -58,8 +59,6 @@ if TYPE_CHECKING:
|
|||||||
else:
|
else:
|
||||||
from typing_extensions import Self
|
from typing_extensions import Self
|
||||||
|
|
||||||
T = TypeVar("T", bound="LanceModel")
|
|
||||||
|
|
||||||
|
|
||||||
# Pydantic validation function for vector queries
|
# Pydantic validation function for vector queries
|
||||||
def ensure_vector_query(
|
def ensure_vector_query(
|
||||||
@@ -89,28 +88,15 @@ def ensure_vector_query(
|
|||||||
return val
|
return val
|
||||||
|
|
||||||
|
|
||||||
class FullTextQueryType(str, Enum):
|
class FullTextQueryType(Enum):
|
||||||
MATCH = "match"
|
MATCH = "match"
|
||||||
MATCH_PHRASE = "match_phrase"
|
MATCH_PHRASE = "match_phrase"
|
||||||
BOOST = "boost"
|
BOOST = "boost"
|
||||||
MULTI_MATCH = "multi_match"
|
MULTI_MATCH = "multi_match"
|
||||||
BOOLEAN = "boolean"
|
|
||||||
|
|
||||||
|
|
||||||
class FullTextOperator(str, Enum):
|
class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
||||||
AND = "AND"
|
@abc.abstractmethod
|
||||||
OR = "OR"
|
|
||||||
|
|
||||||
|
|
||||||
class Occur(str, Enum):
|
|
||||||
SHOULD = "SHOULD"
|
|
||||||
MUST = "MUST"
|
|
||||||
MUST_NOT = "MUST_NOT"
|
|
||||||
|
|
||||||
|
|
||||||
@pydantic.dataclasses.dataclass
|
|
||||||
class FullTextQuery(ABC):
|
|
||||||
@abstractmethod
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
"""
|
"""
|
||||||
Get the query type of the query.
|
Get the query type of the query.
|
||||||
@@ -120,178 +106,193 @@ class FullTextQuery(ABC):
|
|||||||
str
|
str
|
||||||
The type of the query.
|
The type of the query.
|
||||||
"""
|
"""
|
||||||
pass
|
|
||||||
|
|
||||||
def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
|
@abc.abstractmethod
|
||||||
|
def to_dict(self) -> dict:
|
||||||
"""
|
"""
|
||||||
Combine two queries with a logical AND operation.
|
Convert the query to a dictionary.
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
other : FullTextQuery
|
|
||||||
The other query to combine with.
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
FullTextQuery
|
dict
|
||||||
A new query that combines both queries with AND.
|
The query as a dictionary.
|
||||||
"""
|
"""
|
||||||
return BooleanQuery([(Occur.MUST, self), (Occur.MUST, other)])
|
|
||||||
|
|
||||||
def __or__(self, other: "FullTextQuery") -> "FullTextQuery":
|
|
||||||
"""
|
|
||||||
Combine two queries with a logical OR operation.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
other : FullTextQuery
|
|
||||||
The other query to combine with.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
FullTextQuery
|
|
||||||
A new query that combines both queries with OR.
|
|
||||||
"""
|
|
||||||
return BooleanQuery([(Occur.SHOULD, self), (Occur.SHOULD, other)])
|
|
||||||
|
|
||||||
|
|
||||||
@pydantic.dataclasses.dataclass
|
|
||||||
class MatchQuery(FullTextQuery):
|
class MatchQuery(FullTextQuery):
|
||||||
"""
|
|
||||||
Match query for full-text search.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
query : str
|
|
||||||
The query string to match against.
|
|
||||||
column : str
|
|
||||||
The name of the column to match against.
|
|
||||||
boost : float, default 1.0
|
|
||||||
The boost factor for the query.
|
|
||||||
The score of each matching document is multiplied by this value.
|
|
||||||
fuzziness : int, optional
|
|
||||||
The maximum edit distance for each term in the match query.
|
|
||||||
Defaults to 0 (exact match).
|
|
||||||
If None, fuzziness is applied automatically by the rules:
|
|
||||||
- 0 for terms with length <= 2
|
|
||||||
- 1 for terms with length <= 5
|
|
||||||
- 2 for terms with length > 5
|
|
||||||
max_expansions : int, optional
|
|
||||||
The maximum number of terms to consider for fuzzy matching.
|
|
||||||
Defaults to 50.
|
|
||||||
operator : FullTextOperator, default OR
|
|
||||||
The operator to use for combining the query results.
|
|
||||||
Can be either `AND` or `OR`.
|
|
||||||
If `AND`, all terms in the query must match.
|
|
||||||
If `OR`, at least one term in the query must match.
|
|
||||||
prefix_length : int, optional
|
|
||||||
The number of beginning characters being unchanged for fuzzy matching.
|
|
||||||
This is useful to achieve prefix matching.
|
|
||||||
"""
|
|
||||||
|
|
||||||
query: str
|
query: str
|
||||||
column: str
|
column: str
|
||||||
boost: float = pydantic.Field(1.0, kw_only=True)
|
boost: float = 1.0
|
||||||
fuzziness: int = pydantic.Field(0, kw_only=True)
|
fuzziness: int = 0
|
||||||
max_expansions: int = pydantic.Field(50, kw_only=True)
|
max_expansions: int = 50
|
||||||
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
|
||||||
prefix_length: int = pydantic.Field(0, kw_only=True)
|
def __init__(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
column: str,
|
||||||
|
*,
|
||||||
|
boost: float = 1.0,
|
||||||
|
fuzziness: int = 0,
|
||||||
|
max_expansions: int = 50,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Match query for full-text search.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
query : str
|
||||||
|
The query string to match against.
|
||||||
|
column : str
|
||||||
|
The name of the column to match against.
|
||||||
|
boost : float, default 1.0
|
||||||
|
The boost factor for the query.
|
||||||
|
The score of each matching document is multiplied by this value.
|
||||||
|
fuzziness : int, optional
|
||||||
|
The maximum edit distance for each term in the match query.
|
||||||
|
Defaults to 0 (exact match).
|
||||||
|
If None, fuzziness is applied automatically by the rules:
|
||||||
|
- 0 for terms with length <= 2
|
||||||
|
- 1 for terms with length <= 5
|
||||||
|
- 2 for terms with length > 5
|
||||||
|
max_expansions : int, optional
|
||||||
|
The maximum number of terms to consider for fuzzy matching.
|
||||||
|
Defaults to 50.
|
||||||
|
"""
|
||||||
|
super().__init__(
|
||||||
|
query=query,
|
||||||
|
column=column,
|
||||||
|
boost=boost,
|
||||||
|
fuzziness=fuzziness,
|
||||||
|
max_expansions=max_expansions,
|
||||||
|
)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.MATCH
|
return FullTextQueryType.MATCH
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
|
return {
|
||||||
|
"match": {
|
||||||
|
self.column: {
|
||||||
|
"query": self.query,
|
||||||
|
"boost": self.boost,
|
||||||
|
"fuzziness": self.fuzziness,
|
||||||
|
"max_expansions": self.max_expansions,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pydantic.dataclasses.dataclass
|
|
||||||
class PhraseQuery(FullTextQuery):
|
class PhraseQuery(FullTextQuery):
|
||||||
"""
|
|
||||||
Phrase query for full-text search.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
query : str
|
|
||||||
The query string to match against.
|
|
||||||
column : str
|
|
||||||
The name of the column to match against.
|
|
||||||
"""
|
|
||||||
|
|
||||||
query: str
|
query: str
|
||||||
column: str
|
column: str
|
||||||
slop: int = pydantic.Field(0, kw_only=True)
|
|
||||||
|
def __init__(self, query: str, column: str):
|
||||||
|
"""
|
||||||
|
Phrase query for full-text search.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
query : str
|
||||||
|
The query string to match against.
|
||||||
|
column : str
|
||||||
|
The name of the column to match against.
|
||||||
|
"""
|
||||||
|
super().__init__(query=query, column=column)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.MATCH_PHRASE
|
return FullTextQueryType.MATCH_PHRASE
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
|
return {
|
||||||
|
"match_phrase": {
|
||||||
|
self.column: self.query,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pydantic.dataclasses.dataclass
|
|
||||||
class BoostQuery(FullTextQuery):
|
class BoostQuery(FullTextQuery):
|
||||||
"""
|
|
||||||
Boost query for full-text search.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
positive : dict
|
|
||||||
The positive query object.
|
|
||||||
negative : dict
|
|
||||||
The negative query object.
|
|
||||||
negative_boost : float, default 0.5
|
|
||||||
The boost factor for the negative query.
|
|
||||||
"""
|
|
||||||
|
|
||||||
positive: FullTextQuery
|
positive: FullTextQuery
|
||||||
negative: FullTextQuery
|
negative: FullTextQuery
|
||||||
negative_boost: float = pydantic.Field(0.5, kw_only=True)
|
negative_boost: float = 0.5
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
positive: FullTextQuery,
|
||||||
|
negative: FullTextQuery,
|
||||||
|
*,
|
||||||
|
negative_boost: float = 0.5,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Boost query for full-text search.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
positive : dict
|
||||||
|
The positive query object.
|
||||||
|
negative : dict
|
||||||
|
The negative query object.
|
||||||
|
negative_boost : float
|
||||||
|
The boost factor for the negative query.
|
||||||
|
"""
|
||||||
|
super().__init__(
|
||||||
|
positive=positive, negative=negative, negative_boost=negative_boost
|
||||||
|
)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.BOOST
|
return FullTextQueryType.BOOST
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
|
return {
|
||||||
|
"boost": {
|
||||||
|
"positive": self.positive.to_dict(),
|
||||||
|
"negative": self.negative.to_dict(),
|
||||||
|
"negative_boost": self.negative_boost,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pydantic.dataclasses.dataclass
|
|
||||||
class MultiMatchQuery(FullTextQuery):
|
class MultiMatchQuery(FullTextQuery):
|
||||||
"""
|
|
||||||
Multi-match query for full-text search.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
query : str | list[Query]
|
|
||||||
If a string, the query string to match against.
|
|
||||||
columns : list[str]
|
|
||||||
The list of columns to match against.
|
|
||||||
boosts : list[float], optional
|
|
||||||
The list of boost factors for each column. If not provided,
|
|
||||||
all columns will have the same boost factor.
|
|
||||||
operator : FullTextOperator, default OR
|
|
||||||
The operator to use for combining the query results.
|
|
||||||
Can be either `AND` or `OR`.
|
|
||||||
The operator is applied to each column individually.
|
|
||||||
For example, if the operator is `AND`,
|
|
||||||
then the query "hello world" is equal to
|
|
||||||
`match("hello AND world", column1) OR match("hello AND world", column2)`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
query: str
|
query: str
|
||||||
columns: list[str]
|
columns: list[str]
|
||||||
boosts: Optional[list[float]] = pydantic.Field(None, kw_only=True)
|
boosts: list[float]
|
||||||
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
columns: list[str],
|
||||||
|
*,
|
||||||
|
boosts: Optional[list[float]] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Multi-match query for full-text search.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
query : str
|
||||||
|
The query string to match against.
|
||||||
|
|
||||||
|
columns : list[str]
|
||||||
|
The list of columns to match against.
|
||||||
|
|
||||||
|
boosts : list[float], optional
|
||||||
|
The list of boost factors for each column. If not provided,
|
||||||
|
all columns will have the same boost factor.
|
||||||
|
"""
|
||||||
|
if boosts is None:
|
||||||
|
boosts = [1.0] * len(columns)
|
||||||
|
super().__init__(query=query, columns=columns, boosts=boosts)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.MULTI_MATCH
|
return FullTextQueryType.MULTI_MATCH
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
@pydantic.dataclasses.dataclass
|
return {
|
||||||
class BooleanQuery(FullTextQuery):
|
"multi_match": {
|
||||||
"""
|
"query": self.query,
|
||||||
Boolean query for full-text search.
|
"columns": self.columns,
|
||||||
|
"boost": self.boosts,
|
||||||
Parameters
|
}
|
||||||
----------
|
}
|
||||||
queries : list[tuple(Occur, FullTextQuery)]
|
|
||||||
The list of queries with their occurrence requirements.
|
|
||||||
"""
|
|
||||||
|
|
||||||
queries: list[tuple[Occur, FullTextQuery]]
|
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
|
||||||
return FullTextQueryType.BOOLEAN
|
|
||||||
|
|
||||||
|
|
||||||
class FullTextSearchQuery(pydantic.BaseModel):
|
class FullTextSearchQuery(pydantic.BaseModel):
|
||||||
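A sketch of composing the structured full-text queries defined above (column names are illustrative; assumes the table has an FTS index and that search() accepts a FullTextQuery with query_type="fts"):

    from lancedb.query import MatchQuery, PhraseQuery, FullTextOperator

    fuzzy = MatchQuery("pupy", "text", fuzziness=1, operator=FullTextOperator.AND)
    phrase = PhraseQuery("golden retriever", "text")

    # & / | build a BooleanQuery with MUST / SHOULD clauses respectively.
    combined = fuzzy | phrase
    results = tbl.search(combined, query_type="fts").limit(10).to_list()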
@@ -444,18 +445,8 @@ class Query(pydantic.BaseModel):
|
|||||||
# which columns to return in the results
|
# which columns to return in the results
|
||||||
columns: Optional[Union[List[str], Dict[str, str]]] = None
|
columns: Optional[Union[List[str], Dict[str, str]]] = None
|
||||||
|
|
||||||
# minimum number of IVF partitions to search
|
# number of IVF partitions to search
|
||||||
#
|
nprobes: Optional[int] = None
|
||||||
# If None then a default value (20) will be used.
|
|
||||||
minimum_nprobes: Optional[int] = None
|
|
||||||
|
|
||||||
# maximum number of IVF partitions to search
|
|
||||||
#
|
|
||||||
# If None then a default value (20) will be used.
|
|
||||||
#
|
|
||||||
# If 0 then no limit will be applied and all partitions could be searched
|
|
||||||
# if needed to satisfy the limit.
|
|
||||||
maximum_nprobes: Optional[int] = None
|
|
||||||
|
|
||||||
# lower bound for distance search
|
# lower bound for distance search
|
||||||
lower_bound: Optional[float] = None
|
lower_bound: Optional[float] = None
|
||||||
@@ -493,8 +484,7 @@ class Query(pydantic.BaseModel):
|
|||||||
query.vector_column = req.column
|
query.vector_column = req.column
|
||||||
query.vector = req.query_vector
|
query.vector = req.query_vector
|
||||||
query.distance_type = req.distance_type
|
query.distance_type = req.distance_type
|
||||||
query.minimum_nprobes = req.minimum_nprobes
|
query.nprobes = req.nprobes
|
||||||
query.maximum_nprobes = req.maximum_nprobes
|
|
||||||
query.lower_bound = req.lower_bound
|
query.lower_bound = req.lower_bound
|
||||||
query.upper_bound = req.upper_bound
|
query.upper_bound = req.upper_bound
|
||||||
query.ef = req.ef
|
query.ef = req.ef
|
||||||
@@ -503,8 +493,10 @@ class Query(pydantic.BaseModel):
|
|||||||
query.postfilter = req.postfilter
|
query.postfilter = req.postfilter
|
||||||
if req.full_text_search is not None:
|
if req.full_text_search is not None:
|
||||||
query.full_text_query = FullTextSearchQuery(
|
query.full_text_query = FullTextSearchQuery(
|
||||||
columns=None,
|
columns=req.full_text_search.columns,
|
||||||
query=req.full_text_search,
|
query=req.full_text_search.query,
|
||||||
|
limit=req.full_text_search.limit,
|
||||||
|
wand_factor=req.full_text_search.wand_factor,
|
||||||
)
|
)
|
||||||
return query
|
return query
|
||||||
|
|
||||||
@@ -748,8 +740,8 @@ class LanceQueryBuilder(ABC):
|
|||||||
return self.to_arrow(timeout=timeout).to_pylist()
|
return self.to_arrow(timeout=timeout).to_pylist()
|
||||||
|
|
||||||
def to_pydantic(
|
def to_pydantic(
|
||||||
self, model: type[T], *, timeout: Optional[timedelta] = None
|
self, model: Type[LanceModel], *, timeout: Optional[timedelta] = None
|
||||||
) -> list[T]:
|
) -> List[LanceModel]:
|
||||||
"""Return the table as a list of pydantic models.
|
"""Return the table as a list of pydantic models.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
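A brief sketch of to_pydantic with an illustrative LanceModel schema (assumes `tbl` stores rows matching it):

    from lancedb.pydantic import LanceModel, Vector

    class Item(LanceModel):
        vector: Vector(2)
        text: str

    items = tbl.search([0.1, 0.2]).limit(5).to_pydantic(Item)
    for item in items:
        print(item.text)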
@@ -908,11 +900,11 @@ class LanceQueryBuilder(ABC):
|
|||||||
>>> plan = table.search(query).explain_plan(True)
|
>>> plan = table.search(query).explain_plan(True)
|
||||||
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||||
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
|
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
|
||||||
GlobalLimitExec: skip=0, fetch=10
|
GlobalLimitExec: skip=0, fetch=10
|
||||||
FilterExec: _distance@2 IS NOT NULL
|
FilterExec: _distance@2 IS NOT NULL
|
||||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||||
KNNVectorDistance: metric=l2
|
KNNVectorDistance: metric=l2
|
||||||
LanceRead: uri=..., projection=[vector], ...
|
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -942,19 +934,19 @@ class LanceQueryBuilder(ABC):
|
|||||||
>>> plan = table.search(query).analyze_plan()
|
>>> plan = table.search(query).analyze_plan()
|
||||||
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||||
AnalyzeExec verbose=true, metrics=[]
|
AnalyzeExec verbose=true, metrics=[]
|
||||||
TracedExec, metrics=[]
|
ProjectionExec: expr=[...], metrics=[...]
|
||||||
ProjectionExec: expr=[...], metrics=[...]
|
GlobalLimitExec: skip=0, fetch=10, metrics=[...]
|
||||||
GlobalLimitExec: skip=0, fetch=10, metrics=[...]
|
FilterExec: _distance@2 IS NOT NULL,
|
||||||
FilterExec: _distance@2 IS NOT NULL,
|
metrics=[output_rows=..., elapsed_compute=...]
|
||||||
metrics=[output_rows=..., elapsed_compute=...]
|
SortExec: TopK(fetch=10), expr=[...],
|
||||||
SortExec: TopK(fetch=10), expr=[...],
|
preserve_partitioning=[...],
|
||||||
preserve_partitioning=[...],
|
metrics=[output_rows=..., elapsed_compute=..., row_replacements=...]
|
||||||
metrics=[output_rows=..., elapsed_compute=..., row_replacements=...]
|
KNNVectorDistance: metric=l2,
|
||||||
KNNVectorDistance: metric=l2,
|
metrics=[output_rows=..., elapsed_compute=..., output_batches=...]
|
||||||
metrics=[output_rows=..., elapsed_compute=..., output_batches=...]
|
LanceScan: uri=..., projection=[vector], row_id=true,
|
||||||
LanceRead: uri=..., projection=[vector], ...
|
row_addr=false, ordered=false,
|
||||||
metrics=[output_rows=..., elapsed_compute=...,
|
metrics=[output_rows=..., elapsed_compute=...,
|
||||||
bytes_read=..., iops=..., requests=...]
|
bytes_read=..., iops=..., requests=...]
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -1055,8 +1047,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
super().__init__(table)
|
super().__init__(table)
|
||||||
self._query = query
|
self._query = query
|
||||||
self._distance_type = None
|
self._distance_type = None
|
||||||
self._minimum_nprobes = None
|
self._nprobes = None
|
||||||
self._maximum_nprobes = None
|
|
||||||
self._lower_bound = None
|
self._lower_bound = None
|
||||||
self._upper_bound = None
|
self._upper_bound = None
|
||||||
self._refine_factor = None
|
self._refine_factor = None
|
||||||
@@ -1119,10 +1110,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
||||||
tuning advice.
|
tuning advice.
|
||||||
|
|
||||||
This method sets both the minimum and maximum number of probes to the same
|
|
||||||
value. See `minimum_nprobes` and `maximum_nprobes` for more fine-grained
|
|
||||||
control.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
nprobes: int
|
nprobes: int
|
||||||
@@ -1133,36 +1120,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
LanceVectorQueryBuilder
|
LanceVectorQueryBuilder
|
||||||
The LanceQueryBuilder object.
|
The LanceQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
self._minimum_nprobes = nprobes
|
self._nprobes = nprobes
|
||||||
self._maximum_nprobes = nprobes
|
|
||||||
return self
|
|
||||||
|
|
||||||
def minimum_nprobes(self, minimum_nprobes: int) -> LanceVectorQueryBuilder:
|
|
||||||
"""Set the minimum number of probes to use.
|
|
||||||
|
|
||||||
See `nprobes` for more details.
|
|
||||||
|
|
||||||
These partitions will be searched on every vector query and will increase recall
|
|
||||||
at the expense of latency.
|
|
||||||
"""
|
|
||||||
self._minimum_nprobes = minimum_nprobes
|
|
||||||
return self
|
|
||||||
|
|
||||||
def maximum_nprobes(self, maximum_nprobes: int) -> LanceVectorQueryBuilder:
|
|
||||||
"""Set the maximum number of probes to use.
|
|
||||||
|
|
||||||
See `nprobes` for more details.
|
|
||||||
|
|
||||||
If this value is greater than `minimum_nprobes` then the excess partitions
|
|
||||||
will be searched only if we have not found enough results.
|
|
||||||
|
|
||||||
This can be useful when there is a narrow filter, allowing these queries to
|
|
||||||
spend more time searching and avoid potential false negatives.
|
|
||||||
|
|
||||||
If this value is 0 then no limit will be applied and all partitions could be
|
|
||||||
searched if needed to satisfy the limit.
|
|
||||||
"""
|
|
||||||
self._maximum_nprobes = maximum_nprobes
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def distance_range(
|
def distance_range(
|
||||||
@@ -1266,8 +1224,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
limit=self._limit,
|
limit=self._limit,
|
||||||
distance_type=self._distance_type,
|
distance_type=self._distance_type,
|
||||||
columns=self._columns,
|
columns=self._columns,
|
||||||
minimum_nprobes=self._minimum_nprobes,
|
nprobes=self._nprobes,
|
||||||
maximum_nprobes=self._maximum_nprobes,
|
|
||||||
lower_bound=self._lower_bound,
|
lower_bound=self._lower_bound,
|
||||||
upper_bound=self._upper_bound,
|
upper_bound=self._upper_bound,
|
||||||
refine_factor=self._refine_factor,
|
refine_factor=self._refine_factor,
|
||||||
@@ -1376,8 +1333,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
if query_string is not None and not isinstance(query_string, str):
|
if query_string is not None and not isinstance(query_string, str):
|
||||||
raise ValueError("Reranking currently only supports string queries")
|
raise ValueError("Reranking currently only supports string queries")
|
||||||
self._str_query = query_string if query_string is not None else self._str_query
|
self._str_query = query_string if query_string is not None else self._str_query
|
||||||
if reranker.score == "all":
|
|
||||||
self.with_row_id(True)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def bypass_vector_index(self) -> LanceVectorQueryBuilder:
|
def bypass_vector_index(self) -> LanceVectorQueryBuilder:
|
||||||
@@ -1455,13 +1410,10 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
|||||||
|
|
||||||
query = self._query
|
query = self._query
|
||||||
if self._phrase_query:
|
if self._phrase_query:
|
||||||
if isinstance(query, str):
|
raise NotImplementedError(
|
||||||
if not query.startswith('"') or not query.endswith('"'):
|
"Phrase query is not yet supported in Lance FTS. "
|
||||||
query = f'"{query}"'
|
"Use tantivy-based index instead for now."
|
||||||
elif isinstance(query, FullTextQuery) and not isinstance(
|
)
|
||||||
query, PhraseQuery
|
|
||||||
):
|
|
||||||
raise TypeError("Please use PhraseQuery for phrase queries.")
|
|
||||||
query = self.to_query_object()
|
query = self.to_query_object()
|
||||||
results = self._table._execute_query(query, timeout=timeout)
|
results = self._table._execute_query(query, timeout=timeout)
|
||||||
results = results.read_all()
|
results = results.read_all()
|
||||||
@@ -1573,8 +1525,6 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
|||||||
The LanceQueryBuilder object.
|
The LanceQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
self._reranker = reranker
|
self._reranker = reranker
|
||||||
if reranker.score == "all":
|
|
||||||
self.with_row_id(True)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
@@ -1638,8 +1588,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
self._fts_columns = fts_columns
|
self._fts_columns = fts_columns
|
||||||
self._norm = None
|
self._norm = None
|
||||||
self._reranker = None
|
self._reranker = None
|
||||||
self._minimum_nprobes = None
|
self._nprobes = None
|
||||||
self._maximum_nprobes = None
|
|
||||||
self._refine_factor = None
|
self._refine_factor = None
|
||||||
self._distance_type = None
|
self._distance_type = None
|
||||||
self._phrase_query = None
|
self._phrase_query = None
|
||||||
@@ -1851,8 +1800,6 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
|
|
||||||
self._norm = normalize
|
self._norm = normalize
|
||||||
self._reranker = reranker
|
self._reranker = reranker
|
||||||
if reranker.score == "all":
|
|
||||||
self.with_row_id(True)
|
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@@ -1873,24 +1820,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
LanceHybridQueryBuilder
|
LanceHybridQueryBuilder
|
||||||
The LanceHybridQueryBuilder object.
|
The LanceHybridQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
self._minimum_nprobes = nprobes
|
self._nprobes = nprobes
|
||||||
self._maximum_nprobes = nprobes
|
|
||||||
return self
|
|
||||||
|
|
||||||
def minimum_nprobes(self, minimum_nprobes: int) -> LanceHybridQueryBuilder:
|
|
||||||
"""Set the minimum number of probes to use.
|
|
||||||
|
|
||||||
See `nprobes` for more details.
|
|
||||||
"""
|
|
||||||
self._minimum_nprobes = minimum_nprobes
|
|
||||||
return self
|
|
||||||
|
|
||||||
def maximum_nprobes(self, maximum_nprobes: int) -> LanceHybridQueryBuilder:
|
|
||||||
"""Set the maximum number of probes to use.
|
|
||||||
|
|
||||||
See `nprobes` for more details.
|
|
||||||
"""
|
|
||||||
self._maximum_nprobes = maximum_nprobes
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def distance_range(
|
def distance_range(
|
||||||
@@ -2045,7 +1975,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
FilterExec: _distance@2 IS NOT NULL
|
FilterExec: _distance@2 IS NOT NULL
|
||||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||||
KNNVectorDistance: metric=l2
|
KNNVectorDistance: metric=l2
|
||||||
LanceRead: uri=..., projection=[vector], ...
|
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -2119,10 +2049,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
self._fts_query.phrase_query(True)
|
self._fts_query.phrase_query(True)
|
||||||
if self._distance_type:
|
if self._distance_type:
|
||||||
self._vector_query.metric(self._distance_type)
|
self._vector_query.metric(self._distance_type)
|
||||||
if self._minimum_nprobes:
|
if self._nprobes:
|
||||||
self._vector_query.minimum_nprobes(self._minimum_nprobes)
|
self._vector_query.nprobes(self._nprobes)
|
||||||
if self._maximum_nprobes is not None:
|
|
||||||
self._vector_query.maximum_nprobes(self._maximum_nprobes)
|
|
||||||
if self._refine_factor:
|
if self._refine_factor:
|
||||||
self._vector_query.refine_factor(self._refine_factor)
|
self._vector_query.refine_factor(self._refine_factor)
|
||||||
if self._ef:
|
if self._ef:
|
||||||
@@ -2431,7 +2359,7 @@ class AsyncQueryBase(object):
|
|||||||
FilterExec: _distance@2 IS NOT NULL
|
FilterExec: _distance@2 IS NOT NULL
|
||||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||||
KNNVectorDistance: metric=l2
|
KNNVectorDistance: metric=l2
|
||||||
LanceRead: uri=..., projection=[vector], ...
|
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -2585,7 +2513,7 @@ class AsyncQuery(AsyncQueryBase):
|
|||||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||||
)
|
)
|
||||||
# FullTextQuery object
|
# FullTextQuery object
|
||||||
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))
|
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||||
|
|
||||||
|
|
||||||
class AsyncFTSQuery(AsyncQueryBase):
|
class AsyncFTSQuery(AsyncQueryBase):
|
||||||
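A hedged sketch of passing a FullTextQuery object to the async nearest_to_text shown above (database path, table and column names are illustrative; assumes an FTS index on "text"):

    import asyncio
    import lancedb
    from lancedb.query import MatchQuery

    async def main():
        db = await lancedb.connect_async("data/sample-lancedb")
        tbl = await db.open_table("docs")
        query = MatchQuery("hello world", "text", fuzziness=1)
        return await tbl.query().nearest_to_text(query).limit(5).to_list()

    asyncio.run(main())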
@@ -2733,34 +2661,6 @@ class AsyncVectorQueryBase:
|
|||||||
self._inner.nprobes(nprobes)
|
self._inner.nprobes(nprobes)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def minimum_nprobes(self, minimum_nprobes: int) -> Self:
|
|
||||||
"""Set the minimum number of probes to use.
|
|
||||||
|
|
||||||
See `nprobes` for more details.
|
|
||||||
|
|
||||||
These partitions will be searched on every indexed vector query and will
|
|
||||||
increase recall at the expense of latency.
|
|
||||||
"""
|
|
||||||
self._inner.minimum_nprobes(minimum_nprobes)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def maximum_nprobes(self, maximum_nprobes: int) -> Self:
|
|
||||||
"""Set the maximum number of probes to use.
|
|
||||||
|
|
||||||
See `nprobes` for more details.
|
|
||||||
|
|
||||||
If this value is greater than `minimum_nprobes` then the excess partitions
|
|
||||||
will be searched only if we have not found enough results.
|
|
||||||
|
|
||||||
This can be useful when there is a narrow filter, allowing these queries to
|
|
||||||
spend more time searching and avoid potential false negatives.
|
|
||||||
|
|
||||||
If this value is 0 then no limit will be applied and all partitions could be
|
|
||||||
searched if needed to satisfy the limit.
|
|
||||||
"""
|
|
||||||
self._inner.maximum_nprobes(maximum_nprobes)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def distance_range(
|
def distance_range(
|
||||||
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
|
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
|
||||||
) -> Self:
|
) -> Self:
|
||||||
@@ -2935,7 +2835,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
|||||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||||
)
|
)
|
||||||
# FullTextQuery object
|
# FullTextQuery object
|
||||||
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query}))
|
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||||
|
|
||||||
async def to_batches(
|
async def to_batches(
|
||||||
self,
|
self,
|
||||||
@@ -3050,21 +2950,15 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
|||||||
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||||
Vector Search Plan:
|
Vector Search Plan:
|
||||||
ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
|
ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
|
||||||
Take: columns="vector, _rowid, _distance, (text)"
|
Take: columns="vector, _rowid, _distance, (text)"
|
||||||
CoalesceBatchesExec: target_batch_size=1024
|
CoalesceBatchesExec: target_batch_size=1024
|
||||||
GlobalLimitExec: skip=0, fetch=10
|
GlobalLimitExec: skip=0, fetch=10
|
||||||
FilterExec: _distance@2 IS NOT NULL
|
FilterExec: _distance@2 IS NOT NULL
|
||||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||||
KNNVectorDistance: metric=l2
|
KNNVectorDistance: metric=l2
|
||||||
LanceRead: uri=..., projection=[vector], ...
|
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||||
<BLANKLINE>
|
|
||||||
FTS Search Plan:
|
FTS Search Plan:
|
||||||
ProjectionExec: expr=[vector@2 as vector, text@3 as text, _score@1 as _score]
|
LanceScan: uri=..., projection=[vector, text], row_id=false, row_addr=false, ordered=true
|
||||||
Take: columns="_rowid, _score, (vector), (text)"
|
|
||||||
CoalesceBatchesExec: target_batch_size=1024
|
|
||||||
GlobalLimitExec: skip=0, fetch=10
|
|
||||||
MatchQuery: query=hello
|
|
||||||
<BLANKLINE>
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from lancedb._lancedb import (
|
|||||||
UpdateResult,
|
UpdateResult,
|
||||||
)
|
)
|
||||||
from lancedb.embeddings.base import EmbeddingFunctionConfig
|
from lancedb.embeddings.base import EmbeddingFunctionConfig
|
||||||
from lancedb.index import FTS, BTree, Bitmap, HnswSq, IvfFlat, IvfPq, LabelList
|
from lancedb.index import FTS, BTree, Bitmap, HnswPq, HnswSq, IvfFlat, IvfPq, LabelList
|
||||||
from lancedb.remote.db import LOOP
|
from lancedb.remote.db import LOOP
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
|
||||||
@@ -89,7 +89,7 @@ class RemoteTable(Table):
|
|||||||
|
|
||||||
def to_pandas(self):
|
def to_pandas(self):
|
||||||
"""to_pandas() is not yet supported on LanceDB cloud."""
|
"""to_pandas() is not yet supported on LanceDB cloud."""
|
||||||
raise NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||||
|
|
||||||
def checkout(self, version: Union[int, str]):
|
def checkout(self, version: Union[int, str]):
|
||||||
return LOOP.run(self._table.checkout(version))
|
return LOOP.run(self._table.checkout(version))
|
||||||
@@ -158,9 +158,6 @@ class RemoteTable(Table):
|
|||||||
stem: bool = True,
|
stem: bool = True,
|
||||||
remove_stop_words: bool = True,
|
remove_stop_words: bool = True,
|
||||||
ascii_folding: bool = True,
|
ascii_folding: bool = True,
|
||||||
ngram_min_length: int = 3,
|
|
||||||
ngram_max_length: int = 3,
|
|
||||||
prefix_only: bool = False,
|
|
||||||
):
|
):
|
||||||
config = FTS(
|
config = FTS(
|
||||||
with_position=with_position,
|
with_position=with_position,
|
||||||
@@ -171,9 +168,6 @@ class RemoteTable(Table):
|
|||||||
stem=stem,
|
stem=stem,
|
||||||
remove_stop_words=remove_stop_words,
|
remove_stop_words=remove_stop_words,
|
||||||
ascii_folding=ascii_folding,
|
ascii_folding=ascii_folding,
|
||||||
ngram_min_length=ngram_min_length,
|
|
||||||
ngram_max_length=ngram_max_length,
|
|
||||||
prefix_only=prefix_only,
|
|
||||||
)
|
)
|
||||||
LOOP.run(
|
LOOP.run(
|
||||||
self._table.create_index(
|
self._table.create_index(
|
||||||
@@ -192,8 +186,6 @@ class RemoteTable(Table):
|
|||||||
accelerator: Optional[str] = None,
|
accelerator: Optional[str] = None,
|
||||||
index_type="vector",
|
index_type="vector",
|
||||||
wait_timeout: Optional[timedelta] = None,
|
wait_timeout: Optional[timedelta] = None,
|
||||||
*,
|
|
||||||
num_bits: int = 8,
|
|
||||||
):
|
):
|
||||||
"""Create an index on the table.
|
"""Create an index on the table.
|
||||||
Currently, the only parameters that matter are
|
Currently, the only parameters that matter are
|
||||||
@@ -228,6 +220,11 @@ class RemoteTable(Table):
|
|||||||
>>> table.create_index("l2", "vector") # doctest: +SKIP
|
>>> table.create_index("l2", "vector") # doctest: +SKIP
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if num_partitions is not None:
|
||||||
|
logging.warning(
|
||||||
|
"num_partitions is not supported on LanceDB cloud."
|
||||||
|
"This parameter will be tuned automatically."
|
||||||
|
)
|
||||||
if num_sub_vectors is not None:
|
if num_sub_vectors is not None:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"num_sub_vectors is not supported on LanceDB cloud."
|
"num_sub_vectors is not supported on LanceDB cloud."
|
||||||
@@ -247,21 +244,13 @@ class RemoteTable(Table):
|
|||||||
|
|
||||||
index_type = index_type.upper()
|
index_type = index_type.upper()
|
||||||
if index_type == "VECTOR" or index_type == "IVF_PQ":
|
if index_type == "VECTOR" or index_type == "IVF_PQ":
|
||||||
config = IvfPq(
|
config = IvfPq(distance_type=metric)
|
||||||
distance_type=metric,
|
|
||||||
num_partitions=num_partitions,
|
|
||||||
num_sub_vectors=num_sub_vectors,
|
|
||||||
num_bits=num_bits,
|
|
||||||
)
|
|
||||||
elif index_type == "IVF_HNSW_PQ":
|
elif index_type == "IVF_HNSW_PQ":
|
||||||
raise ValueError(
|
config = HnswPq(distance_type=metric)
|
||||||
"IVF_HNSW_PQ is not supported on LanceDB cloud."
|
|
||||||
"Please use IVF_HNSW_SQ instead."
|
|
||||||
)
|
|
||||||
elif index_type == "IVF_HNSW_SQ":
|
elif index_type == "IVF_HNSW_SQ":
|
||||||
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
|
config = HnswSq(distance_type=metric)
|
||||||
elif index_type == "IVF_FLAT":
|
elif index_type == "IVF_FLAT":
|
||||||
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
|
config = IvfFlat(distance_type=metric)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Unknown vector index type: {index_type}. Valid options are"
|
f"Unknown vector index type: {index_type}. Valid options are"
|
||||||
|
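Following the docstring example above, a sketch of creating a supported vector index on a LanceDB Cloud table (metric, column and index type are illustrative):

    # "vector" / IVF_PQ is the default; IVF_HNSW_SQ and IVF_FLAT are also accepted.
    table.create_index("cosine", "vector", index_type="IVF_HNSW_SQ")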
|||||||
@@ -74,7 +74,9 @@ class AnswerdotaiRerankers(Reranker):
|
|||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
elif self.score == "all":
|
elif self.score == "all":
|
||||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
raise NotImplementedError(
|
||||||
|
"Answerdotai Reranker does not support score='all' yet"
|
||||||
|
)
|
||||||
combined_results = combined_results.sort_by(
|
combined_results = combined_results.sort_by(
|
||||||
[("_relevance_score", "descending")]
|
[("_relevance_score", "descending")]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -232,39 +232,6 @@ class Reranker(ABC):
|
|||||||
|
|
||||||
return deduped_table
|
return deduped_table
|
||||||
|
|
||||||
def _merge_and_keep_scores(self, vector_results: pa.Table, fts_results: pa.Table):
|
|
||||||
"""
|
|
||||||
Merge the results from the vector and FTS search and keep the scores.
|
|
||||||
This op is slower than just keeping relevance score but can be useful
|
|
||||||
for debugging.
|
|
||||||
"""
|
|
||||||
# add nulls to fts results for _distance
|
|
||||||
if "_distance" not in fts_results.column_names:
|
|
||||||
fts_results = fts_results.append_column(
|
|
||||||
"_distance",
|
|
||||||
pa.array([None] * len(fts_results), type=pa.float32()),
|
|
||||||
)
|
|
||||||
# add nulls to vector results for _score
|
|
||||||
if "_score" not in vector_results.column_names:
|
|
||||||
vector_results = vector_results.append_column(
|
|
||||||
"_score",
|
|
||||||
pa.array([None] * len(vector_results), type=pa.float32()),
|
|
||||||
)
|
|
||||||
|
|
||||||
# combine them and fill the scores
|
|
||||||
vector_results_dict = {row["_rowid"]: row for row in vector_results.to_pylist()}
|
|
||||||
fts_results_dict = {row["_rowid"]: row for row in fts_results.to_pylist()}
|
|
||||||
|
|
||||||
# merge them into vector_results
|
|
||||||
for key, value in fts_results_dict.items():
|
|
||||||
if key in vector_results_dict:
|
|
||||||
vector_results_dict[key]["_score"] = value["_score"]
|
|
||||||
else:
|
|
||||||
vector_results_dict[key] = value
|
|
||||||
|
|
||||||
combined = pa.Table.from_pylist(list(vector_results_dict.values()))
|
|
||||||
return combined
|
|
||||||
|
|
||||||
def _keep_relevance_score(self, combined_results: pa.Table):
|
def _keep_relevance_score(self, combined_results: pa.Table):
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
if "_score" in combined_results.column_names:
|
if "_score" in combined_results.column_names:
|
||||||
|
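The _merge_and_keep_scores helper above backs rerankers configured with return_score="all"; a sketch of how that surfaces in a hybrid search (reranker choice and query are illustrative, and the table is assumed to have an embedding function configured so the text query can be vectorized):

    from lancedb.rerankers import RRFReranker

    # Keep _distance and _score columns alongside _relevance_score.
    reranker = RRFReranker(return_score="all")
    results = (
        tbl.search("golden retriever", query_type="hybrid")
        .rerank(reranker)
        .limit(10)
        .to_pandas()
    )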
|||||||
@@ -92,14 +92,14 @@ class CohereReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
if self.score == "all":
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
|
||||||
else:
|
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
|
elif self.score == "all":
|
||||||
|
raise NotImplementedError(
|
||||||
|
"return_score='all' not implemented for cohere reranker"
|
||||||
|
)
|
||||||
return combined_results
|
return combined_results
|
||||||
|
|
||||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||||
|
|||||||
@@ -81,15 +81,15 @@ class CrossEncoderReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
if self.score == "all":
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
|
||||||
else:
|
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
# sort the results by _score
|
# sort the results by _score
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
|
elif self.score == "all":
|
||||||
|
raise NotImplementedError(
|
||||||
|
"return_score='all' not implemented for CrossEncoderReranker"
|
||||||
|
)
|
||||||
combined_results = combined_results.sort_by(
|
combined_results = combined_results.sort_by(
|
||||||
[("_relevance_score", "descending")]
|
[("_relevance_score", "descending")]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -97,14 +97,14 @@ class JinaReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
if self.score == "all":
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
|
||||||
else:
|
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
|
elif self.score == "all":
|
||||||
|
raise NotImplementedError(
|
||||||
|
"return_score='all' not implemented for JinaReranker"
|
||||||
|
)
|
||||||
return combined_results
|
return combined_results
|
||||||
|
|
||||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||||
|
|||||||
@@ -88,13 +88,14 @@ class OpenaiReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
if self.score == "all":
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
|
||||||
else:
|
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
|
elif self.score == "all":
|
||||||
|
raise NotImplementedError(
|
||||||
|
"OpenAI Reranker does not support score='all' yet"
|
||||||
|
)
|
||||||
|
|
||||||
combined_results = combined_results.sort_by(
|
combined_results = combined_results.sort_by(
|
||||||
[("_relevance_score", "descending")]
|
[("_relevance_score", "descending")]
|
||||||
|
|||||||
@@ -94,14 +94,14 @@ class VoyageAIReranker(Reranker):
         vector_results: pa.Table,
         fts_results: pa.Table,
     ):
-        if self.score == "all":
-            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
-        else:
-            combined_results = self.merge_results(vector_results, fts_results)
+        combined_results = self.merge_results(vector_results, fts_results)
        combined_results = self._rerank(combined_results, query)
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
+        elif self.score == "all":
+            raise NotImplementedError(
+                "return_score='all' not implemented for voyageai reranker"
+            )
         return combined_results

     def rerank_vector(self, query: str, vector_results: pa.Table):
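The four reranker hunks above all make the same change: on the base branch `return_score="all"` keeps the original `_score` and `_distance` columns next to `_relevance_score` (via `_merge_and_keep_scores`), while v0.20.0 only merges results and raises `NotImplementedError` for that mode. A minimal sketch of the user-facing difference, assuming a table named "docs" with an FTS index and a "vector" column; the connection path, table name, and query text are illustrative, not taken from this diff.

```python
import lancedb
from lancedb.rerankers import CrossEncoderReranker  # needs sentence-transformers installed

db = lancedb.connect("/tmp/reranker-demo")
table = db.open_table("docs")  # assumed: "text" + "vector" columns, FTS index in place

# The default return_score="relevance" works on both sides of this compare.
reranker = CrossEncoderReranker(return_score="relevance")
results = (
    table.search("single player experience", query_type="hybrid", vector_column_name="vector")
    .rerank(reranker=reranker)
    .to_arrow()
)

# return_score="all" is a base-branch feature: it keeps _score and _distance columns.
# On v0.20.0 the rerankers shown above raise NotImplementedError for this mode.
reranker_all = CrossEncoderReranker(return_score="all")
```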
@@ -102,9 +102,7 @@ if TYPE_CHECKING:
     )


-def _into_pyarrow_reader(
-    data, schema: Optional[pa.Schema] = None
-) -> pa.RecordBatchReader:
+def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
     from lancedb.dependencies import datasets

     if _check_for_hugging_face(data):
@@ -125,12 +123,6 @@ def _into_pyarrow_reader(
         raise ValueError("Cannot add a single dictionary to a table. Use a list.")

     if isinstance(data, list):
-        # Handle empty list case
-        if not data:
-            if schema is None:
-                raise ValueError("Cannot create table from empty list without a schema")
-            return pa.Table.from_pylist(data, schema=schema).to_reader()
-
         # convert to list of dict if data is a bunch of LanceModels
         if isinstance(data[0], LanceModel):
             schema = data[0].__class__.to_arrow_schema()
@@ -173,9 +165,9 @@ def _into_pyarrow_reader(
     else:
         raise TypeError(
             f"Unknown data type {type(data)}. "
-            "Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
-            "pyarrow Table/RecordBatch, or Pydantic models. "
-            "See https://lancedb.github.io/lancedb/guides/tables/ for examples."
+            "Please check "
+            "https://lancedb.github.io/lancedb/python/python/ "
+            "to see supported types."
         )


@@ -244,7 +236,7 @@ def _sanitize_data(
     # 1. There might be embedding columns missing that will be added
     #    in the add_embeddings step.
     # 2. If `allow_subschemas` is True, there might be columns missing.
-    reader = _into_pyarrow_reader(data, target_schema)
+    reader = _into_pyarrow_reader(data)

     reader = _append_vector_columns(reader, target_schema, metadata=metadata)

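The removed block above is the base branch's empty-list handling: `_into_pyarrow_reader` accepts a target schema, requires it when the list is empty, and builds an empty reader from it. A sketch of what that allows at the public API level, stated as an assumption about the base branch's behavior rather than anything guaranteed by v0.20.0; the database path and field names are illustrative.

```python
import lancedb
import pyarrow as pa

db = lancedb.connect("/tmp/empty-list-demo")
schema = pa.schema(
    [pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("id", pa.int64())]
)

# Base branch: an empty list plus an explicit schema yields an empty table;
# without a schema it raises ValueError, matching the removed check above.
table = db.create_table("empty_demo", data=[], schema=schema)

# Rows can then be appended later in the usual way.
table.add([{"vector": [1.0, 2.0], "id": 1}])
```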
@@ -835,7 +827,7 @@ class Table(ABC):
         ordering_field_names: Optional[Union[str, List[str]]] = None,
         replace: bool = False,
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
-        use_tantivy: bool = False,
+        use_tantivy: bool = True,
         tokenizer_name: Optional[str] = None,
         with_position: bool = False,
         # tokenizer configs:
@@ -846,9 +838,6 @@ class Table(ABC):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
-        ngram_min_length: int = 3,
-        ngram_max_length: int = 3,
-        prefix_only: bool = False,
         wait_timeout: Optional[timedelta] = None,
     ):
         """Create a full-text search index on the table.
@@ -875,7 +864,7 @@ class Table(ABC):
            The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
            language code followed by "_stem". So for english it would be "en_stem".
            For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
-        use_tantivy: bool, default False
+        use_tantivy: bool, default True
            If True, use the legacy full-text search implementation based on tantivy.
            If False, use the new full-text search implementation based on lance-index.
        with_position: bool, default False
@@ -888,7 +877,6 @@ class Table(ABC):
            - "simple": Splits text by whitespace and punctuation.
            - "whitespace": Split text by whitespace, but not punctuation.
            - "raw": No tokenization. The entire text is treated as a single token.
-           - "ngram": N-Gram tokenizer.
        language : str, default "English"
            The language to use for tokenization.
        max_token_length : int, default 40
@@ -906,12 +894,6 @@ class Table(ABC):
        ascii_folding : bool, default True
            Whether to fold ASCII characters. This converts accented characters to
            their ASCII equivalent. For example, "café" would be converted to "cafe".
-       ngram_min_length: int, default 3
-           The minimum length of an n-gram.
-       ngram_max_length: int, default 3
-           The maximum length of an n-gram.
-       prefix_only: bool, default False
-           Whether to only index the prefix of the token for ngram tokenizer.
        wait_timeout: timedelta, optional
            The timeout to wait if indexing is asynchronous.
        """
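Two things fall out of the signature and docstring hunks above: the `use_tantivy` default flips back to `True` for v0.20.0, and the `ngram` base tokenizer with its `ngram_min_length`, `ngram_max_length`, and `prefix_only` knobs exists only on the base branch. A sketch of calling the native (lance-index) indexer with those options, as described by the removed docstring lines; the table and column names are illustrative.

```python
import lancedb

db = lancedb.connect("/tmp/fts-demo")
table = db.open_table("docs")  # assumed to have a "text" column

# Pass use_tantivy=False explicitly so the call behaves the same regardless of
# which default (True on v0.20.0, False on the base branch) is in effect.
table.create_fts_index(
    "text",
    use_tantivy=False,
    base_tokenizer="ngram",   # base branch only
    ngram_min_length=3,
    ngram_max_length=3,
    prefix_only=False,
    replace=True,
)
```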
@@ -1988,7 +1970,7 @@ class LanceTable(Table):
         ordering_field_names: Optional[Union[str, List[str]]] = None,
         replace: bool = False,
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
-        use_tantivy: bool = False,
+        use_tantivy: bool = True,
         tokenizer_name: Optional[str] = None,
         with_position: bool = False,
         # tokenizer configs:
@@ -1999,9 +1981,6 @@ class LanceTable(Table):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
-        ngram_min_length: int = 3,
-        ngram_max_length: int = 3,
-        prefix_only: bool = False,
     ):
         if not use_tantivy:
             if not isinstance(field_names, str):
@@ -2017,9 +1996,6 @@ class LanceTable(Table):
                     "stem": stem,
                     "remove_stop_words": remove_stop_words,
                     "ascii_folding": ascii_folding,
-                    "ngram_min_length": ngram_min_length,
-                    "ngram_max_length": ngram_max_length,
-                    "prefix_only": prefix_only,
                 }
             else:
                 tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
@@ -2089,9 +2065,6 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
-                "ngram_min_length": 3,
-                "ngram_max_length": 3,
-                "prefix_only": False,
             }
         elif tokenizer_name == "raw":
             return {
@@ -2102,9 +2075,6 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
-                "ngram_min_length": 3,
-                "ngram_max_length": 3,
-                "prefix_only": False,
             }
         elif tokenizer_name == "whitespace":
             return {
@@ -2115,9 +2085,6 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
-                "ngram_min_length": 3,
-                "ngram_max_length": 3,
-                "prefix_only": False,
             }

         # or it's with language stemming with pattern like "en_stem"
@@ -2136,9 +2103,6 @@ class LanceTable(Table):
                 "stem": True,
                 "remove_stop_words": False,
                 "ascii_folding": False,
-                "ngram_min_length": 3,
-                "ngram_max_length": 3,
-                "prefix_only": False,
             }

     def add(
@@ -3673,15 +3637,8 @@ class AsyncTable:
             )
             if query.distance_type is not None:
                 async_query = async_query.distance_type(query.distance_type)
-            if query.minimum_nprobes is not None and query.maximum_nprobes is not None:
-                # Set both to the minimum first to avoid min > max error.
-                async_query = async_query.nprobes(
-                    query.minimum_nprobes
-                ).maximum_nprobes(query.maximum_nprobes)
-            elif query.minimum_nprobes is not None:
-                async_query = async_query.minimum_nprobes(query.minimum_nprobes)
-            elif query.maximum_nprobes is not None:
-                async_query = async_query.maximum_nprobes(query.maximum_nprobes)
+            if query.nprobes is not None:
+                async_query = async_query.nprobes(query.nprobes)
             if query.refine_factor is not None:
                 async_query = async_query.refine_factor(query.refine_factor)
             if query.vector_column:
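The AsyncTable hunk above is where the base branch translates `minimum_nprobes`/`maximum_nprobes` from a serialized query into builder calls, while v0.20.0 only knows a single `nprobes`. A sketch of the two styles on the synchronous query builder, assuming a table with an IVF-style ANN index; the path, vector, and limits are illustrative.

```python
import lancedb

db = lancedb.connect("/tmp/nprobes-demo")
table = db.open_table("vectors")  # assumed to carry an ANN (IVF) index

# Available on both sides of this compare: a single probe count.
rows = table.search([0.0, 0.0]).nprobes(30).limit(10).to_list()

# Base branch only: probe at least 2 and at most 4 partitions, letting the
# engine stop early once enough candidates have been found.
rows = (
    table.search([0.0, 0.0])
    .minimum_nprobes(2)
    .maximum_nprobes(4)
    .limit(10)
    .to_list()
)
```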
@@ -25,4 +25,4 @@ IndexType = Literal[
 ]

 # Tokenizer literals
-BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
+BaseTokenizerType = Literal["simple", "raw", "whitespace"]
@@ -6,7 +6,7 @@ import lancedb

 # --8<-- [end:import-lancedb]
 # --8<-- [start:import-numpy]
-from lancedb.query import BooleanQuery, BoostQuery, MatchQuery, Occur
+from lancedb.query import BoostQuery, MatchQuery
 import numpy as np
 import pyarrow as pa

@@ -191,15 +191,6 @@ def test_fts_fuzzy_query():
         "food",  # 1 insertion
     }

-    results = table.search(
-        MatchQuery("foo", "text", fuzziness=1, prefix_length=3)
-    ).to_pandas()
-    assert len(results) == 2
-    assert set(results["text"].to_list()) == {
-        "foo",
-        "food",
-    }
-

 @pytest.mark.skipif(
     os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
@@ -249,60 +240,6 @@ def test_fts_boost_query():
     )


-@pytest.mark.skipif(
-    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
-)
-def test_fts_boolean_query(tmp_path):
-    uri = tmp_path / "boolean-example"
-    db = lancedb.connect(uri)
-    table = db.create_table(
-        "my_table_fts_boolean",
-        data=[
-            {"text": "The cat and dog are playing"},
-            {"text": "The cat is sleeping"},
-            {"text": "The dog is barking"},
-            {"text": "The dog chases the cat"},
-        ],
-        mode="overwrite",
-    )
-    table.create_fts_index("text", use_tantivy=False, replace=True)
-
-    # SHOULD
-    results = table.search(
-        MatchQuery("cat", "text") | MatchQuery("dog", "text")
-    ).to_pandas()
-    assert len(results) == 4
-    assert set(results["text"].to_list()) == {
-        "The cat and dog are playing",
-        "The cat is sleeping",
-        "The dog is barking",
-        "The dog chases the cat",
-    }
-    # MUST
-    results = table.search(
-        MatchQuery("cat", "text") & MatchQuery("dog", "text")
-    ).to_pandas()
-    assert len(results) == 2
-    assert set(results["text"].to_list()) == {
-        "The cat and dog are playing",
-        "The dog chases the cat",
-    }
-
-    # MUST NOT
-    results = table.search(
-        BooleanQuery(
-            [
-                (Occur.MUST, MatchQuery("cat", "text")),
-                (Occur.MUST_NOT, MatchQuery("dog", "text")),
-            ]
-        )
-    ).to_pandas()
-    assert len(results) == 1
-    assert set(results["text"].to_list()) == {
-        "The cat is sleeping",
-    }
-
-
 @pytest.mark.skipif(
     os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
 )
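The removed `test_fts_boolean_query` above exercises the base branch's boolean full-text operators: `|` (SHOULD) and `&` (MUST) on `MatchQuery`, plus an explicit `BooleanQuery` with `Occur.MUST`/`Occur.MUST_NOT`. A condensed sketch of the same API, usable only on the base branch; as the import hunk above shows, `BooleanQuery` and `Occur` are not exported on v0.20.0. The data is the same toy set the deleted test used.

```python
import lancedb
from lancedb.query import BooleanQuery, MatchQuery, Occur

db = lancedb.connect("/tmp/boolean-fts-demo")
table = db.create_table(
    "pets",
    data=[{"text": "The cat is sleeping"}, {"text": "The dog chases the cat"}],
)
table.create_fts_index("text", use_tantivy=False)

# SHOULD: rows matching either term.
either = table.search(MatchQuery("cat", "text") | MatchQuery("dog", "text")).to_pandas()
# MUST: rows matching both terms.
both = table.search(MatchQuery("cat", "text") & MatchQuery("dog", "text")).to_pandas()
# MUST_NOT: cats without dogs.
cat_without_dog = table.search(
    BooleanQuery(
        [
            (Occur.MUST, MatchQuery("cat", "text")),
            (Occur.MUST_NOT, MatchQuery("dog", "text")),
        ]
    )
).to_pandas()
```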
@@ -33,11 +33,8 @@ tantivy = pytest.importorskip("tantivy")

 @pytest.fixture
 def table(tmp_path) -> ldb.table.LanceTable:
-    # Use local random state to avoid affecting other tests
-    rng = np.random.RandomState(42)
-    local_random = random.Random(42)
     db = ldb.connect(tmp_path)
-    vectors = [rng.randn(128) for _ in range(100)]
+    vectors = [np.random.randn(128) for _ in range(100)]

     text_nouns = ("puppy", "car")
     text2_nouns = ("rabbit", "girl", "monkey")
@@ -47,10 +44,10 @@ def table(tmp_path) -> ldb.table.LanceTable:
     text = [
         " ".join(
             [
-                text_nouns[local_random.randrange(0, len(text_nouns))],
-                verbs[local_random.randrange(0, 5)],
-                adv[local_random.randrange(0, 5)],
-                adj[local_random.randrange(0, 5)],
+                text_nouns[random.randrange(0, len(text_nouns))],
+                verbs[random.randrange(0, 5)],
+                adv[random.randrange(0, 5)],
+                adj[random.randrange(0, 5)],
             ]
         )
         for _ in range(100)
@@ -58,15 +55,15 @@ def table(tmp_path) -> ldb.table.LanceTable:
     text2 = [
         " ".join(
             [
-                text2_nouns[local_random.randrange(0, len(text2_nouns))],
-                verbs[local_random.randrange(0, 5)],
-                adv[local_random.randrange(0, 5)],
-                adj[local_random.randrange(0, 5)],
+                text2_nouns[random.randrange(0, len(text2_nouns))],
+                verbs[random.randrange(0, 5)],
+                adv[random.randrange(0, 5)],
+                adj[random.randrange(0, 5)],
             ]
         )
         for _ in range(100)
     ]
-    count = [local_random.randint(1, 10000) for _ in range(100)]
+    count = [random.randint(1, 10000) for _ in range(100)]
     table = db.create_table(
         "test",
         data=pd.DataFrame(
@@ -85,11 +82,8 @@ def table(tmp_path) -> ldb.table.LanceTable:

 @pytest.fixture
 async def async_table(tmp_path) -> ldb.table.AsyncTable:
-    # Use local random state to avoid affecting other tests
-    rng = np.random.RandomState(42)
-    local_random = random.Random(42)
     db = await ldb.connect_async(tmp_path)
-    vectors = [rng.randn(128) for _ in range(100)]
+    vectors = [np.random.randn(128) for _ in range(100)]

     text_nouns = ("puppy", "car")
     text2_nouns = ("rabbit", "girl", "monkey")
@@ -99,10 +93,10 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
     text = [
         " ".join(
             [
-                text_nouns[local_random.randrange(0, len(text_nouns))],
-                verbs[local_random.randrange(0, 5)],
-                adv[local_random.randrange(0, 5)],
-                adj[local_random.randrange(0, 5)],
+                text_nouns[random.randrange(0, len(text_nouns))],
+                verbs[random.randrange(0, 5)],
+                adv[random.randrange(0, 5)],
+                adj[random.randrange(0, 5)],
             ]
         )
         for _ in range(100)
@@ -110,15 +104,15 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
     text2 = [
         " ".join(
             [
-                text2_nouns[local_random.randrange(0, len(text2_nouns))],
-                verbs[local_random.randrange(0, 5)],
-                adv[local_random.randrange(0, 5)],
-                adj[local_random.randrange(0, 5)],
+                text2_nouns[random.randrange(0, len(text2_nouns))],
+                verbs[random.randrange(0, 5)],
+                adv[random.randrange(0, 5)],
+                adj[random.randrange(0, 5)],
             ]
         )
         for _ in range(100)
     ]
-    count = [local_random.randint(1, 10000) for _ in range(100)]
+    count = [random.randint(1, 10000) for _ in range(100)]
     table = await db.create_table(
         "test",
         data=pd.DataFrame(
@@ -221,19 +215,6 @@ def test_search_fts(table, use_tantivy):
     assert len(results) == 5
     assert len(results[0]) == 3  # id, text, _score

-    # Test boolean query
-    results = (
-        table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
-        .select(["id", "text"])
-        .limit(5)
-        .to_list()
-    )
-    assert len(results) == 5
-    assert len(results[0]) == 3  # id, text, _score
-    for r in results:
-        assert "puppy" in r["text"]
-        assert "runs" in r["text"]
-

 @pytest.mark.asyncio
 async def test_fts_select_async(async_table):
@@ -675,46 +656,3 @@ def test_fts_on_list(mem_db: DBConnection):

     res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
     assert len(res) == 2
-
-
-def test_fts_ngram(mem_db: DBConnection):
-    data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
-    table = mem_db.create_table("test", data=data)
-    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
-
-    results = table.search("lan", query_type="fts").limit(10).to_list()
-    assert len(results) == 2
-    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
-
-    results = (
-        table.search("nce", query_type="fts").limit(10).to_list()
-    )  # spellchecker:disable-line
-    assert len(results) == 2
-    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
-
-    # the default min_ngram_length is 3, so "la" should not match
-    results = table.search("la", query_type="fts").limit(10).to_list()
-    assert len(results) == 0
-
-    # test setting min_ngram_length and prefix_only
-    table.create_fts_index(
-        "text",
-        use_tantivy=False,
-        base_tokenizer="ngram",
-        replace=True,
-        ngram_min_length=2,
-        prefix_only=True,
-    )
-
-    results = table.search("lan", query_type="fts").limit(10).to_list()
-    assert len(results) == 2
-    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
-
-    results = (
-        table.search("nce", query_type="fts").limit(10).to_list()
-    )  # spellchecker:disable-line
-    assert len(results) == 0
-
-    results = table.search("la", query_type="fts").limit(10).to_list()
-    assert len(results) == 2
-    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
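The deleted `test_fts_ngram` above documents how the base branch's ngram tokenizer behaves: with the default `ngram_min_length=3`, three-character fragments such as "lan" or "nce" match anywhere inside a token, shorter queries match nothing, and `prefix_only=True` restricts matching to token prefixes. A trimmed sketch of that behavior (base branch only), mirroring the removed test but using a regular on-disk connection instead of the test's `mem_db` fixture.

```python
import lancedb
import pyarrow as pa

db = lancedb.connect("/tmp/ngram-demo")
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
table = db.create_table("ngram_demo", data=data)

table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
assert len(table.search("lan", query_type="fts").to_list()) == 2  # substring match
assert len(table.search("la", query_type="fts").to_list()) == 0   # below min length

table.create_fts_index(
    "text",
    use_tantivy=False,
    base_tokenizer="ngram",
    replace=True,
    ngram_min_length=2,
    prefix_only=True,
)
assert len(table.search("la", query_type="fts").to_list()) == 2   # prefixes only now
```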
@@ -166,7 +166,7 @@ async def test_explain_plan(table: AsyncTable):
     assert "Vector Search Plan" in plan
     assert "KNNVectorDistance" in plan
     assert "FTS Search Plan" in plan
-    assert "LanceRead" in plan
+    assert "LanceScan" in plan


 @pytest.mark.asyncio
@@ -25,8 +25,6 @@ from lancedb.query import (
     AsyncQueryBase,
     AsyncVectorQuery,
     LanceVectorQueryBuilder,
-    MatchQuery,
-    PhraseQuery,
     Query,
     FullTextSearchQuery,
 )
@@ -272,9 +270,7 @@ async def test_distance_range_with_new_rows_async():
     # append more rows so that execution plan would be mixed with ANN & Flat KNN
     new_data = pa.table(
         {
-            "vector": pa.FixedShapeTensorArray.from_numpy_ndarray(
-                np.random.rand(4, 2) + 1
-            ),
+            "vector": pa.FixedShapeTensorArray.from_numpy_ndarray(np.random.rand(4, 2)),
         }
     )
     await table.add(new_data)
@@ -441,53 +437,6 @@ def test_query_builder_with_filter(table):
     assert all(np.array(rs[0]["vector"]) == [3, 4])


-def test_invalid_nprobes_sync(table):
-    with pytest.raises(ValueError, match="minimum_nprobes must be greater than 0"):
-        LanceVectorQueryBuilder(table, [0, 0], "vector").minimum_nprobes(0).to_list()
-    with pytest.raises(
-        ValueError,
-        match="maximum_nprobes must be greater than or equal to minimum_nprobes",
-    ):
-        LanceVectorQueryBuilder(table, [0, 0], "vector").maximum_nprobes(5).to_list()
-    with pytest.raises(
-        ValueError,
-        match="minimum_nprobes must be less than or equal to maximum_nprobes",
-    ):
-        LanceVectorQueryBuilder(table, [0, 0], "vector").minimum_nprobes(100).to_list()
-
-
-def test_nprobes_works_sync(table):
-    LanceVectorQueryBuilder(table, [0, 0], "vector").nprobes(30).to_list()
-
-
-def test_nprobes_min_max_works_sync(table):
-    LanceVectorQueryBuilder(table, [0, 0], "vector").minimum_nprobes(2).maximum_nprobes(
-        4
-    ).to_list()
-
-
-def test_multiple_nprobes_calls_works_sync(table):
-    LanceVectorQueryBuilder(table, [0, 0], "vector").nprobes(30).maximum_nprobes(
-        20
-    ).minimum_nprobes(20).to_list()
-
-
-@pytest.mark.asyncio
-async def test_invalid_nprobes_async(table_async: AsyncTable):
-    with pytest.raises(ValueError, match="minimum_nprobes must be greater than 0"):
-        await table_async.vector_search([0, 0]).minimum_nprobes(0).to_list()
-    with pytest.raises(
-        ValueError,
-        match="maximum_nprobes must be greater than or equal to minimum_nprobes",
-    ):
-        await table_async.vector_search([0, 0]).maximum_nprobes(5).to_list()
-    with pytest.raises(
-        ValueError,
-        match="minimum_nprobes must be less than or equal to maximum_nprobes",
-    ):
-        await table_async.vector_search([0, 0]).minimum_nprobes(100).to_list()
-
-
 def test_query_builder_with_prefilter(table):
     df = (
         LanceVectorQueryBuilder(table, [0, 0], "vector")
@@ -634,21 +583,6 @@ async def test_query_async(table_async: AsyncTable):
         table_async.query().nearest_to(pa.array([1, 2])).nprobes(10),
         expected_num_rows=2,
     )
-    await check_query(
-        table_async.query().nearest_to(pa.array([1, 2])).minimum_nprobes(10),
-        expected_num_rows=2,
-    )
-    await check_query(
-        table_async.query().nearest_to(pa.array([1, 2])).maximum_nprobes(30),
-        expected_num_rows=2,
-    )
-    await check_query(
-        table_async.query()
-        .nearest_to(pa.array([1, 2]))
-        .minimum_nprobes(10)
-        .maximum_nprobes(20),
-        expected_num_rows=2,
-    )
     await check_query(
         table_async.query().nearest_to(pa.array([1, 2])).bypass_vector_index(),
         expected_num_rows=2,
@@ -797,83 +731,6 @@ async def test_explain_plan_async(table_async: AsyncTable):
     assert "KNN" in plan


-@pytest.mark.asyncio
-async def test_explain_plan_fts(table_async: AsyncTable):
-    """Test explain plan for FTS queries"""
-    # Create FTS index
-    from lancedb.index import FTS
-
-    await table_async.create_index("text", config=FTS())
-
-    # Test pure FTS query
-    query = await table_async.search("dog", query_type="fts", fts_columns="text")
-    plan = await query.explain_plan()
-    # Should show FTS details (issue #2465 is now fixed)
-    assert "MatchQuery: query=dog" in plan
-    assert "GlobalLimitExec" in plan  # Default limit
-
-    # Test FTS query with limit
-    query_with_limit = await table_async.search(
-        "dog", query_type="fts", fts_columns="text"
-    )
-    plan_with_limit = await query_with_limit.limit(1).explain_plan()
-    assert "MatchQuery: query=dog" in plan_with_limit
-    assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
-
-    # Test FTS query with offset and limit
-    query_with_offset = await table_async.search(
-        "dog", query_type="fts", fts_columns="text"
-    )
-    plan_with_offset = await query_with_offset.offset(1).limit(1).explain_plan()
-    assert "MatchQuery: query=dog" in plan_with_offset
-    assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
-
-
-@pytest.mark.asyncio
-async def test_explain_plan_vector_with_limit_offset(table_async: AsyncTable):
-    """Test explain plan for vector queries with limit and offset"""
-    # Test vector query with limit
-    plan_with_limit = await (
-        table_async.query().nearest_to(pa.array([1, 2])).limit(1).explain_plan()
-    )
-    assert "KNN" in plan_with_limit
-    assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
-
-    # Test vector query with offset and limit
-    plan_with_offset = await (
-        table_async.query()
-        .nearest_to(pa.array([1, 2]))
-        .offset(1)
-        .limit(1)
-        .explain_plan()
-    )
-    assert "KNN" in plan_with_offset
-    assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
-
-
-@pytest.mark.asyncio
-async def test_explain_plan_with_filters(table_async: AsyncTable):
-    """Test explain plan for queries with filters"""
-    # Test vector query with filter
-    plan_with_filter = await (
-        table_async.query().nearest_to(pa.array([1, 2])).where("id = 1").explain_plan()
-    )
-    assert "KNN" in plan_with_filter
-    assert "LanceRead" in plan_with_filter
-
-    # Test FTS query with filter
-    from lancedb.index import FTS
-
-    await table_async.create_index("text", config=FTS())
-    query_fts_filter = await table_async.search(
-        "dog", query_type="fts", fts_columns="text"
-    )
-    plan_fts_filter = await query_fts_filter.where("id = 1").explain_plan()
-    assert "MatchQuery: query=dog" in plan_fts_filter
-    assert "LanceRead" in plan_fts_filter
-    assert "full_filter=id = Int64(1)" in plan_fts_filter  # Should show filter details
-
-
 @pytest.mark.asyncio
 async def test_query_camelcase_async(tmp_path):
     db = await lancedb.connect_async(tmp_path)
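The three deleted tests above cover `explain_plan()` output on the base branch, where the plan surfaces the parsed `MatchQuery`, the `GlobalLimitExec` skip/fetch values, and pushed-down filters. A minimal sketch of inspecting a plan through the async API; the database path, index, and filter are illustrative, and the exact node names printed depend on the version you run.

```python
import asyncio
import lancedb
from lancedb.index import FTS


async def main() -> None:
    db = await lancedb.connect_async("/tmp/explain-demo")
    table = await db.open_table("docs")  # assumed: "text" and "id" columns
    await table.create_index("text", config=FTS())

    query = await table.search("dog", query_type="fts", fts_columns="text")
    plan = await query.where("id = 1").limit(1).explain_plan()
    # On the base branch this includes MatchQuery, LanceRead, and GlobalLimitExec nodes.
    print(plan)


asyncio.run(main())
```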
@@ -1052,39 +909,7 @@ def test_query_serialization_sync(table: lancedb.table.Table):

     q = table.search([5.0, 6.0]).nprobes(10).refine_factor(5).to_query_object()
     check_set_props(
-        q,
-        vector_column="vector",
-        vector=[5.0, 6.0],
-        minimum_nprobes=10,
-        maximum_nprobes=10,
-        refine_factor=5,
-    )
-
-    q = table.search([5.0, 6.0]).minimum_nprobes(10).to_query_object()
-    check_set_props(
-        q,
-        vector_column="vector",
-        vector=[5.0, 6.0],
-        minimum_nprobes=10,
-        maximum_nprobes=None,
-    )
-
-    q = table.search([5.0, 6.0]).nprobes(50).to_query_object()
-    check_set_props(
-        q,
-        vector_column="vector",
-        vector=[5.0, 6.0],
-        minimum_nprobes=50,
-        maximum_nprobes=50,
-    )
-
-    q = table.search([5.0, 6.0]).maximum_nprobes(10).to_query_object()
-    check_set_props(
-        q,
-        vector_column="vector",
-        vector=[5.0, 6.0],
-        maximum_nprobes=10,
-        minimum_nprobes=None,
+        q, vector_column="vector", vector=[5.0, 6.0], nprobes=10, refine_factor=5
     )

     q = table.search([5.0, 6.0]).distance_range(0.0, 1.0).to_query_object()
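The sync-serialization hunk above shows that on the base branch a plain `nprobes(10)` call is recorded as both `minimum_nprobes` and `maximum_nprobes` on the serialized query object, whereas v0.20.0 keeps a single `nprobes` field. A small self-contained sketch of checking that; the database path and data are illustrative, and `to_query_object()` is the builder method the test exercises on both sides.

```python
import lancedb

db = lancedb.connect("/tmp/query-object-demo")
table = db.create_table("vecs", data=[{"vector": [1.0, 2.0]}, {"vector": [3.0, 4.0]}])

q = table.search([5.0, 6.0]).nprobes(10).refine_factor(5).to_query_object()

# Base branch: one nprobes() call fills in both bounds on the Query object.
assert q.minimum_nprobes == 10 and q.maximum_nprobes == 10
# v0.20.0: the same call is serialized as a single `nprobes` field instead.
```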
@@ -1136,8 +961,7 @@ async def test_query_serialization_async(table_async: AsyncTable):
         limit=10,
         vector=sample_vector,
         postfilter=False,
-        minimum_nprobes=20,
-        maximum_nprobes=20,
+        nprobes=20,
         with_row_id=False,
         bypass_vector_index=False,
     )
@@ -1147,20 +971,7 @@ async def test_query_serialization_async(table_async: AsyncTable):
         q,
         vector=sample_vector,
         postfilter=False,
-        minimum_nprobes=20,
-        maximum_nprobes=20,
-        with_row_id=False,
-        bypass_vector_index=False,
-        limit=10,
-    )
-
-    q = (await table_async.search([5.0, 6.0])).nprobes(50).to_query_object()
-    check_set_props(
-        q,
-        vector=sample_vector,
-        postfilter=False,
-        minimum_nprobes=50,
-        maximum_nprobes=50,
+        nprobes=20,
         with_row_id=False,
         bypass_vector_index=False,
         limit=10,
@@ -1179,8 +990,7 @@ async def test_query_serialization_async(table_async: AsyncTable):
         filter="id = 1",
         postfilter=True,
         vector=sample_vector,
-        minimum_nprobes=20,
-        maximum_nprobes=20,
+        nprobes=20,
         with_row_id=False,
         bypass_vector_index=False,
     )
@@ -1194,8 +1004,7 @@ async def test_query_serialization_async(table_async: AsyncTable):
     check_set_props(
         q,
         vector=sample_vector,
-        minimum_nprobes=10,
-        maximum_nprobes=10,
+        nprobes=10,
         refine_factor=5,
         postfilter=False,
         with_row_id=False,
@@ -1203,18 +1012,6 @@ async def test_query_serialization_async(table_async: AsyncTable):
         limit=10,
     )

-    q = (await table_async.search([5.0, 6.0])).minimum_nprobes(5).to_query_object()
-    check_set_props(
-        q,
-        vector=sample_vector,
-        minimum_nprobes=5,
-        maximum_nprobes=20,
-        postfilter=False,
-        with_row_id=False,
-        bypass_vector_index=False,
-        limit=10,
-    )
-
     q = (
         (await table_async.search([5.0, 6.0]))
         .distance_range(0.0, 1.0)
@@ -1226,8 +1023,7 @@ async def test_query_serialization_async(table_async: AsyncTable):
         lower_bound=0.0,
         upper_bound=1.0,
         postfilter=False,
-        minimum_nprobes=20,
-        maximum_nprobes=20,
+        nprobes=20,
         with_row_id=False,
         bypass_vector_index=False,
         limit=10,
@@ -1239,8 +1035,7 @@ async def test_query_serialization_async(table_async: AsyncTable):
         distance_type="cosine",
         vector=sample_vector,
         postfilter=False,
-        minimum_nprobes=20,
-        maximum_nprobes=20,
+        nprobes=20,
         with_row_id=False,
         bypass_vector_index=False,
         limit=10,
@@ -1252,8 +1047,7 @@ async def test_query_serialization_async(table_async: AsyncTable):
         ef=7,
         vector=sample_vector,
         postfilter=False,
-        minimum_nprobes=20,
-        maximum_nprobes=20,
+        nprobes=20,
         with_row_id=False,
         bypass_vector_index=False,
         limit=10,
@@ -1265,34 +1059,24 @@ async def test_query_serialization_async(table_async: AsyncTable):
         bypass_vector_index=True,
         vector=sample_vector,
         postfilter=False,
-        minimum_nprobes=20,
-        maximum_nprobes=20,
+        nprobes=20,
         with_row_id=False,
         limit=10,
     )

     # FTS queries
-    match_query = MatchQuery("foo", "text")
-    q = (await table_async.search(match_query)).limit(10).to_query_object()
+    q = (await table_async.search("foo")).limit(10).to_query_object()
     check_set_props(
         q,
         limit=10,
-        full_text_query=FullTextSearchQuery(columns=None, query=match_query),
+        full_text_query=FullTextSearchQuery(columns=[], query="foo"),
         with_row_id=False,
     )

-    q = (await table_async.search(match_query)).to_query_object()
+    q = (await table_async.search("foo", query_type="fts")).to_query_object()
     check_set_props(
         q,
-        full_text_query=FullTextSearchQuery(columns=None, query=match_query),
-        with_row_id=False,
-    )
-
-    phrase_query = PhraseQuery("foo", "text", slop=1)
-    q = (await table_async.search(phrase_query)).to_query_object()
-    check_set_props(
-        q,
-        full_text_query=FullTextSearchQuery(columns=None, query=phrase_query),
+        full_text_query=FullTextSearchQuery(columns=[], query="foo"),
         with_row_id=False,
     )

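The tail of the async-serialization hunk shows another base-branch difference: `search()` accepts structured `MatchQuery`/`PhraseQuery` objects and records them in `FullTextSearchQuery(columns=None, query=<object>)`, while v0.20.0 serializes a plain string with `columns=[]`. A short sketch of the structured form (base branch), assuming a table whose "text" column already has an FTS index; the path and query terms are illustrative.

```python
import lancedb
from lancedb.query import MatchQuery, PhraseQuery

db = lancedb.connect("/tmp/structured-fts-demo")
table = db.open_table("docs")  # "text" column with an FTS index is assumed

# Fuzzy single-term match (tolerates one edit) and an exact phrase with one token of slop.
fuzzy = table.search(MatchQuery("foo", "text", fuzziness=1)).limit(10).to_list()
phrase = table.search(PhraseQuery("lance database", "text", slop=1)).limit(10).to_list()
```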
@@ -1359,20 +1143,3 @@ async def test_query_timeout_async(tmp_path):
             .nearest_to([0.0, 0.0])
             .to_list(timeout=timedelta(0))
         )
-
-
-def test_search_empty_table(mem_db):
-    """Test searching on empty table should not crash
-
-    Regression test for issue #303:
-    https://github.com/lancedb/lancedb/issues/303
-    Searching on empty table produces scary error message
-    """
-    schema = pa.schema(
-        [pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("id", pa.int64())]
-    )
-    table = mem_db.create_table("test_empty_search", schema=schema)
-
-    # Search on empty table should return empty results, not crash
-    results = table.search([1.0, 2.0]).limit(5).to_list()
-    assert results == []
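The deleted `test_search_empty_table` above was a regression test for issue #303 on the base branch: a vector search against a table with no rows should return an empty list rather than fail. A minimal sketch of the scenario it covered, using an on-disk connection in place of the test's `mem_db` fixture; the path and schema are illustrative.

```python
import lancedb
import pyarrow as pa

db = lancedb.connect("/tmp/empty-search-demo")
schema = pa.schema(
    [pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("id", pa.int64())]
)
table = db.create_table("empty_search", schema=schema)

# Expected on the base branch: an empty result set, not an exception.
assert table.search([1.0, 2.0]).limit(5).to_list() == []
```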
@@ -210,25 +210,6 @@ async def test_retry_error():
     assert cause.status_code == 429


-def test_table_unimplemented_functions():
-    def handler(request):
-        if request.path == "/v1/table/test/create/?mode=create":
-            request.send_response(200)
-            request.send_header("Content-Type", "application/json")
-            request.end_headers()
-            request.wfile.write(b"{}")
-        else:
-            request.send_response(404)
-            request.end_headers()
-
-    with mock_lancedb_connection(handler) as db:
-        table = db.create_table("test", [{"id": 1}])
-        with pytest.raises(NotImplementedError):
-            table.to_arrow()
-        with pytest.raises(NotImplementedError):
-            table.to_pandas()
-
-
 def test_table_add_in_threadpool():
     def handler(request):
         if request.path == "/v1/table/test/insert/":
@@ -515,8 +496,6 @@ def test_query_sync_minimal():
         "ef": None,
         "vector": [1.0, 2.0, 3.0],
         "nprobes": 20,
-        "minimum_nprobes": 20,
-        "maximum_nprobes": 20,
         "version": None,
     }

@@ -557,8 +536,6 @@ def test_query_sync_maximal():
         "refine_factor": 10,
         "vector": [1.0, 2.0, 3.0],
         "nprobes": 5,
-        "minimum_nprobes": 5,
-        "maximum_nprobes": 5,
         "lower_bound": None,
         "upper_bound": None,
         "ef": None,
@@ -587,66 +564,6 @@ def test_query_sync_maximal():
     )


-def test_query_sync_nprobes():
-    def handler(body):
-        assert body == {
-            "distance_type": "l2",
-            "k": 10,
-            "prefilter": True,
-            "fast_search": True,
-            "vector_column": "vector2",
-            "refine_factor": None,
-            "lower_bound": None,
-            "upper_bound": None,
-            "ef": None,
-            "vector": [1.0, 2.0, 3.0],
-            "nprobes": 5,
-            "minimum_nprobes": 5,
-            "maximum_nprobes": 15,
-            "version": None,
-        }
-
-        return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
-
-    with query_test_table(handler) as table:
-        (
-            table.search([1, 2, 3], vector_column_name="vector2", fast_search=True)
-            .minimum_nprobes(5)
-            .maximum_nprobes(15)
-            .to_list()
-        )
-
-
-def test_query_sync_no_max_nprobes():
-    def handler(body):
-        assert body == {
-            "distance_type": "l2",
-            "k": 10,
-            "prefilter": True,
-            "fast_search": True,
-            "vector_column": "vector2",
-            "refine_factor": None,
-            "lower_bound": None,
-            "upper_bound": None,
-            "ef": None,
-            "vector": [1.0, 2.0, 3.0],
-            "nprobes": 5,
-            "minimum_nprobes": 5,
-            "maximum_nprobes": 0,
-            "version": None,
-        }
-
-        return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
-
-    with query_test_table(handler) as table:
-        (
-            table.search([1, 2, 3], vector_column_name="vector2", fast_search=True)
-            .minimum_nprobes(5)
-            .maximum_nprobes(0)
-            .to_list()
-        )
-
-
 @pytest.mark.parametrize("server_version", [Version("0.1.0"), Version("0.2.0")])
 def test_query_sync_batch_queries(server_version):
     def handler(body):
@@ -749,8 +666,6 @@ def test_query_sync_hybrid():
         "refine_factor": None,
         "vector": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
         "nprobes": 20,
-        "minimum_nprobes": 20,
-        "maximum_nprobes": 20,
        "lower_bound": None,
         "upper_bound": None,
         "ef": None,
@@ -499,19 +499,3 @@ def test_empty_result_reranker():
         .rerank(reranker)
         .to_arrow()
     )
-
-
-@pytest.mark.parametrize("use_tantivy", [True, False])
-def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
-    pytest.importorskip("sentence_transformers")
-    reranker = CrossEncoderReranker(return_score="all")
-    table, schema = get_test_table(tmp_path, use_tantivy)
-    query = "single player experience"
-    result = (
-        table.search(query, query_type="hybrid", vector_column_name="vector")
-        .rerank(reranker=reranker)
-        .to_arrow()
-    )
-    assert "_relevance_score" in result.column_names
-    assert "_score" in result.column_names
-    assert "_distance" in result.column_names
@@ -245,7 +245,7 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
         NotImplementedError,
         match="Full-text search is only supported on the local filesystem",
     ):
-        table.create_fts_index("x", use_tantivy=True)
+        table.create_fts_index("x")

     # make sure list tables still works
     assert db.table_names() == ["test_ddb_sync"]
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright The LanceDB Authors
-
-import lancedb
-
-
-def test_session_cache_configuration(tmp_path):
-    """Test Session cache configuration and basic functionality."""
-    # Create session with small cache limits for testing
-    index_cache_size = 1024 * 1024  # 1MB
-    metadata_cache_size = 512 * 1024  # 512KB
-
-    session = lancedb.Session(
-        index_cache_size_bytes=index_cache_size,
-        metadata_cache_size_bytes=metadata_cache_size,
-    )
-
-    # Record initial cache state
-    initial_cache_size = session.size_bytes
-    initial_cache_items = session.approx_num_items
-
-    # Test session works with database connection
-    db = lancedb.connect(tmp_path, session=session)
-
-    # Create and use a table to exercise the session
-    data = [{"id": i, "text": f"item {i}"} for i in range(100)]
-    table = db.create_table("test", data)
-    results = list(table.to_arrow().to_pylist())
-
-    assert len(results) == 100
-
-    # Verify cache usage increased after operations
-    final_cache_size = session.size_bytes
-    final_cache_items = session.approx_num_items
-
-    assert final_cache_size > initial_cache_size  # Cache should have grown
-    assert final_cache_items >= initial_cache_items  # Items should not decrease
-    assert initial_cache_size < index_cache_size + metadata_cache_size
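The deleted file above is the base branch's test for `lancedb.Session`, which caps the index and metadata caches shared by a connection. A condensed sketch of the same flow; the byte limits mirror the removed test, and `size_bytes`/`approx_num_items` are the accessors it relied on. None of this exists on v0.20.0.

```python
import lancedb

# Base branch only: a Session bounds the caches used by every table on a connection.
session = lancedb.Session(
    index_cache_size_bytes=1024 * 1024,     # 1 MB
    metadata_cache_size_bytes=512 * 1024,   # 512 KB
)
db = lancedb.connect("/tmp/session-demo", session=session)

table = db.create_table("test", [{"id": i, "text": f"item {i}"} for i in range(100)])
table.to_arrow()  # exercise the caches

print(session.size_bytes, session.approx_num_items)
```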
Some files were not shown because too many files have changed in this diff