mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 13:29:57 +00:00
Compare commits
31 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c625b6f2b2 | ||
|
|
bec8fe6547 | ||
|
|
dc1150c011 | ||
|
|
afaefc6264 | ||
|
|
cb70ff8cee | ||
|
|
cbb5a841b1 | ||
|
|
c72f6770fd | ||
|
|
e5a80a5e86 | ||
|
|
8d0a7fad1f | ||
|
|
b80d4d0134 | ||
|
|
9645fe52c2 | ||
|
|
b77314168d | ||
|
|
e08d45e090 | ||
|
|
2e3ddb8382 | ||
|
|
627ca4c810 | ||
|
|
f8dae4ffe9 | ||
|
|
9eb6119468 | ||
|
|
59b57e30ed | ||
|
|
fec8d58f06 | ||
|
|
84ded9d678 | ||
|
|
65696d9713 | ||
|
|
e2f2ea32e4 | ||
|
|
d5f2eca754 | ||
|
|
7fa455a8a5 | ||
|
|
8f42b5874e | ||
|
|
274f19f560 | ||
|
|
fbcbc75b5b | ||
|
|
008f389bd0 | ||
|
|
91af6518d9 | ||
|
|
af6819762c | ||
|
|
7acece493d |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.20.0-beta.1"
|
current_version = "0.20.1-beta.2"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
10
.github/workflows/make-release-commit.yml
vendored
10
.github/workflows/make-release-commit.yml
vendored
@@ -84,7 +84,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
pip install bump-my-version PyGithub packaging
|
pip install bump-my-version PyGithub packaging
|
||||||
bash ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} v $COMMIT_BEFORE_BUMP
|
bash ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} v $COMMIT_BEFORE_BUMP
|
||||||
bash ci/update_lockfiles.sh
|
bash ci/update_lockfiles.sh --amend
|
||||||
- name: Push new version tag
|
- name: Push new version tag
|
||||||
if: ${{ !inputs.dry_run }}
|
if: ${{ !inputs.dry_run }}
|
||||||
uses: ad-m/github-push-action@master
|
uses: ad-m/github-push-action@master
|
||||||
@@ -93,11 +93,3 @@ jobs:
|
|||||||
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
|
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
|
||||||
branch: ${{ github.ref }}
|
branch: ${{ github.ref }}
|
||||||
tags: true
|
tags: true
|
||||||
- uses: ./.github/workflows/update_package_lock
|
|
||||||
if: ${{ !inputs.dry_run && inputs.other }}
|
|
||||||
with:
|
|
||||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
- uses: ./.github/workflows/update_package_lock_nodejs
|
|
||||||
if: ${{ !inputs.dry_run && inputs.other }}
|
|
||||||
with:
|
|
||||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
|
|||||||
16
.github/workflows/npm-publish.yml
vendored
16
.github/workflows/npm-publish.yml
vendored
@@ -505,6 +505,8 @@ jobs:
|
|||||||
name: vectordb NPM Publish
|
name: vectordb NPM Publish
|
||||||
needs: [node, node-macos, node-linux-gnu, node-windows]
|
needs: [node, node-macos, node-linux-gnu, node-windows]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
steps:
|
steps:
|
||||||
@@ -537,6 +539,20 @@ jobs:
|
|||||||
# We need to deprecate the old package to avoid confusion.
|
# We need to deprecate the old package to avoid confusion.
|
||||||
# Each time we publish a new version, it gets undeprecated.
|
# Each time we publish a new version, it gets undeprecated.
|
||||||
run: npm deprecate vectordb "Use @lancedb/lancedb instead."
|
run: npm deprecate vectordb "Use @lancedb/lancedb instead."
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: main
|
||||||
|
- name: Update package-lock.json
|
||||||
|
run: |
|
||||||
|
git config user.name 'Lance Release'
|
||||||
|
git config user.email 'lance-dev@lancedb.com'
|
||||||
|
bash ci/update_lockfiles.sh
|
||||||
|
- name: Push new commit
|
||||||
|
uses: ad-m/github-push-action@master
|
||||||
|
with:
|
||||||
|
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
|
||||||
|
branch: main
|
||||||
- name: Notify Slack Action
|
- name: Notify Slack Action
|
||||||
uses: ravsamhq/notify-slack-action@2.3.0
|
uses: ravsamhq/notify-slack-action@2.3.0
|
||||||
if: ${{ always() }}
|
if: ${{ always() }}
|
||||||
|
|||||||
33
.github/workflows/update_package_lock/action.yml
vendored
33
.github/workflows/update_package_lock/action.yml
vendored
@@ -1,33 +0,0 @@
|
|||||||
name: update_package_lock
|
|
||||||
description: "Update node's package.lock"
|
|
||||||
|
|
||||||
inputs:
|
|
||||||
github_token:
|
|
||||||
required: true
|
|
||||||
description: "github token for the repo"
|
|
||||||
|
|
||||||
runs:
|
|
||||||
using: "composite"
|
|
||||||
steps:
|
|
||||||
- uses: actions/setup-node@v3
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
- name: Set git configs
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
git config user.name 'Lance Release'
|
|
||||||
git config user.email 'lance-dev@lancedb.com'
|
|
||||||
- name: Update package-lock.json file
|
|
||||||
working-directory: ./node
|
|
||||||
run: |
|
|
||||||
npm install
|
|
||||||
git add package-lock.json
|
|
||||||
git commit -m "Updating package-lock.json"
|
|
||||||
shell: bash
|
|
||||||
- name: Push changes
|
|
||||||
if: ${{ inputs.dry_run }} == "false"
|
|
||||||
uses: ad-m/github-push-action@master
|
|
||||||
with:
|
|
||||||
github_token: ${{ inputs.github_token }}
|
|
||||||
branch: main
|
|
||||||
tags: true
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
name: update_package_lock_nodejs
|
|
||||||
description: "Update nodejs's package.lock"
|
|
||||||
|
|
||||||
inputs:
|
|
||||||
github_token:
|
|
||||||
required: true
|
|
||||||
description: "github token for the repo"
|
|
||||||
|
|
||||||
runs:
|
|
||||||
using: "composite"
|
|
||||||
steps:
|
|
||||||
- uses: actions/setup-node@v3
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
- name: Set git configs
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
git config user.name 'Lance Release'
|
|
||||||
git config user.email 'lance-dev@lancedb.com'
|
|
||||||
- name: Update package-lock.json file
|
|
||||||
working-directory: ./nodejs
|
|
||||||
run: |
|
|
||||||
npm install
|
|
||||||
git add package-lock.json
|
|
||||||
git commit -m "Updating package-lock.json"
|
|
||||||
shell: bash
|
|
||||||
- name: Push changes
|
|
||||||
if: ${{ inputs.dry_run }} == "false"
|
|
||||||
uses: ad-m/github-push-action@master
|
|
||||||
with:
|
|
||||||
github_token: ${{ inputs.github_token }}
|
|
||||||
branch: main
|
|
||||||
tags: true
|
|
||||||
689
Cargo.lock
generated
689
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
16
Cargo.toml
16
Cargo.toml
@@ -21,14 +21,14 @@ categories = ["database-implementations"]
|
|||||||
rust-version = "1.78.0"
|
rust-version = "1.78.0"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance = { "version" = "=0.30.0", "features" = ["dynamodb"] }
|
||||||
lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-io = "=0.30.0"
|
||||||
lance-index = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-index = "=0.30.0"
|
||||||
lance-linalg = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-linalg = "=0.30.0"
|
||||||
lance-table = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-table = "=0.30.0"
|
||||||
lance-testing = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-testing = "=0.30.0"
|
||||||
lance-datafusion = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-datafusion = "=0.30.0"
|
||||||
lance-encoding = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-encoding = "=0.30.0"
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "55.1", optional = false }
|
arrow = { version = "55.1", optional = false }
|
||||||
arrow-array = "55.1"
|
arrow-array = "55.1"
|
||||||
|
|||||||
174
ci/set_lance_version.py
Normal file
174
ci/set_lance_version.py
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def run_command(command: str) -> str:
|
||||||
|
"""
|
||||||
|
Run a shell command and return stdout as a string.
|
||||||
|
If exit code is not 0, raise an exception with the stderr output.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
result = subprocess.run(command, shell=True, capture_output=True, text=True)
|
||||||
|
if result.returncode != 0:
|
||||||
|
raise Exception(f"Command failed with error: {result.stderr.strip()}")
|
||||||
|
return result.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_latest_stable_version() -> str:
|
||||||
|
version_line = run_command("cargo info lance | grep '^version:'")
|
||||||
|
version = version_line.split(" ")[1].strip()
|
||||||
|
return version
|
||||||
|
|
||||||
|
|
||||||
|
def get_latest_preview_version() -> str:
|
||||||
|
lance_tags = run_command(
|
||||||
|
"git ls-remote --tags https://github.com/lancedb/lance.git | grep 'refs/tags/v[0-9beta.-]\\+$'"
|
||||||
|
).splitlines()
|
||||||
|
lance_tags = (
|
||||||
|
tag.split("refs/tags/")[1]
|
||||||
|
for tag in lance_tags
|
||||||
|
if "refs/tags/" in tag and "beta" in tag
|
||||||
|
)
|
||||||
|
from packaging.version import Version
|
||||||
|
|
||||||
|
latest = max(
|
||||||
|
(tag[1:] for tag in lance_tags if tag.startswith("v")), key=lambda t: Version(t)
|
||||||
|
)
|
||||||
|
return str(latest)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features(line: str) -> list:
|
||||||
|
"""
|
||||||
|
Extracts the features from a line in Cargo.toml.
|
||||||
|
Example: 'lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }'
|
||||||
|
Returns: ['dynamodb']
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
match = re.search(r'"features"\s*=\s*\[(.*?)\]', line)
|
||||||
|
if match:
|
||||||
|
features_str = match.group(1)
|
||||||
|
return [f.strip('"') for f in features_str.split(",")]
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def update_cargo_toml(line_updater):
|
||||||
|
"""
|
||||||
|
Updates the Cargo.toml file by applying the line_updater function to each line.
|
||||||
|
The line_updater function should take a line as input and return the updated line.
|
||||||
|
"""
|
||||||
|
with open("Cargo.toml", "r") as f:
|
||||||
|
lines = f.readlines()
|
||||||
|
|
||||||
|
new_lines = []
|
||||||
|
for line in lines:
|
||||||
|
if line.startswith("lance"):
|
||||||
|
# Update the line using the provided function
|
||||||
|
new_lines.append(line_updater(line))
|
||||||
|
else:
|
||||||
|
# Keep the line unchanged
|
||||||
|
new_lines.append(line)
|
||||||
|
|
||||||
|
with open("Cargo.toml", "w") as f:
|
||||||
|
f.writelines(new_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def set_stable_version(version: str):
|
||||||
|
"""
|
||||||
|
Sets lines to
|
||||||
|
lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }
|
||||||
|
lance-io = "=0.29.0"
|
||||||
|
...
|
||||||
|
"""
|
||||||
|
|
||||||
|
def line_updater(line: str) -> str:
|
||||||
|
package_name = line.split("=", maxsplit=1)[0].strip()
|
||||||
|
features = extract_features(line)
|
||||||
|
if features:
|
||||||
|
return f'{package_name} = {{ "version" = "={version}", "features" = {json.dumps(features)} }}\n'
|
||||||
|
else:
|
||||||
|
return f'{package_name} = "={version}"\n'
|
||||||
|
|
||||||
|
update_cargo_toml(line_updater)
|
||||||
|
|
||||||
|
|
||||||
|
def set_preview_version(version: str):
|
||||||
|
"""
|
||||||
|
Sets lines to
|
||||||
|
lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
||||||
|
lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
||||||
|
...
|
||||||
|
"""
|
||||||
|
|
||||||
|
def line_updater(line: str) -> str:
|
||||||
|
package_name = line.split("=", maxsplit=1)[0].strip()
|
||||||
|
features = extract_features(line)
|
||||||
|
base_version = version.split("-")[0] # Get the base version without beta suffix
|
||||||
|
if features:
|
||||||
|
return f'{package_name} = {{ "version" = "={base_version}", "features" = {json.dumps(features)}, "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
|
||||||
|
else:
|
||||||
|
return f'{package_name} = {{ "version" = "={base_version}", "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
|
||||||
|
|
||||||
|
update_cargo_toml(line_updater)
|
||||||
|
|
||||||
|
|
||||||
|
def set_local_version():
|
||||||
|
"""
|
||||||
|
Sets lines to
|
||||||
|
lance = { path = "../lance/rust/lance", features = ["dynamodb"] }
|
||||||
|
lance-io = { path = "../lance/rust/lance-io" }
|
||||||
|
...
|
||||||
|
"""
|
||||||
|
|
||||||
|
def line_updater(line: str) -> str:
|
||||||
|
package_name = line.split("=", maxsplit=1)[0].strip()
|
||||||
|
features = extract_features(line)
|
||||||
|
if features:
|
||||||
|
return f'{package_name} = {{ "path" = "../lance/rust/{package_name}", "features" = {json.dumps(features)} }}\n'
|
||||||
|
else:
|
||||||
|
return f'{package_name} = {{ "path" = "../lance/rust/{package_name}" }}\n'
|
||||||
|
|
||||||
|
update_cargo_toml(line_updater)
|
||||||
|
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Set the version of the Lance package.")
|
||||||
|
parser.add_argument(
|
||||||
|
"version",
|
||||||
|
type=str,
|
||||||
|
help="The version to set for the Lance package. Use 'stable' for the latest stable version, 'preview' for latest preview version, or a specific version number (e.g., '0.1.0'). You can also specify 'local' to use a local path.",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.version == "stable":
|
||||||
|
latest_stable_version = get_latest_stable_version()
|
||||||
|
print(
|
||||||
|
f"Found latest stable version: \033[1mv{latest_stable_version}\033[0m",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
set_stable_version(latest_stable_version)
|
||||||
|
elif args.version == "preview":
|
||||||
|
latest_preview_version = get_latest_preview_version()
|
||||||
|
print(
|
||||||
|
f"Found latest preview version: \033[1mv{latest_preview_version}\033[0m",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
set_preview_version(latest_preview_version)
|
||||||
|
elif args.version == "local":
|
||||||
|
set_local_version()
|
||||||
|
else:
|
||||||
|
# Parse the version number.
|
||||||
|
version = args.version
|
||||||
|
# Ignore initial v if present.
|
||||||
|
if version.startswith("v"):
|
||||||
|
version = version[1:]
|
||||||
|
|
||||||
|
if "beta" in version:
|
||||||
|
set_preview_version(version)
|
||||||
|
else:
|
||||||
|
set_stable_version(version)
|
||||||
|
|
||||||
|
print("Updating lockfiles...", file=sys.stderr, end="")
|
||||||
|
run_command("cargo metadata > /dev/null")
|
||||||
|
print(" done.", file=sys.stderr)
|
||||||
@@ -1,18 +1,30 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
AMEND=false
|
||||||
|
|
||||||
|
for arg in "$@"; do
|
||||||
|
if [[ "$arg" == "--amend" ]]; then
|
||||||
|
AMEND=true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
# This updates the lockfile without building
|
# This updates the lockfile without building
|
||||||
cargo metadata > /dev/null
|
cargo metadata --quiet > /dev/null
|
||||||
|
|
||||||
pushd nodejs || exit 1
|
pushd nodejs || exit 1
|
||||||
npm install --package-lock-only
|
npm install --package-lock-only --silent
|
||||||
popd
|
popd
|
||||||
pushd node || exit 1
|
pushd node || exit 1
|
||||||
npm install --package-lock-only
|
npm install --package-lock-only --silent
|
||||||
popd
|
popd
|
||||||
|
|
||||||
if git diff --quiet --exit-code; then
|
if git diff --quiet --exit-code; then
|
||||||
echo "No lockfile changes to commit; skipping amend."
|
echo "No lockfile changes to commit; skipping amend."
|
||||||
else
|
elif $AMEND; then
|
||||||
|
git add Cargo.lock nodejs/package-lock.json node/package-lock.json
|
||||||
git commit --amend --no-edit
|
git commit --amend --no-edit
|
||||||
|
else
|
||||||
|
git add Cargo.lock nodejs/package-lock.json node/package-lock.json
|
||||||
|
git commit -m "Update lockfiles"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ duckdb.query("SELECT * FROM arrow_table")
|
|||||||
Have the required imports before doing any querying.
|
Have the required imports before doing any querying.
|
||||||
|
|
||||||
=== "Python"
|
=== "Python"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
--8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb"
|
--8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb"
|
||||||
--8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context"
|
--8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context"
|
||||||
@@ -51,6 +52,7 @@ Have the required imports before doing any querying.
|
|||||||
Register the table created with the Datafusion session context.
|
Register the table created with the Datafusion session context.
|
||||||
|
|
||||||
=== "Python"
|
=== "Python"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
--8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
|
--8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
|
||||||
```
|
```
|
||||||
|
|||||||
53
docs/src/js/classes/BooleanQuery.md
Normal file
53
docs/src/js/classes/BooleanQuery.md
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
[@lancedb/lancedb](../globals.md) / BooleanQuery
|
||||||
|
|
||||||
|
# Class: BooleanQuery
|
||||||
|
|
||||||
|
Represents a full-text query interface.
|
||||||
|
This interface defines the structure and behavior for full-text queries,
|
||||||
|
including methods to retrieve the query type and convert the query to a dictionary format.
|
||||||
|
|
||||||
|
## Implements
|
||||||
|
|
||||||
|
- [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||||
|
|
||||||
|
## Constructors
|
||||||
|
|
||||||
|
### new BooleanQuery()
|
||||||
|
|
||||||
|
```ts
|
||||||
|
new BooleanQuery(queries): BooleanQuery
|
||||||
|
```
|
||||||
|
|
||||||
|
Creates an instance of BooleanQuery.
|
||||||
|
|
||||||
|
#### Parameters
|
||||||
|
|
||||||
|
* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][]
|
||||||
|
An array of (Occur, FullTextQuery objects) to combine.
|
||||||
|
Occur specifies whether the query must match, or should match.
|
||||||
|
|
||||||
|
#### Returns
|
||||||
|
|
||||||
|
[`BooleanQuery`](BooleanQuery.md)
|
||||||
|
|
||||||
|
## Methods
|
||||||
|
|
||||||
|
### queryType()
|
||||||
|
|
||||||
|
```ts
|
||||||
|
queryType(): FullTextQueryType
|
||||||
|
```
|
||||||
|
|
||||||
|
The type of the full-text query.
|
||||||
|
|
||||||
|
#### Returns
|
||||||
|
|
||||||
|
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
|
||||||
|
|
||||||
|
#### Implementation of
|
||||||
|
|
||||||
|
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
|
||||||
@@ -40,6 +40,7 @@ Creates an instance of MatchQuery.
|
|||||||
- `boost`: The boost factor for the query (default is 1.0).
|
- `boost`: The boost factor for the query (default is 1.0).
|
||||||
- `fuzziness`: The fuzziness level for the query (default is 0).
|
- `fuzziness`: The fuzziness level for the query (default is 0).
|
||||||
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
||||||
|
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||||
|
|
||||||
* **options.boost?**: `number`
|
* **options.boost?**: `number`
|
||||||
|
|
||||||
@@ -47,6 +48,8 @@ Creates an instance of MatchQuery.
|
|||||||
|
|
||||||
* **options.maxExpansions?**: `number`
|
* **options.maxExpansions?**: `number`
|
||||||
|
|
||||||
|
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
[`MatchQuery`](MatchQuery.md)
|
[`MatchQuery`](MatchQuery.md)
|
||||||
|
|||||||
@@ -38,9 +38,12 @@ Creates an instance of MultiMatchQuery.
|
|||||||
* **options?**
|
* **options?**
|
||||||
Optional parameters for the multi-match query.
|
Optional parameters for the multi-match query.
|
||||||
- `boosts`: An array of boost factors for each column (default is 1.0 for all).
|
- `boosts`: An array of boost factors for each column (default is 1.0 for all).
|
||||||
|
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||||
|
|
||||||
* **options.boosts?**: `number`[]
|
* **options.boosts?**: `number`[]
|
||||||
|
|
||||||
|
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
[`MultiMatchQuery`](MultiMatchQuery.md)
|
[`MultiMatchQuery`](MultiMatchQuery.md)
|
||||||
|
|||||||
@@ -19,7 +19,10 @@ including methods to retrieve the query type and convert the query to a dictiona
|
|||||||
### new PhraseQuery()
|
### new PhraseQuery()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
new PhraseQuery(query, column): PhraseQuery
|
new PhraseQuery(
|
||||||
|
query,
|
||||||
|
column,
|
||||||
|
options?): PhraseQuery
|
||||||
```
|
```
|
||||||
|
|
||||||
Creates an instance of `PhraseQuery`.
|
Creates an instance of `PhraseQuery`.
|
||||||
@@ -32,6 +35,12 @@ Creates an instance of `PhraseQuery`.
|
|||||||
* **column**: `string`
|
* **column**: `string`
|
||||||
The name of the column to search within.
|
The name of the column to search within.
|
||||||
|
|
||||||
|
* **options?**
|
||||||
|
Optional parameters for the phrase query.
|
||||||
|
- `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
|
||||||
|
|
||||||
|
* **options.slop?**: `number`
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
[`PhraseQuery`](PhraseQuery.md)
|
[`PhraseQuery`](PhraseQuery.md)
|
||||||
|
|||||||
@@ -15,6 +15,14 @@ Enum representing the types of full-text queries supported.
|
|||||||
|
|
||||||
## Enumeration Members
|
## Enumeration Members
|
||||||
|
|
||||||
|
### Boolean
|
||||||
|
|
||||||
|
```ts
|
||||||
|
Boolean: "boolean";
|
||||||
|
```
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
### Boost
|
### Boost
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
|
|||||||
28
docs/src/js/enumerations/Occur.md
Normal file
28
docs/src/js/enumerations/Occur.md
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
[@lancedb/lancedb](../globals.md) / Occur
|
||||||
|
|
||||||
|
# Enumeration: Occur
|
||||||
|
|
||||||
|
Enum representing the occurrence of terms in full-text queries.
|
||||||
|
|
||||||
|
- `Must`: The term must be present in the document.
|
||||||
|
- `Should`: The term should contribute to the document score, but is not required.
|
||||||
|
|
||||||
|
## Enumeration Members
|
||||||
|
|
||||||
|
### Must
|
||||||
|
|
||||||
|
```ts
|
||||||
|
Must: "MUST";
|
||||||
|
```
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
### Should
|
||||||
|
|
||||||
|
```ts
|
||||||
|
Should: "SHOULD";
|
||||||
|
```
|
||||||
28
docs/src/js/enumerations/Operator.md
Normal file
28
docs/src/js/enumerations/Operator.md
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
[@lancedb/lancedb](../globals.md) / Operator
|
||||||
|
|
||||||
|
# Enumeration: Operator
|
||||||
|
|
||||||
|
Enum representing the logical operators used in full-text queries.
|
||||||
|
|
||||||
|
- `And`: All terms must match.
|
||||||
|
- `Or`: At least one term must match.
|
||||||
|
|
||||||
|
## Enumeration Members
|
||||||
|
|
||||||
|
### And
|
||||||
|
|
||||||
|
```ts
|
||||||
|
And: "AND";
|
||||||
|
```
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
### Or
|
||||||
|
|
||||||
|
```ts
|
||||||
|
Or: "OR";
|
||||||
|
```
|
||||||
@@ -12,9 +12,12 @@
|
|||||||
## Enumerations
|
## Enumerations
|
||||||
|
|
||||||
- [FullTextQueryType](enumerations/FullTextQueryType.md)
|
- [FullTextQueryType](enumerations/FullTextQueryType.md)
|
||||||
|
- [Occur](enumerations/Occur.md)
|
||||||
|
- [Operator](enumerations/Operator.md)
|
||||||
|
|
||||||
## Classes
|
## Classes
|
||||||
|
|
||||||
|
- [BooleanQuery](classes/BooleanQuery.md)
|
||||||
- [BoostQuery](classes/BoostQuery.md)
|
- [BoostQuery](classes/BoostQuery.md)
|
||||||
- [Connection](classes/Connection.md)
|
- [Connection](classes/Connection.md)
|
||||||
- [Index](classes/Index.md)
|
- [Index](classes/Index.md)
|
||||||
|
|||||||
@@ -7,3 +7,4 @@ tantivy==0.20.1
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
torch
|
torch
|
||||||
polars>=0.19, <=1.3.0
|
polars>=0.19, <=1.3.0
|
||||||
|
datafusion
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.20.0-beta.1</version>
|
<version>0.20.1-beta.2</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.20.0-beta.1</version>
|
<version>0.20.1-beta.2</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
||||||
<name>LanceDB Parent</name>
|
<name>LanceDB Parent</name>
|
||||||
|
|||||||
49
node/package-lock.json
generated
49
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
@@ -52,11 +52,11 @@
|
|||||||
"uuid": "^9.0.0"
|
"uuid": "^9.0.0"
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.20.0-beta.1",
|
"@lancedb/vectordb-darwin-arm64": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.20.0-beta.1",
|
"@lancedb/vectordb-darwin-x64": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.20.0-beta.1",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.20.0-beta.1",
|
"@lancedb/vectordb-linux-x64-gnu": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.20.0-beta.1"
|
"@lancedb/vectordb-win32-x64-msvc": "0.20.1-beta.2"
|
||||||
},
|
},
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"@apache-arrow/ts": "^14.0.2",
|
"@apache-arrow/ts": "^14.0.2",
|
||||||
@@ -327,65 +327,60 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.20.0-beta.1.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.20.1-beta.2.tgz",
|
||||||
"integrity": "sha512-yds8wFjni68RfA+KziTz/8v4YKku1i6q4JF8I2EhpzDI8tT0fk1YqGlVhtdn9fHDWq/9m1M05kGVuyzLypZ2Yw==",
|
"integrity": "sha512-mqi0yI+ZwBTydaDy1FRHAUZwrWS28u6tbHTe1s4uSrmERbVI6PfmoPR+NZWWAp6ZhlseSdl/+yeI4imk11rQSw==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"arm64"
|
"arm64"
|
||||||
],
|
],
|
||||||
"license": "Apache-2.0",
|
|
||||||
"optional": true,
|
"optional": true,
|
||||||
"os": [
|
"os": [
|
||||||
"darwin"
|
"darwin"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.20.0-beta.1.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.20.1-beta.2.tgz",
|
||||||
"integrity": "sha512-oF2MNtkWaJQWyUSIKU/zrbgygK94MzomUKc/Z9CYs7Ar3PI4CIfG72e5o/Zbhjpl318BkR4AbQQYX8BZaNIPVw==",
|
"integrity": "sha512-m8EYYA8JZIeNsJqQsBDUMu6r31/u7FzpjonJ4Y+CjapVl6UdvI65KUkeL2dYrFao++RuIoaiqcm3e7gRgFZpXQ==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64"
|
"x64"
|
||||||
],
|
],
|
||||||
"license": "Apache-2.0",
|
|
||||||
"optional": true,
|
"optional": true,
|
||||||
"os": [
|
"os": [
|
||||||
"darwin"
|
"darwin"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.20.0-beta.1.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.20.1-beta.2.tgz",
|
||||||
"integrity": "sha512-3Si0+K5T4awMiUVu0dD9NizcqIiGnEdsTu4YxbKKq1aI4xoaHrYGERkz58mtIFoBQHfre42ujPDoahTkAQ1j/Q==",
|
"integrity": "sha512-3Og2+bk4GlWmMO1Yg2HBfeb5zrOMLaIHD7bEqQ4+6yw4IckAaV+ke05H0tyyqmOVrOQ0LpvtXgD7pPztjm9r9A==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"arm64"
|
"arm64"
|
||||||
],
|
],
|
||||||
"license": "Apache-2.0",
|
|
||||||
"optional": true,
|
"optional": true,
|
||||||
"os": [
|
"os": [
|
||||||
"linux"
|
"linux"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.20.0-beta.1.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.20.1-beta.2.tgz",
|
||||||
"integrity": "sha512-5umO9XaDIxmqUiFnWaHxJtgkCO7oFWtEvLtzM4hG1mkEnwnE3bmXEO+cm+jPro7zwdKEzsnXh0GoCSUvuHk0tA==",
|
"integrity": "sha512-mwTQyA/FBoU/FkPuvCNBZG3y83gBN+iYoejehBH2HBkLUIcmlsDgSRZ1OQ+f9ijj12EMBCA11tBUPA9zhHzyrw==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64"
|
"x64"
|
||||||
],
|
],
|
||||||
"license": "Apache-2.0",
|
|
||||||
"optional": true,
|
"optional": true,
|
||||||
"os": [
|
"os": [
|
||||||
"linux"
|
"linux"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.20.0-beta.1.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.20.1-beta.2.tgz",
|
||||||
"integrity": "sha512-EKyDamAi3RmDTu+BFYxr41eGLggZ3FVGu289gCprzljk38d8uxdgKhvDtYN9FWoMew4VvVk/EJQJx6L8sJJRng==",
|
"integrity": "sha512-VkjNpqhK3l3uHLLPmox+HrmKPMaZgV+qsGQWx0nfseGnSOEmXAWZWQFe0APVCQ9y0xTypQB0oH7eSOPZv2t4WQ==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64"
|
"x64"
|
||||||
],
|
],
|
||||||
"license": "Apache-2.0",
|
|
||||||
"optional": true,
|
"optional": true,
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"description": " Serverless, low-latency vector database for AI applications",
|
"description": " Serverless, low-latency vector database for AI applications",
|
||||||
"private": false,
|
"private": false,
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
@@ -89,10 +89,10 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-x64": "0.20.0-beta.1",
|
"@lancedb/vectordb-darwin-x64": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.20.0-beta.1",
|
"@lancedb/vectordb-darwin-arm64": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.20.0-beta.1",
|
"@lancedb/vectordb-linux-x64-gnu": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.20.0-beta.1",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.20.1-beta.2",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.20.0-beta.1"
|
"@lancedb/vectordb-win32-x64-msvc": "0.20.1-beta.2"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.20.0-beta.1"
|
version = "0.20.1-beta.2"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
|
|||||||
@@ -592,14 +592,14 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
).rejects.toThrow("column vector was missing");
|
).rejects.toThrow("column vector was missing");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("will provide a nice error if run twice", async function () {
|
it("will skip embedding application if already applied", async function () {
|
||||||
const records = sampleRecords();
|
const records = sampleRecords();
|
||||||
const table = await convertToTable(records, dummyEmbeddingConfig);
|
const table = await convertToTable(records, dummyEmbeddingConfig);
|
||||||
|
|
||||||
// fromTableToBuffer will try and apply the embeddings again
|
// fromTableToBuffer will try and apply the embeddings again
|
||||||
await expect(
|
// but should skip since the column already has non-null values
|
||||||
fromTableToBuffer(table, dummyEmbeddingConfig),
|
const result = await fromTableToBuffer(table, dummyEmbeddingConfig);
|
||||||
).rejects.toThrow("already existed");
|
expect(result.byteLength).toBeGreaterThan(0);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -33,7 +33,12 @@ import {
|
|||||||
register,
|
register,
|
||||||
} from "../lancedb/embedding";
|
} from "../lancedb/embedding";
|
||||||
import { Index } from "../lancedb/indices";
|
import { Index } from "../lancedb/indices";
|
||||||
import { instanceOfFullTextQuery } from "../lancedb/query";
|
import {
|
||||||
|
BooleanQuery,
|
||||||
|
Occur,
|
||||||
|
Operator,
|
||||||
|
instanceOfFullTextQuery,
|
||||||
|
} from "../lancedb/query";
|
||||||
import exp = require("constants");
|
import exp = require("constants");
|
||||||
|
|
||||||
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||||
@@ -554,6 +559,32 @@ describe("When creating an index", () => {
|
|||||||
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
|
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
|
||||||
expect(rst.numRows).toBe(1);
|
expect(rst.numRows).toBe(1);
|
||||||
|
|
||||||
|
// test nprobes
|
||||||
|
rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
|
||||||
|
expect(rst.numRows).toBe(2);
|
||||||
|
rst = await tbl
|
||||||
|
.query()
|
||||||
|
.nearestTo(queryVec)
|
||||||
|
.limit(2)
|
||||||
|
.minimumNprobes(15)
|
||||||
|
.toArrow();
|
||||||
|
expect(rst.numRows).toBe(2);
|
||||||
|
rst = await tbl
|
||||||
|
.query()
|
||||||
|
.nearestTo(queryVec)
|
||||||
|
.limit(2)
|
||||||
|
.minimumNprobes(10)
|
||||||
|
.maximumNprobes(20)
|
||||||
|
.toArrow();
|
||||||
|
expect(rst.numRows).toBe(2);
|
||||||
|
|
||||||
|
expect(() => tbl.query().nearestTo(queryVec).minimumNprobes(0)).toThrow(
|
||||||
|
"Invalid input, minimum_nprobes must be greater than 0",
|
||||||
|
);
|
||||||
|
expect(() => tbl.query().nearestTo(queryVec).maximumNprobes(5)).toThrow(
|
||||||
|
"Invalid input, maximum_nprobes must be greater than minimum_nprobes",
|
||||||
|
);
|
||||||
|
|
||||||
await tbl.dropIndex("vec_idx");
|
await tbl.dropIndex("vec_idx");
|
||||||
const indices2 = await tbl.listIndices();
|
const indices2 = await tbl.listIndices();
|
||||||
expect(indices2.length).toBe(0);
|
expect(indices2.length).toBe(0);
|
||||||
@@ -1531,6 +1562,18 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
|
|
||||||
const results = await table.search("hello").toArray();
|
const results = await table.search("hello").toArray();
|
||||||
expect(results[0].text).toBe(data[0].text);
|
expect(results[0].text).toBe(data[0].text);
|
||||||
|
|
||||||
|
const results2 = await table
|
||||||
|
.search(new MatchQuery("hello world", "text"))
|
||||||
|
.toArray();
|
||||||
|
expect(results2.length).toBe(2);
|
||||||
|
|
||||||
|
const results3 = await table
|
||||||
|
.search(
|
||||||
|
new MatchQuery("hello world", "text", { operator: Operator.And }),
|
||||||
|
)
|
||||||
|
.toArray();
|
||||||
|
expect(results3.length).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("full text search without lowercase", async () => {
|
test("full text search without lowercase", async () => {
|
||||||
@@ -1607,6 +1650,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
expect(resultSet.has("fob")).toBe(true);
|
expect(resultSet.has("fob")).toBe(true);
|
||||||
expect(resultSet.has("fo")).toBe(true);
|
expect(resultSet.has("fo")).toBe(true);
|
||||||
expect(resultSet.has("food")).toBe(true);
|
expect(resultSet.has("food")).toBe(true);
|
||||||
|
|
||||||
|
const prefixResults = await table
|
||||||
|
.search(
|
||||||
|
new MatchQuery("foo", "text", { fuzziness: 3, prefixLength: 3 }),
|
||||||
|
)
|
||||||
|
.toArray();
|
||||||
|
expect(prefixResults.length).toBe(2);
|
||||||
|
const resultSet2 = new Set(prefixResults.map((r) => r.text));
|
||||||
|
expect(resultSet2.has("foo")).toBe(true);
|
||||||
|
expect(resultSet2.has("food")).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("full text search boolean query", async () => {
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const data = [
|
||||||
|
{ text: "The cat and dog are playing" },
|
||||||
|
{ text: "The cat is sleeping" },
|
||||||
|
{ text: "The dog is barking" },
|
||||||
|
{ text: "The dog chases the cat" },
|
||||||
|
];
|
||||||
|
const table = await db.createTable("test", data);
|
||||||
|
await table.createIndex("text", {
|
||||||
|
config: Index.fts({ withPosition: false }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const shouldResults = await table
|
||||||
|
.search(
|
||||||
|
new BooleanQuery([
|
||||||
|
[Occur.Should, new MatchQuery("cat", "text")],
|
||||||
|
[Occur.Should, new MatchQuery("dog", "text")],
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.toArray();
|
||||||
|
expect(shouldResults.length).toBe(4);
|
||||||
|
|
||||||
|
const mustResults = await table
|
||||||
|
.search(
|
||||||
|
new BooleanQuery([
|
||||||
|
[Occur.Must, new MatchQuery("cat", "text")],
|
||||||
|
[Occur.Must, new MatchQuery("dog", "text")],
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.toArray();
|
||||||
|
expect(mustResults.length).toBe(2);
|
||||||
|
|
||||||
|
const mustNotResults = await table
|
||||||
|
.search(
|
||||||
|
new BooleanQuery([
|
||||||
|
[Occur.Must, new MatchQuery("cat", "text")],
|
||||||
|
[Occur.MustNot, new MatchQuery("dog", "text")],
|
||||||
|
]),
|
||||||
|
)
|
||||||
|
.toArray();
|
||||||
|
expect(mustNotResults.length).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
|
|||||||
@@ -417,7 +417,9 @@ function inferSchema(
|
|||||||
} else {
|
} else {
|
||||||
const inferredType = inferType(value, path, opts);
|
const inferredType = inferType(value, path, opts);
|
||||||
if (inferredType === undefined) {
|
if (inferredType === undefined) {
|
||||||
throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
|
throw new Error(`Failed to infer data type for field ${path.join(
|
||||||
|
".",
|
||||||
|
)} at row ${rowI}. \
|
||||||
Consider providing an explicit schema.`);
|
Consider providing an explicit schema.`);
|
||||||
}
|
}
|
||||||
pathTree.set(path, inferredType);
|
pathTree.set(path, inferredType);
|
||||||
@@ -799,11 +801,17 @@ async function applyEmbeddingsFromMetadata(
|
|||||||
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
|
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if destination column exists and handle accordingly
|
||||||
if (columns[destColumn] !== undefined) {
|
if (columns[destColumn] !== undefined) {
|
||||||
throw new Error(
|
const existingColumn = columns[destColumn];
|
||||||
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
|
// If the column exists but is all null, we can fill it with embeddings
|
||||||
);
|
if (existingColumn.nullCount !== existingColumn.length) {
|
||||||
|
// Column has non-null values, skip embedding application
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (table.batches.length > 1) {
|
if (table.batches.length > 1) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
||||||
@@ -903,11 +911,23 @@ async function applyEmbeddings<T>(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// Check if destination column exists and handle accordingly
|
||||||
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
|
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
|
||||||
throw new Error(
|
const existingColumn = newColumns[destColumn];
|
||||||
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
|
// If the column exists but is all null, we can fill it with embeddings
|
||||||
|
if (existingColumn.nullCount !== existingColumn.length) {
|
||||||
|
// Column has non-null values, skip embedding application and return table as-is
|
||||||
|
let newTable = new ArrowTable(newColumns);
|
||||||
|
if (schema != null) {
|
||||||
|
newTable = alignTable(newTable, schema as Schema);
|
||||||
|
}
|
||||||
|
return new ArrowTable(
|
||||||
|
new Schema(newTable.schema.fields, schemaMetadata),
|
||||||
|
newTable.batches,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (table.batches.length > 1) {
|
if (table.batches.length > 1) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
||||||
|
|||||||
@@ -64,7 +64,10 @@ export {
|
|||||||
PhraseQuery,
|
PhraseQuery,
|
||||||
BoostQuery,
|
BoostQuery,
|
||||||
MultiMatchQuery,
|
MultiMatchQuery,
|
||||||
|
BooleanQuery,
|
||||||
FullTextQueryType,
|
FullTextQueryType,
|
||||||
|
Operator,
|
||||||
|
Occur,
|
||||||
} from "./query";
|
} from "./query";
|
||||||
|
|
||||||
export {
|
export {
|
||||||
|
|||||||
@@ -448,6 +448,10 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
|||||||
* For best results we recommend tuning this parameter with a benchmark against
|
* For best results we recommend tuning this parameter with a benchmark against
|
||||||
* your actual data to find the smallest possible value that will still give
|
* your actual data to find the smallest possible value that will still give
|
||||||
* you the desired recall.
|
* you the desired recall.
|
||||||
|
*
|
||||||
|
* For more fine grained control over behavior when you have a very narrow filter
|
||||||
|
* you can use `minimumNprobes` and `maximumNprobes`. This method sets both
|
||||||
|
* the minimum and maximum to the same value.
|
||||||
*/
|
*/
|
||||||
nprobes(nprobes: number): VectorQuery {
|
nprobes(nprobes: number): VectorQuery {
|
||||||
super.doCall((inner) => inner.nprobes(nprobes));
|
super.doCall((inner) => inner.nprobes(nprobes));
|
||||||
@@ -455,6 +459,33 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the minimum number of probes used.
|
||||||
|
*
|
||||||
|
* This controls the minimum number of partitions that will be searched. This
|
||||||
|
* parameter will impact every query against a vector index, regardless of the
|
||||||
|
* filter. See `nprobes` for more details. Higher values will increase recall
|
||||||
|
* but will also increase latency.
|
||||||
|
*/
|
||||||
|
minimumNprobes(minimumNprobes: number): VectorQuery {
|
||||||
|
super.doCall((inner) => inner.minimumNprobes(minimumNprobes));
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the maximum number of probes used.
|
||||||
|
*
|
||||||
|
* This controls the maximum number of partitions that will be searched. If this
|
||||||
|
* number is greater than minimumNprobes then the excess partitions will _only_ be
|
||||||
|
* searched if we have not found enough results. This can be useful when there is
|
||||||
|
* a narrow filter to allow these queries to spend more time searching and avoid
|
||||||
|
* potential false negatives.
|
||||||
|
*/
|
||||||
|
maximumNprobes(maximumNprobes: number): VectorQuery {
|
||||||
|
super.doCall((inner) => inner.maximumNprobes(maximumNprobes));
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Set the distance range to use
|
* Set the distance range to use
|
||||||
*
|
*
|
||||||
@@ -762,6 +793,31 @@ export enum FullTextQueryType {
|
|||||||
MatchPhrase = "match_phrase",
|
MatchPhrase = "match_phrase",
|
||||||
Boost = "boost",
|
Boost = "boost",
|
||||||
MultiMatch = "multi_match",
|
MultiMatch = "multi_match",
|
||||||
|
Boolean = "boolean",
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enum representing the logical operators used in full-text queries.
|
||||||
|
*
|
||||||
|
* - `And`: All terms must match.
|
||||||
|
* - `Or`: At least one term must match.
|
||||||
|
*/
|
||||||
|
export enum Operator {
|
||||||
|
And = "AND",
|
||||||
|
Or = "OR",
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enum representing the occurrence of terms in full-text queries.
|
||||||
|
*
|
||||||
|
* - `Must`: The term must be present in the document.
|
||||||
|
* - `Should`: The term should contribute to the document score, but is not required.
|
||||||
|
* - `MustNot`: The term must not be present in the document.
|
||||||
|
*/
|
||||||
|
export enum Occur {
|
||||||
|
Should = "SHOULD",
|
||||||
|
Must = "MUST",
|
||||||
|
MustNot = "MUST_NOT",
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -791,6 +847,7 @@ export function instanceOfFullTextQuery(obj: any): obj is FullTextQuery {
|
|||||||
export class MatchQuery implements FullTextQuery {
|
export class MatchQuery implements FullTextQuery {
|
||||||
/** @ignore */
|
/** @ignore */
|
||||||
public readonly inner: JsFullTextQuery;
|
public readonly inner: JsFullTextQuery;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates an instance of MatchQuery.
|
* Creates an instance of MatchQuery.
|
||||||
*
|
*
|
||||||
@@ -800,6 +857,8 @@ export class MatchQuery implements FullTextQuery {
|
|||||||
* - `boost`: The boost factor for the query (default is 1.0).
|
* - `boost`: The boost factor for the query (default is 1.0).
|
||||||
* - `fuzziness`: The fuzziness level for the query (default is 0).
|
* - `fuzziness`: The fuzziness level for the query (default is 0).
|
||||||
* - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
* - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
||||||
|
* - `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||||
|
* - `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
|
||||||
*/
|
*/
|
||||||
constructor(
|
constructor(
|
||||||
query: string,
|
query: string,
|
||||||
@@ -808,6 +867,8 @@ export class MatchQuery implements FullTextQuery {
|
|||||||
boost?: number;
|
boost?: number;
|
||||||
fuzziness?: number;
|
fuzziness?: number;
|
||||||
maxExpansions?: number;
|
maxExpansions?: number;
|
||||||
|
operator?: Operator;
|
||||||
|
prefixLength?: number;
|
||||||
},
|
},
|
||||||
) {
|
) {
|
||||||
let fuzziness = options?.fuzziness;
|
let fuzziness = options?.fuzziness;
|
||||||
@@ -820,6 +881,8 @@ export class MatchQuery implements FullTextQuery {
|
|||||||
options?.boost ?? 1.0,
|
options?.boost ?? 1.0,
|
||||||
fuzziness,
|
fuzziness,
|
||||||
options?.maxExpansions ?? 50,
|
options?.maxExpansions ?? 50,
|
||||||
|
options?.operator ?? Operator.Or,
|
||||||
|
options?.prefixLength ?? 0,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -836,9 +899,11 @@ export class PhraseQuery implements FullTextQuery {
|
|||||||
*
|
*
|
||||||
* @param query - The phrase to search for in the specified column.
|
* @param query - The phrase to search for in the specified column.
|
||||||
* @param column - The name of the column to search within.
|
* @param column - The name of the column to search within.
|
||||||
|
* @param options - Optional parameters for the phrase query.
|
||||||
|
* - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
|
||||||
*/
|
*/
|
||||||
constructor(query: string, column: string) {
|
constructor(query: string, column: string, options?: { slop?: number }) {
|
||||||
this.inner = JsFullTextQuery.phraseQuery(query, column);
|
this.inner = JsFullTextQuery.phraseQuery(query, column, options?.slop ?? 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
queryType(): FullTextQueryType {
|
queryType(): FullTextQueryType {
|
||||||
@@ -889,18 +954,21 @@ export class MultiMatchQuery implements FullTextQuery {
|
|||||||
* @param columns - An array of column names to search within.
|
* @param columns - An array of column names to search within.
|
||||||
* @param options - Optional parameters for the multi-match query.
|
* @param options - Optional parameters for the multi-match query.
|
||||||
* - `boosts`: An array of boost factors for each column (default is 1.0 for all).
|
* - `boosts`: An array of boost factors for each column (default is 1.0 for all).
|
||||||
|
* - `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||||
*/
|
*/
|
||||||
constructor(
|
constructor(
|
||||||
query: string,
|
query: string,
|
||||||
columns: string[],
|
columns: string[],
|
||||||
options?: {
|
options?: {
|
||||||
boosts?: number[];
|
boosts?: number[];
|
||||||
|
operator?: Operator;
|
||||||
},
|
},
|
||||||
) {
|
) {
|
||||||
this.inner = JsFullTextQuery.multiMatchQuery(
|
this.inner = JsFullTextQuery.multiMatchQuery(
|
||||||
query,
|
query,
|
||||||
columns,
|
columns,
|
||||||
options?.boosts,
|
options?.boosts,
|
||||||
|
options?.operator ?? Operator.Or,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -908,3 +976,23 @@ export class MultiMatchQuery implements FullTextQuery {
|
|||||||
return FullTextQueryType.MultiMatch;
|
return FullTextQueryType.MultiMatch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export class BooleanQuery implements FullTextQuery {
|
||||||
|
/** @ignore */
|
||||||
|
public readonly inner: JsFullTextQuery;
|
||||||
|
/**
|
||||||
|
* Creates an instance of BooleanQuery.
|
||||||
|
*
|
||||||
|
* @param queries - An array of (Occur, FullTextQuery objects) to combine.
|
||||||
|
* Occur specifies whether the query must match, or should match.
|
||||||
|
*/
|
||||||
|
constructor(queries: [Occur, FullTextQuery][]) {
|
||||||
|
this.inner = JsFullTextQuery.booleanQuery(
|
||||||
|
queries.map(([occur, query]) => [occur, query.inner]),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
queryType(): FullTextQueryType {
|
||||||
|
return FullTextQueryType.Boolean;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-musl.node",
|
"main": "lancedb.linux-arm64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-musl.node",
|
"main": "lancedb.linux-x64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"private": false,
|
"private": false,
|
||||||
"version": "0.20.0-beta.1",
|
"version": "0.20.1-beta.2",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
|
|||||||
@@ -4,7 +4,8 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use lancedb::index::scalar::{
|
use lancedb::index::scalar::{
|
||||||
BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, PhraseQuery,
|
BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
|
||||||
|
Operator, PhraseQuery,
|
||||||
};
|
};
|
||||||
use lancedb::query::ExecutableQuery;
|
use lancedb::query::ExecutableQuery;
|
||||||
use lancedb::query::Query as LanceDbQuery;
|
use lancedb::query::Query as LanceDbQuery;
|
||||||
@@ -177,6 +178,31 @@ impl VectorQuery {
|
|||||||
self.inner = self.inner.clone().nprobes(nprobe as usize);
|
self.inner = self.inner.clone().nprobes(nprobe as usize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi]
|
||||||
|
pub fn minimum_nprobes(&mut self, minimum_nprobe: u32) -> napi::Result<()> {
|
||||||
|
self.inner = self
|
||||||
|
.inner
|
||||||
|
.clone()
|
||||||
|
.minimum_nprobes(minimum_nprobe as usize)
|
||||||
|
.default_error()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[napi]
|
||||||
|
pub fn maximum_nprobes(&mut self, maximum_nprobes: u32) -> napi::Result<()> {
|
||||||
|
let maximum_nprobes = if maximum_nprobes == 0 {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(maximum_nprobes as usize)
|
||||||
|
};
|
||||||
|
self.inner = self
|
||||||
|
.inner
|
||||||
|
.clone()
|
||||||
|
.maximum_nprobes(maximum_nprobes)
|
||||||
|
.default_error()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[napi]
|
#[napi]
|
||||||
pub fn distance_range(&mut self, lower_bound: Option<f64>, upper_bound: Option<f64>) {
|
pub fn distance_range(&mut self, lower_bound: Option<f64>, upper_bound: Option<f64>) {
|
||||||
// napi doesn't support f32, so we have to convert to f32
|
// napi doesn't support f32, so we have to convert to f32
|
||||||
@@ -308,6 +334,8 @@ impl JsFullTextQuery {
|
|||||||
boost: f64,
|
boost: f64,
|
||||||
fuzziness: Option<u32>,
|
fuzziness: Option<u32>,
|
||||||
max_expansions: u32,
|
max_expansions: u32,
|
||||||
|
operator: String,
|
||||||
|
prefix_length: u32,
|
||||||
) -> napi::Result<Self> {
|
) -> napi::Result<Self> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: MatchQuery::new(query)
|
inner: MatchQuery::new(query)
|
||||||
@@ -315,14 +343,23 @@ impl JsFullTextQuery {
|
|||||||
.with_boost(boost as f32)
|
.with_boost(boost as f32)
|
||||||
.with_fuzziness(fuzziness)
|
.with_fuzziness(fuzziness)
|
||||||
.with_max_expansions(max_expansions as usize)
|
.with_max_expansions(max_expansions as usize)
|
||||||
|
.with_operator(
|
||||||
|
Operator::try_from(operator.as_str()).map_err(|e| {
|
||||||
|
napi::Error::from_reason(format!("Invalid operator: {}", e))
|
||||||
|
})?,
|
||||||
|
)
|
||||||
|
.with_prefix_length(prefix_length)
|
||||||
.into(),
|
.into(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[napi(factory)]
|
#[napi(factory)]
|
||||||
pub fn phrase_query(query: String, column: String) -> napi::Result<Self> {
|
pub fn phrase_query(query: String, column: String, slop: u32) -> napi::Result<Self> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: PhraseQuery::new(query).with_column(Some(column)).into(),
|
inner: PhraseQuery::new(query)
|
||||||
|
.with_column(Some(column))
|
||||||
|
.with_slop(slop)
|
||||||
|
.into(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -348,6 +385,7 @@ impl JsFullTextQuery {
|
|||||||
query: String,
|
query: String,
|
||||||
columns: Vec<String>,
|
columns: Vec<String>,
|
||||||
boosts: Option<Vec<f64>>,
|
boosts: Option<Vec<f64>>,
|
||||||
|
operator: String,
|
||||||
) -> napi::Result<Self> {
|
) -> napi::Result<Self> {
|
||||||
let q = match boosts {
|
let q = match boosts {
|
||||||
Some(boosts) => MultiMatchQuery::try_new(query, columns)
|
Some(boosts) => MultiMatchQuery::try_new(query, columns)
|
||||||
@@ -358,7 +396,37 @@ impl JsFullTextQuery {
|
|||||||
napi::Error::from_reason(format!("Failed to create multi match query: {}", e))
|
napi::Error::from_reason(format!("Failed to create multi match query: {}", e))
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
Ok(Self { inner: q.into() })
|
let operator = Operator::try_from(operator.as_str()).map_err(|e| {
|
||||||
|
napi::Error::from_reason(format!("Invalid operator for multi match query: {}", e))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
inner: q.with_operator(operator).into(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[napi(factory)]
|
||||||
|
pub fn boolean_query(queries: Vec<(String, &JsFullTextQuery)>) -> napi::Result<Self> {
|
||||||
|
let mut sub_queries = Vec::with_capacity(queries.len());
|
||||||
|
for (occur, q) in queries {
|
||||||
|
let occur = Occur::try_from(occur.as_str())
|
||||||
|
.map_err(|e| napi::Error::from_reason(e.to_string()))?;
|
||||||
|
sub_queries.push((occur, q.inner.clone()));
|
||||||
|
}
|
||||||
|
Ok(Self {
|
||||||
|
inner: BooleanQuery::new(sub_queries).into(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[napi(getter)]
|
||||||
|
pub fn query_type(&self) -> String {
|
||||||
|
match self.inner {
|
||||||
|
FtsQuery::Match(_) => "match".to_string(),
|
||||||
|
FtsQuery::Phrase(_) => "phrase".to_string(),
|
||||||
|
FtsQuery::Boost(_) => "boost".to_string(),
|
||||||
|
FtsQuery::MultiMatch(_) => "multi_match".to_string(),
|
||||||
|
FtsQuery::Boolean(_) => "boolean".to_string(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.23.0-beta.2"
|
current_version = "0.24.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.23.0-beta.2"
|
version = "0.24.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -143,6 +143,8 @@ class VectorQuery:
|
|||||||
def postfilter(self): ...
|
def postfilter(self): ...
|
||||||
def refine_factor(self, refine_factor: int): ...
|
def refine_factor(self, refine_factor: int): ...
|
||||||
def nprobes(self, nprobes: int): ...
|
def nprobes(self, nprobes: int): ...
|
||||||
|
def minimum_nprobes(self, minimum_nprobes: int): ...
|
||||||
|
def maximum_nprobes(self, maximum_nprobes: int): ...
|
||||||
def bypass_vector_index(self): ...
|
def bypass_vector_index(self): ...
|
||||||
def nearest_to_text(self, query: dict) -> HybridQuery: ...
|
def nearest_to_text(self, query: dict) -> HybridQuery: ...
|
||||||
def to_query_request(self) -> PyQueryRequest: ...
|
def to_query_request(self) -> PyQueryRequest: ...
|
||||||
@@ -158,6 +160,8 @@ class HybridQuery:
|
|||||||
def distance_type(self, distance_type: str): ...
|
def distance_type(self, distance_type: str): ...
|
||||||
def refine_factor(self, refine_factor: int): ...
|
def refine_factor(self, refine_factor: int): ...
|
||||||
def nprobes(self, nprobes: int): ...
|
def nprobes(self, nprobes: int): ...
|
||||||
|
def minimum_nprobes(self, minimum_nprobes: int): ...
|
||||||
|
def maximum_nprobes(self, maximum_nprobes: int): ...
|
||||||
def bypass_vector_index(self): ...
|
def bypass_vector_index(self): ...
|
||||||
def to_vector_query(self) -> VectorQuery: ...
|
def to_vector_query(self) -> VectorQuery: ...
|
||||||
def to_fts_query(self) -> FTSQuery: ...
|
def to_fts_query(self) -> FTSQuery: ...
|
||||||
@@ -165,23 +169,21 @@ class HybridQuery:
|
|||||||
def get_with_row_id(self) -> bool: ...
|
def get_with_row_id(self) -> bool: ...
|
||||||
def to_query_request(self) -> PyQueryRequest: ...
|
def to_query_request(self) -> PyQueryRequest: ...
|
||||||
|
|
||||||
class PyFullTextSearchQuery:
|
class FullTextQuery:
|
||||||
columns: Optional[List[str]]
|
pass
|
||||||
query: str
|
|
||||||
limit: Optional[int]
|
|
||||||
wand_factor: Optional[float]
|
|
||||||
|
|
||||||
class PyQueryRequest:
|
class PyQueryRequest:
|
||||||
limit: Optional[int]
|
limit: Optional[int]
|
||||||
offset: Optional[int]
|
offset: Optional[int]
|
||||||
filter: Optional[Union[str, bytes]]
|
filter: Optional[Union[str, bytes]]
|
||||||
full_text_search: Optional[PyFullTextSearchQuery]
|
full_text_search: Optional[FullTextQuery]
|
||||||
select: Optional[Union[str, List[str]]]
|
select: Optional[Union[str, List[str]]]
|
||||||
fast_search: Optional[bool]
|
fast_search: Optional[bool]
|
||||||
with_row_id: Optional[bool]
|
with_row_id: Optional[bool]
|
||||||
column: Optional[str]
|
column: Optional[str]
|
||||||
query_vector: Optional[List[pa.Array]]
|
query_vector: Optional[List[pa.Array]]
|
||||||
nprobes: Optional[int]
|
minimum_nprobes: Optional[int]
|
||||||
|
maximum_nprobes: Optional[int]
|
||||||
lower_bound: Optional[float]
|
lower_bound: Optional[float]
|
||||||
upper_bound: Optional[float]
|
upper_bound: Optional[float]
|
||||||
ef: Optional[int]
|
ef: Optional[int]
|
||||||
|
|||||||
@@ -4,7 +4,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
import abc
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
@@ -88,15 +87,28 @@ def ensure_vector_query(
|
|||||||
return val
|
return val
|
||||||
|
|
||||||
|
|
||||||
class FullTextQueryType(Enum):
|
class FullTextQueryType(str, Enum):
|
||||||
MATCH = "match"
|
MATCH = "match"
|
||||||
MATCH_PHRASE = "match_phrase"
|
MATCH_PHRASE = "match_phrase"
|
||||||
BOOST = "boost"
|
BOOST = "boost"
|
||||||
MULTI_MATCH = "multi_match"
|
MULTI_MATCH = "multi_match"
|
||||||
|
BOOLEAN = "boolean"
|
||||||
|
|
||||||
|
|
||||||
class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
class FullTextOperator(str, Enum):
|
||||||
@abc.abstractmethod
|
AND = "AND"
|
||||||
|
OR = "OR"
|
||||||
|
|
||||||
|
|
||||||
|
class Occur(str, Enum):
|
||||||
|
SHOULD = "SHOULD"
|
||||||
|
MUST = "MUST"
|
||||||
|
MUST_NOT = "MUST_NOT"
|
||||||
|
|
||||||
|
|
||||||
|
@pydantic.dataclasses.dataclass
|
||||||
|
class FullTextQuery(ABC):
|
||||||
|
@abstractmethod
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
"""
|
"""
|
||||||
Get the query type of the query.
|
Get the query type of the query.
|
||||||
@@ -106,35 +118,43 @@ class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
|||||||
str
|
str
|
||||||
The type of the query.
|
The type of the query.
|
||||||
"""
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
@abc.abstractmethod
|
def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
|
||||||
def to_dict(self) -> dict:
|
|
||||||
"""
|
"""
|
||||||
Convert the query to a dictionary.
|
Combine two queries with a logical AND operation.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
other : FullTextQuery
|
||||||
|
The other query to combine with.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
dict
|
FullTextQuery
|
||||||
The query as a dictionary.
|
A new query that combines both queries with AND.
|
||||||
"""
|
"""
|
||||||
|
return BooleanQuery([(Occur.MUST, self), (Occur.MUST, other)])
|
||||||
|
|
||||||
|
def __or__(self, other: "FullTextQuery") -> "FullTextQuery":
|
||||||
|
"""
|
||||||
|
Combine two queries with a logical OR operation.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
other : FullTextQuery
|
||||||
|
The other query to combine with.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FullTextQuery
|
||||||
|
A new query that combines both queries with OR.
|
||||||
|
"""
|
||||||
|
return BooleanQuery([(Occur.SHOULD, self), (Occur.SHOULD, other)])
|
||||||
|
|
||||||
|
|
||||||
|
@pydantic.dataclasses.dataclass
|
||||||
class MatchQuery(FullTextQuery):
|
class MatchQuery(FullTextQuery):
|
||||||
query: str
|
|
||||||
column: str
|
|
||||||
boost: float = 1.0
|
|
||||||
fuzziness: int = 0
|
|
||||||
max_expansions: int = 50
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
column: str,
|
|
||||||
*,
|
|
||||||
boost: float = 1.0,
|
|
||||||
fuzziness: int = 0,
|
|
||||||
max_expansions: int = 50,
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Match query for full-text search.
|
Match query for full-text search.
|
||||||
|
|
||||||
@@ -157,36 +177,30 @@ class MatchQuery(FullTextQuery):
|
|||||||
max_expansions : int, optional
|
max_expansions : int, optional
|
||||||
The maximum number of terms to consider for fuzzy matching.
|
The maximum number of terms to consider for fuzzy matching.
|
||||||
Defaults to 50.
|
Defaults to 50.
|
||||||
|
operator : FullTextOperator, default OR
|
||||||
|
The operator to use for combining the query results.
|
||||||
|
Can be either `AND` or `OR`.
|
||||||
|
If `AND`, all terms in the query must match.
|
||||||
|
If `OR`, at least one term in the query must match.
|
||||||
|
prefix_length : int, optional
|
||||||
|
The number of beginning characters being unchanged for fuzzy matching.
|
||||||
|
This is useful to achieve prefix matching.
|
||||||
"""
|
"""
|
||||||
super().__init__(
|
|
||||||
query=query,
|
query: str
|
||||||
column=column,
|
column: str
|
||||||
boost=boost,
|
boost: float = pydantic.Field(1.0, kw_only=True)
|
||||||
fuzziness=fuzziness,
|
fuzziness: int = pydantic.Field(0, kw_only=True)
|
||||||
max_expansions=max_expansions,
|
max_expansions: int = pydantic.Field(50, kw_only=True)
|
||||||
)
|
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
||||||
|
prefix_length: int = pydantic.Field(0, kw_only=True)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.MATCH
|
return FullTextQueryType.MATCH
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
|
||||||
return {
|
|
||||||
"match": {
|
|
||||||
self.column: {
|
|
||||||
"query": self.query,
|
|
||||||
"boost": self.boost,
|
|
||||||
"fuzziness": self.fuzziness,
|
|
||||||
"max_expansions": self.max_expansions,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
@pydantic.dataclasses.dataclass
|
||||||
class PhraseQuery(FullTextQuery):
|
class PhraseQuery(FullTextQuery):
|
||||||
query: str
|
|
||||||
column: str
|
|
||||||
|
|
||||||
def __init__(self, query: str, column: str):
|
|
||||||
"""
|
"""
|
||||||
Phrase query for full-text search.
|
Phrase query for full-text search.
|
||||||
|
|
||||||
@@ -197,31 +211,17 @@ class PhraseQuery(FullTextQuery):
|
|||||||
column : str
|
column : str
|
||||||
The name of the column to match against.
|
The name of the column to match against.
|
||||||
"""
|
"""
|
||||||
super().__init__(query=query, column=column)
|
|
||||||
|
query: str
|
||||||
|
column: str
|
||||||
|
slop: int = pydantic.Field(0, kw_only=True)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.MATCH_PHRASE
|
return FullTextQueryType.MATCH_PHRASE
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
|
||||||
return {
|
|
||||||
"match_phrase": {
|
|
||||||
self.column: self.query,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
@pydantic.dataclasses.dataclass
|
||||||
class BoostQuery(FullTextQuery):
|
class BoostQuery(FullTextQuery):
|
||||||
positive: FullTextQuery
|
|
||||||
negative: FullTextQuery
|
|
||||||
negative_boost: float = 0.5
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
positive: FullTextQuery,
|
|
||||||
negative: FullTextQuery,
|
|
||||||
*,
|
|
||||||
negative_boost: float = 0.5,
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Boost query for full-text search.
|
Boost query for full-text search.
|
||||||
|
|
||||||
@@ -231,68 +231,65 @@ class BoostQuery(FullTextQuery):
|
|||||||
The positive query object.
|
The positive query object.
|
||||||
negative : dict
|
negative : dict
|
||||||
The negative query object.
|
The negative query object.
|
||||||
negative_boost : float
|
negative_boost : float, default 0.5
|
||||||
The boost factor for the negative query.
|
The boost factor for the negative query.
|
||||||
"""
|
"""
|
||||||
super().__init__(
|
|
||||||
positive=positive, negative=negative, negative_boost=negative_boost
|
positive: FullTextQuery
|
||||||
)
|
negative: FullTextQuery
|
||||||
|
negative_boost: float = pydantic.Field(0.5, kw_only=True)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.BOOST
|
return FullTextQueryType.BOOST
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
|
||||||
return {
|
|
||||||
"boost": {
|
|
||||||
"positive": self.positive.to_dict(),
|
|
||||||
"negative": self.negative.to_dict(),
|
|
||||||
"negative_boost": self.negative_boost,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
@pydantic.dataclasses.dataclass
|
||||||
class MultiMatchQuery(FullTextQuery):
|
class MultiMatchQuery(FullTextQuery):
|
||||||
query: str
|
|
||||||
columns: list[str]
|
|
||||||
boosts: list[float]
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
columns: list[str],
|
|
||||||
*,
|
|
||||||
boosts: Optional[list[float]] = None,
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Multi-match query for full-text search.
|
Multi-match query for full-text search.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
query : str
|
query : str | list[Query]
|
||||||
The query string to match against.
|
If a string, the query string to match against.
|
||||||
|
|
||||||
columns : list[str]
|
columns : list[str]
|
||||||
The list of columns to match against.
|
The list of columns to match against.
|
||||||
|
|
||||||
boosts : list[float], optional
|
boosts : list[float], optional
|
||||||
The list of boost factors for each column. If not provided,
|
The list of boost factors for each column. If not provided,
|
||||||
all columns will have the same boost factor.
|
all columns will have the same boost factor.
|
||||||
|
operator : FullTextOperator, default OR
|
||||||
|
The operator to use for combining the query results.
|
||||||
|
Can be either `AND` or `OR`.
|
||||||
|
It would be applied to all columns individually.
|
||||||
|
For example, if the operator is `AND`,
|
||||||
|
then the query "hello world" is equal to
|
||||||
|
`match("hello AND world", column1) OR match("hello AND world", column2)`.
|
||||||
"""
|
"""
|
||||||
if boosts is None:
|
|
||||||
boosts = [1.0] * len(columns)
|
query: str
|
||||||
super().__init__(query=query, columns=columns, boosts=boosts)
|
columns: list[str]
|
||||||
|
boosts: Optional[list[float]] = pydantic.Field(None, kw_only=True)
|
||||||
|
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
||||||
|
|
||||||
def query_type(self) -> FullTextQueryType:
|
def query_type(self) -> FullTextQueryType:
|
||||||
return FullTextQueryType.MULTI_MATCH
|
return FullTextQueryType.MULTI_MATCH
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
|
||||||
return {
|
@pydantic.dataclasses.dataclass
|
||||||
"multi_match": {
|
class BooleanQuery(FullTextQuery):
|
||||||
"query": self.query,
|
"""
|
||||||
"columns": self.columns,
|
Boolean query for full-text search.
|
||||||
"boost": self.boosts,
|
|
||||||
}
|
Parameters
|
||||||
}
|
----------
|
||||||
|
queries : list[tuple(Occur, FullTextQuery)]
|
||||||
|
The list of queries with their occurrence requirements.
|
||||||
|
"""
|
||||||
|
|
||||||
|
queries: list[tuple[Occur, FullTextQuery]]
|
||||||
|
|
||||||
|
def query_type(self) -> FullTextQueryType:
|
||||||
|
return FullTextQueryType.BOOLEAN
|
||||||
|
|
||||||
|
|
||||||
class FullTextSearchQuery(pydantic.BaseModel):
|
class FullTextSearchQuery(pydantic.BaseModel):
|
||||||
@@ -445,8 +442,18 @@ class Query(pydantic.BaseModel):
|
|||||||
# which columns to return in the results
|
# which columns to return in the results
|
||||||
columns: Optional[Union[List[str], Dict[str, str]]] = None
|
columns: Optional[Union[List[str], Dict[str, str]]] = None
|
||||||
|
|
||||||
# number of IVF partitions to search
|
# minimum number of IVF partitions to search
|
||||||
nprobes: Optional[int] = None
|
#
|
||||||
|
# If None then a default value (20) will be used.
|
||||||
|
minimum_nprobes: Optional[int] = None
|
||||||
|
|
||||||
|
# maximum number of IVF partitions to search
|
||||||
|
#
|
||||||
|
# If None then a default value (20) will be used.
|
||||||
|
#
|
||||||
|
# If 0 then no limit will be applied and all partitions could be searched
|
||||||
|
# if needed to satisfy the limit.
|
||||||
|
maximum_nprobes: Optional[int] = None
|
||||||
|
|
||||||
# lower bound for distance search
|
# lower bound for distance search
|
||||||
lower_bound: Optional[float] = None
|
lower_bound: Optional[float] = None
|
||||||
@@ -484,7 +491,8 @@ class Query(pydantic.BaseModel):
|
|||||||
query.vector_column = req.column
|
query.vector_column = req.column
|
||||||
query.vector = req.query_vector
|
query.vector = req.query_vector
|
||||||
query.distance_type = req.distance_type
|
query.distance_type = req.distance_type
|
||||||
query.nprobes = req.nprobes
|
query.minimum_nprobes = req.minimum_nprobes
|
||||||
|
query.maximum_nprobes = req.maximum_nprobes
|
||||||
query.lower_bound = req.lower_bound
|
query.lower_bound = req.lower_bound
|
||||||
query.upper_bound = req.upper_bound
|
query.upper_bound = req.upper_bound
|
||||||
query.ef = req.ef
|
query.ef = req.ef
|
||||||
@@ -493,10 +501,8 @@ class Query(pydantic.BaseModel):
|
|||||||
query.postfilter = req.postfilter
|
query.postfilter = req.postfilter
|
||||||
if req.full_text_search is not None:
|
if req.full_text_search is not None:
|
||||||
query.full_text_query = FullTextSearchQuery(
|
query.full_text_query = FullTextSearchQuery(
|
||||||
columns=req.full_text_search.columns,
|
columns=None,
|
||||||
query=req.full_text_search.query,
|
query=req.full_text_search,
|
||||||
limit=req.full_text_search.limit,
|
|
||||||
wand_factor=req.full_text_search.wand_factor,
|
|
||||||
)
|
)
|
||||||
return query
|
return query
|
||||||
|
|
||||||
@@ -1047,7 +1053,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
super().__init__(table)
|
super().__init__(table)
|
||||||
self._query = query
|
self._query = query
|
||||||
self._distance_type = None
|
self._distance_type = None
|
||||||
self._nprobes = None
|
self._minimum_nprobes = None
|
||||||
|
self._maximum_nprobes = None
|
||||||
self._lower_bound = None
|
self._lower_bound = None
|
||||||
self._upper_bound = None
|
self._upper_bound = None
|
||||||
self._refine_factor = None
|
self._refine_factor = None
|
||||||
@@ -1110,6 +1117,10 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
||||||
tuning advice.
|
tuning advice.
|
||||||
|
|
||||||
|
This method sets both the minimum and maximum number of probes to the same
|
||||||
|
value. See `minimum_nprobes` and `maximum_nprobes` for more fine-grained
|
||||||
|
control.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
nprobes: int
|
nprobes: int
|
||||||
@@ -1120,7 +1131,36 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
LanceVectorQueryBuilder
|
LanceVectorQueryBuilder
|
||||||
The LanceQueryBuilder object.
|
The LanceQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
self._nprobes = nprobes
|
self._minimum_nprobes = nprobes
|
||||||
|
self._maximum_nprobes = nprobes
|
||||||
|
return self
|
||||||
|
|
||||||
|
def minimum_nprobes(self, minimum_nprobes: int) -> LanceVectorQueryBuilder:
|
||||||
|
"""Set the minimum number of probes to use.
|
||||||
|
|
||||||
|
See `nprobes` for more details.
|
||||||
|
|
||||||
|
These partitions will be searched on every vector query and will increase recall
|
||||||
|
at the expense of latency.
|
||||||
|
"""
|
||||||
|
self._minimum_nprobes = minimum_nprobes
|
||||||
|
return self
|
||||||
|
|
||||||
|
def maximum_nprobes(self, maximum_nprobes: int) -> LanceVectorQueryBuilder:
|
||||||
|
"""Set the maximum number of probes to use.
|
||||||
|
|
||||||
|
See `nprobes` for more details.
|
||||||
|
|
||||||
|
If this value is greater than `minimum_nprobes` then the excess partitions
|
||||||
|
will be searched only if we have not found enough results.
|
||||||
|
|
||||||
|
This can be useful when there is a narrow filter to allow these queries to
|
||||||
|
spend more time searching and avoid potential false negatives.
|
||||||
|
|
||||||
|
If this value is 0 then no limit will be applied and all partitions could be
|
||||||
|
searched if needed to satisfy the limit.
|
||||||
|
"""
|
||||||
|
self._maximum_nprobes = maximum_nprobes
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def distance_range(
|
def distance_range(
|
||||||
@@ -1224,7 +1264,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
limit=self._limit,
|
limit=self._limit,
|
||||||
distance_type=self._distance_type,
|
distance_type=self._distance_type,
|
||||||
columns=self._columns,
|
columns=self._columns,
|
||||||
nprobes=self._nprobes,
|
minimum_nprobes=self._minimum_nprobes,
|
||||||
|
maximum_nprobes=self._maximum_nprobes,
|
||||||
lower_bound=self._lower_bound,
|
lower_bound=self._lower_bound,
|
||||||
upper_bound=self._upper_bound,
|
upper_bound=self._upper_bound,
|
||||||
refine_factor=self._refine_factor,
|
refine_factor=self._refine_factor,
|
||||||
@@ -1410,10 +1451,13 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
|||||||
|
|
||||||
query = self._query
|
query = self._query
|
||||||
if self._phrase_query:
|
if self._phrase_query:
|
||||||
raise NotImplementedError(
|
if isinstance(query, str):
|
||||||
"Phrase query is not yet supported in Lance FTS. "
|
if not query.startswith('"') or not query.endswith('"'):
|
||||||
"Use tantivy-based index instead for now."
|
query = f'"{query}"'
|
||||||
)
|
elif isinstance(query, FullTextQuery) and not isinstance(
|
||||||
|
query, PhraseQuery
|
||||||
|
):
|
||||||
|
raise TypeError("Please use PhraseQuery for phrase queries.")
|
||||||
query = self.to_query_object()
|
query = self.to_query_object()
|
||||||
results = self._table._execute_query(query, timeout=timeout)
|
results = self._table._execute_query(query, timeout=timeout)
|
||||||
results = results.read_all()
|
results = results.read_all()
|
||||||
@@ -1588,7 +1632,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
self._fts_columns = fts_columns
|
self._fts_columns = fts_columns
|
||||||
self._norm = None
|
self._norm = None
|
||||||
self._reranker = None
|
self._reranker = None
|
||||||
self._nprobes = None
|
self._minimum_nprobes = None
|
||||||
|
self._maximum_nprobes = None
|
||||||
self._refine_factor = None
|
self._refine_factor = None
|
||||||
self._distance_type = None
|
self._distance_type = None
|
||||||
self._phrase_query = None
|
self._phrase_query = None
|
||||||
@@ -1820,7 +1865,24 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
LanceHybridQueryBuilder
|
LanceHybridQueryBuilder
|
||||||
The LanceHybridQueryBuilder object.
|
The LanceHybridQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
self._nprobes = nprobes
|
self._minimum_nprobes = nprobes
|
||||||
|
self._maximum_nprobes = nprobes
|
||||||
|
return self
|
||||||
|
|
||||||
|
def minimum_nprobes(self, minimum_nprobes: int) -> LanceHybridQueryBuilder:
|
||||||
|
"""Set the minimum number of probes to use.
|
||||||
|
|
||||||
|
See `nprobes` for more details.
|
||||||
|
"""
|
||||||
|
self._minimum_nprobes = minimum_nprobes
|
||||||
|
return self
|
||||||
|
|
||||||
|
def maximum_nprobes(self, maximum_nprobes: int) -> LanceHybridQueryBuilder:
|
||||||
|
"""Set the maximum number of probes to use.
|
||||||
|
|
||||||
|
See `nprobes` for more details.
|
||||||
|
"""
|
||||||
|
self._maximum_nprobes = maximum_nprobes
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def distance_range(
|
def distance_range(
|
||||||
@@ -2049,8 +2111,10 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
self._fts_query.phrase_query(True)
|
self._fts_query.phrase_query(True)
|
||||||
if self._distance_type:
|
if self._distance_type:
|
||||||
self._vector_query.metric(self._distance_type)
|
self._vector_query.metric(self._distance_type)
|
||||||
if self._nprobes:
|
if self._minimum_nprobes:
|
||||||
self._vector_query.nprobes(self._nprobes)
|
self._vector_query.minimum_nprobes(self._minimum_nprobes)
|
||||||
|
if self._maximum_nprobes is not None:
|
||||||
|
self._vector_query.maximum_nprobes(self._maximum_nprobes)
|
||||||
if self._refine_factor:
|
if self._refine_factor:
|
||||||
self._vector_query.refine_factor(self._refine_factor)
|
self._vector_query.refine_factor(self._refine_factor)
|
||||||
if self._ef:
|
if self._ef:
|
||||||
@@ -2513,7 +2577,7 @@ class AsyncQuery(AsyncQueryBase):
|
|||||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||||
)
|
)
|
||||||
# FullTextQuery object
|
# FullTextQuery object
|
||||||
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))
|
||||||
|
|
||||||
|
|
||||||
class AsyncFTSQuery(AsyncQueryBase):
|
class AsyncFTSQuery(AsyncQueryBase):
|
||||||
@@ -2661,6 +2725,34 @@ class AsyncVectorQueryBase:
|
|||||||
self._inner.nprobes(nprobes)
|
self._inner.nprobes(nprobes)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def minimum_nprobes(self, minimum_nprobes: int) -> Self:
|
||||||
|
"""Set the minimum number of probes to use.
|
||||||
|
|
||||||
|
See `nprobes` for more details.
|
||||||
|
|
||||||
|
These partitions will be searched on every indexed vector query and will
|
||||||
|
increase recall at the expense of latency.
|
||||||
|
"""
|
||||||
|
self._inner.minimum_nprobes(minimum_nprobes)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def maximum_nprobes(self, maximum_nprobes: int) -> Self:
|
||||||
|
"""Set the maximum number of probes to use.
|
||||||
|
|
||||||
|
See `nprobes` for more details.
|
||||||
|
|
||||||
|
If this value is greater than `minimum_nprobes` then the excess partitions
|
||||||
|
will be searched only if we have not found enough results.
|
||||||
|
|
||||||
|
This can be useful when there is a narrow filter to allow these queries to
|
||||||
|
spend more time searching and avoid potential false negatives.
|
||||||
|
|
||||||
|
If this value is 0 then no limit will be applied and all partitions could be
|
||||||
|
searched if needed to satisfy the limit.
|
||||||
|
"""
|
||||||
|
self._inner.maximum_nprobes(maximum_nprobes)
|
||||||
|
return self
|
||||||
|
|
||||||
def distance_range(
|
def distance_range(
|
||||||
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
|
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
|
||||||
) -> Self:
|
) -> Self:
|
||||||
@@ -2835,7 +2927,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
|||||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||||
)
|
)
|
||||||
# FullTextQuery object
|
# FullTextQuery object
|
||||||
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query}))
|
||||||
|
|
||||||
async def to_batches(
|
async def to_batches(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -827,7 +827,7 @@ class Table(ABC):
|
|||||||
ordering_field_names: Optional[Union[str, List[str]]] = None,
|
ordering_field_names: Optional[Union[str, List[str]]] = None,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||||
use_tantivy: bool = True,
|
use_tantivy: bool = False,
|
||||||
tokenizer_name: Optional[str] = None,
|
tokenizer_name: Optional[str] = None,
|
||||||
with_position: bool = False,
|
with_position: bool = False,
|
||||||
# tokenizer configs:
|
# tokenizer configs:
|
||||||
@@ -864,7 +864,7 @@ class Table(ABC):
|
|||||||
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
|
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
|
||||||
language code followed by "_stem". So for english it would be "en_stem".
|
language code followed by "_stem". So for english it would be "en_stem".
|
||||||
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
|
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
|
||||||
use_tantivy: bool, default True
|
use_tantivy: bool, default False
|
||||||
If True, use the legacy full-text search implementation based on tantivy.
|
If True, use the legacy full-text search implementation based on tantivy.
|
||||||
If False, use the new full-text search implementation based on lance-index.
|
If False, use the new full-text search implementation based on lance-index.
|
||||||
with_position: bool, default False
|
with_position: bool, default False
|
||||||
@@ -1970,7 +1970,7 @@ class LanceTable(Table):
|
|||||||
ordering_field_names: Optional[Union[str, List[str]]] = None,
|
ordering_field_names: Optional[Union[str, List[str]]] = None,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||||
use_tantivy: bool = True,
|
use_tantivy: bool = False,
|
||||||
tokenizer_name: Optional[str] = None,
|
tokenizer_name: Optional[str] = None,
|
||||||
with_position: bool = False,
|
with_position: bool = False,
|
||||||
# tokenizer configs:
|
# tokenizer configs:
|
||||||
@@ -3637,8 +3637,10 @@ class AsyncTable:
|
|||||||
)
|
)
|
||||||
if query.distance_type is not None:
|
if query.distance_type is not None:
|
||||||
async_query = async_query.distance_type(query.distance_type)
|
async_query = async_query.distance_type(query.distance_type)
|
||||||
if query.nprobes is not None:
|
if query.minimum_nprobes is not None:
|
||||||
async_query = async_query.nprobes(query.nprobes)
|
async_query = async_query.minimum_nprobes(query.minimum_nprobes)
|
||||||
|
if query.maximum_nprobes is not None:
|
||||||
|
async_query = async_query.maximum_nprobes(query.maximum_nprobes)
|
||||||
if query.refine_factor is not None:
|
if query.refine_factor is not None:
|
||||||
async_query = async_query.refine_factor(query.refine_factor)
|
async_query = async_query.refine_factor(query.refine_factor)
|
||||||
if query.vector_column:
|
if query.vector_column:
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import lancedb
|
|||||||
|
|
||||||
# --8<-- [end:import-lancedb]
|
# --8<-- [end:import-lancedb]
|
||||||
# --8<-- [start:import-numpy]
|
# --8<-- [start:import-numpy]
|
||||||
from lancedb.query import BoostQuery, MatchQuery
|
from lancedb.query import BooleanQuery, BoostQuery, MatchQuery, Occur
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
|
||||||
@@ -191,6 +191,15 @@ def test_fts_fuzzy_query():
|
|||||||
"food", # 1 insertion
|
"food", # 1 insertion
|
||||||
}
|
}
|
||||||
|
|
||||||
|
results = table.search(
|
||||||
|
MatchQuery("foo", "text", fuzziness=1, prefix_length=3)
|
||||||
|
).to_pandas()
|
||||||
|
assert len(results) == 2
|
||||||
|
assert set(results["text"].to_list()) == {
|
||||||
|
"foo",
|
||||||
|
"food",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||||
@@ -240,6 +249,60 @@ def test_fts_boost_query():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||||
|
)
|
||||||
|
def test_fts_boolean_query(tmp_path):
|
||||||
|
uri = tmp_path / "boolean-example"
|
||||||
|
db = lancedb.connect(uri)
|
||||||
|
table = db.create_table(
|
||||||
|
"my_table_fts_boolean",
|
||||||
|
data=[
|
||||||
|
{"text": "The cat and dog are playing"},
|
||||||
|
{"text": "The cat is sleeping"},
|
||||||
|
{"text": "The dog is barking"},
|
||||||
|
{"text": "The dog chases the cat"},
|
||||||
|
],
|
||||||
|
mode="overwrite",
|
||||||
|
)
|
||||||
|
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||||
|
|
||||||
|
# SHOULD
|
||||||
|
results = table.search(
|
||||||
|
MatchQuery("cat", "text") | MatchQuery("dog", "text")
|
||||||
|
).to_pandas()
|
||||||
|
assert len(results) == 4
|
||||||
|
assert set(results["text"].to_list()) == {
|
||||||
|
"The cat and dog are playing",
|
||||||
|
"The cat is sleeping",
|
||||||
|
"The dog is barking",
|
||||||
|
"The dog chases the cat",
|
||||||
|
}
|
||||||
|
# MUST
|
||||||
|
results = table.search(
|
||||||
|
MatchQuery("cat", "text") & MatchQuery("dog", "text")
|
||||||
|
).to_pandas()
|
||||||
|
assert len(results) == 2
|
||||||
|
assert set(results["text"].to_list()) == {
|
||||||
|
"The cat and dog are playing",
|
||||||
|
"The dog chases the cat",
|
||||||
|
}
|
||||||
|
|
||||||
|
# MUST NOT
|
||||||
|
results = table.search(
|
||||||
|
BooleanQuery(
|
||||||
|
[
|
||||||
|
(Occur.MUST, MatchQuery("cat", "text")),
|
||||||
|
(Occur.MUST_NOT, MatchQuery("dog", "text")),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
).to_pandas()
|
||||||
|
assert len(results) == 1
|
||||||
|
assert set(results["text"].to_list()) == {
|
||||||
|
"The cat is sleeping",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -215,6 +215,19 @@ def test_search_fts(table, use_tantivy):
|
|||||||
assert len(results) == 5
|
assert len(results) == 5
|
||||||
assert len(results[0]) == 3 # id, text, _score
|
assert len(results[0]) == 3 # id, text, _score
|
||||||
|
|
||||||
|
# Test boolean query
|
||||||
|
results = (
|
||||||
|
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||||
|
.select(["id", "text"])
|
||||||
|
.limit(5)
|
||||||
|
.to_list()
|
||||||
|
)
|
||||||
|
assert len(results) == 5
|
||||||
|
assert len(results[0]) == 3 # id, text, _score
|
||||||
|
for r in results:
|
||||||
|
assert "puppy" in r["text"]
|
||||||
|
assert "runs" in r["text"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_fts_select_async(async_table):
|
async def test_fts_select_async(async_table):
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ from lancedb.query import (
|
|||||||
AsyncQueryBase,
|
AsyncQueryBase,
|
||||||
AsyncVectorQuery,
|
AsyncVectorQuery,
|
||||||
LanceVectorQueryBuilder,
|
LanceVectorQueryBuilder,
|
||||||
|
MatchQuery,
|
||||||
|
PhraseQuery,
|
||||||
Query,
|
Query,
|
||||||
FullTextSearchQuery,
|
FullTextSearchQuery,
|
||||||
)
|
)
|
||||||
@@ -437,6 +439,33 @@ def test_query_builder_with_filter(table):
|
|||||||
assert all(np.array(rs[0]["vector"]) == [3, 4])
|
assert all(np.array(rs[0]["vector"]) == [3, 4])
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_nprobes_sync(table):
|
||||||
|
with pytest.raises(ValueError, match="minimum_nprobes must be greater than 0"):
|
||||||
|
LanceVectorQueryBuilder(table, [0, 0], "vector").minimum_nprobes(0).to_list()
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError, match="maximum_nprobes must be greater than minimum_nprobes"
|
||||||
|
):
|
||||||
|
LanceVectorQueryBuilder(table, [0, 0], "vector").maximum_nprobes(5).to_list()
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError, match="minimum_nprobes must be less or equal to maximum_nprobes"
|
||||||
|
):
|
||||||
|
LanceVectorQueryBuilder(table, [0, 0], "vector").minimum_nprobes(100).to_list()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_invalid_nprobes_async(table_async: AsyncTable):
|
||||||
|
with pytest.raises(ValueError, match="minimum_nprobes must be greater than 0"):
|
||||||
|
await table_async.vector_search([0, 0]).minimum_nprobes(0).to_list()
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError, match="maximum_nprobes must be greater than minimum_nprobes"
|
||||||
|
):
|
||||||
|
await table_async.vector_search([0, 0]).maximum_nprobes(5).to_list()
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError, match="minimum_nprobes must be less or equal to maximum_nprobes"
|
||||||
|
):
|
||||||
|
await table_async.vector_search([0, 0]).minimum_nprobes(100).to_list()
|
||||||
|
|
||||||
|
|
||||||
def test_query_builder_with_prefilter(table):
|
def test_query_builder_with_prefilter(table):
|
||||||
df = (
|
df = (
|
||||||
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||||
@@ -583,6 +612,21 @@ async def test_query_async(table_async: AsyncTable):
|
|||||||
table_async.query().nearest_to(pa.array([1, 2])).nprobes(10),
|
table_async.query().nearest_to(pa.array([1, 2])).nprobes(10),
|
||||||
expected_num_rows=2,
|
expected_num_rows=2,
|
||||||
)
|
)
|
||||||
|
await check_query(
|
||||||
|
table_async.query().nearest_to(pa.array([1, 2])).minimum_nprobes(10),
|
||||||
|
expected_num_rows=2,
|
||||||
|
)
|
||||||
|
await check_query(
|
||||||
|
table_async.query().nearest_to(pa.array([1, 2])).maximum_nprobes(30),
|
||||||
|
expected_num_rows=2,
|
||||||
|
)
|
||||||
|
await check_query(
|
||||||
|
table_async.query()
|
||||||
|
.nearest_to(pa.array([1, 2]))
|
||||||
|
.minimum_nprobes(10)
|
||||||
|
.maximum_nprobes(20),
|
||||||
|
expected_num_rows=2,
|
||||||
|
)
|
||||||
await check_query(
|
await check_query(
|
||||||
table_async.query().nearest_to(pa.array([1, 2])).bypass_vector_index(),
|
table_async.query().nearest_to(pa.array([1, 2])).bypass_vector_index(),
|
||||||
expected_num_rows=2,
|
expected_num_rows=2,
|
||||||
@@ -909,7 +953,39 @@ def test_query_serialization_sync(table: lancedb.table.Table):
|
|||||||
|
|
||||||
q = table.search([5.0, 6.0]).nprobes(10).refine_factor(5).to_query_object()
|
q = table.search([5.0, 6.0]).nprobes(10).refine_factor(5).to_query_object()
|
||||||
check_set_props(
|
check_set_props(
|
||||||
q, vector_column="vector", vector=[5.0, 6.0], nprobes=10, refine_factor=5
|
q,
|
||||||
|
vector_column="vector",
|
||||||
|
vector=[5.0, 6.0],
|
||||||
|
minimum_nprobes=10,
|
||||||
|
maximum_nprobes=10,
|
||||||
|
refine_factor=5,
|
||||||
|
)
|
||||||
|
|
||||||
|
q = table.search([5.0, 6.0]).minimum_nprobes(10).to_query_object()
|
||||||
|
check_set_props(
|
||||||
|
q,
|
||||||
|
vector_column="vector",
|
||||||
|
vector=[5.0, 6.0],
|
||||||
|
minimum_nprobes=10,
|
||||||
|
maximum_nprobes=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
q = table.search([5.0, 6.0]).nprobes(50).to_query_object()
|
||||||
|
check_set_props(
|
||||||
|
q,
|
||||||
|
vector_column="vector",
|
||||||
|
vector=[5.0, 6.0],
|
||||||
|
minimum_nprobes=50,
|
||||||
|
maximum_nprobes=50,
|
||||||
|
)
|
||||||
|
|
||||||
|
q = table.search([5.0, 6.0]).maximum_nprobes(10).to_query_object()
|
||||||
|
check_set_props(
|
||||||
|
q,
|
||||||
|
vector_column="vector",
|
||||||
|
vector=[5.0, 6.0],
|
||||||
|
maximum_nprobes=10,
|
||||||
|
minimum_nprobes=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
q = table.search([5.0, 6.0]).distance_range(0.0, 1.0).to_query_object()
|
q = table.search([5.0, 6.0]).distance_range(0.0, 1.0).to_query_object()
|
||||||
@@ -961,7 +1037,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
limit=10,
|
limit=10,
|
||||||
vector=sample_vector,
|
vector=sample_vector,
|
||||||
postfilter=False,
|
postfilter=False,
|
||||||
nprobes=20,
|
minimum_nprobes=20,
|
||||||
|
maximum_nprobes=20,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
bypass_vector_index=False,
|
bypass_vector_index=False,
|
||||||
)
|
)
|
||||||
@@ -971,7 +1048,20 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
q,
|
q,
|
||||||
vector=sample_vector,
|
vector=sample_vector,
|
||||||
postfilter=False,
|
postfilter=False,
|
||||||
nprobes=20,
|
minimum_nprobes=20,
|
||||||
|
maximum_nprobes=20,
|
||||||
|
with_row_id=False,
|
||||||
|
bypass_vector_index=False,
|
||||||
|
limit=10,
|
||||||
|
)
|
||||||
|
|
||||||
|
q = (await table_async.search([5.0, 6.0])).nprobes(50).to_query_object()
|
||||||
|
check_set_props(
|
||||||
|
q,
|
||||||
|
vector=sample_vector,
|
||||||
|
postfilter=False,
|
||||||
|
minimum_nprobes=50,
|
||||||
|
maximum_nprobes=50,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
bypass_vector_index=False,
|
bypass_vector_index=False,
|
||||||
limit=10,
|
limit=10,
|
||||||
@@ -990,7 +1080,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
filter="id = 1",
|
filter="id = 1",
|
||||||
postfilter=True,
|
postfilter=True,
|
||||||
vector=sample_vector,
|
vector=sample_vector,
|
||||||
nprobes=20,
|
minimum_nprobes=20,
|
||||||
|
maximum_nprobes=20,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
bypass_vector_index=False,
|
bypass_vector_index=False,
|
||||||
)
|
)
|
||||||
@@ -1004,7 +1095,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
check_set_props(
|
check_set_props(
|
||||||
q,
|
q,
|
||||||
vector=sample_vector,
|
vector=sample_vector,
|
||||||
nprobes=10,
|
minimum_nprobes=10,
|
||||||
|
maximum_nprobes=10,
|
||||||
refine_factor=5,
|
refine_factor=5,
|
||||||
postfilter=False,
|
postfilter=False,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
@@ -1012,6 +1104,18 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
limit=10,
|
limit=10,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
q = (await table_async.search([5.0, 6.0])).minimum_nprobes(5).to_query_object()
|
||||||
|
check_set_props(
|
||||||
|
q,
|
||||||
|
vector=sample_vector,
|
||||||
|
minimum_nprobes=5,
|
||||||
|
maximum_nprobes=20,
|
||||||
|
postfilter=False,
|
||||||
|
with_row_id=False,
|
||||||
|
bypass_vector_index=False,
|
||||||
|
limit=10,
|
||||||
|
)
|
||||||
|
|
||||||
q = (
|
q = (
|
||||||
(await table_async.search([5.0, 6.0]))
|
(await table_async.search([5.0, 6.0]))
|
||||||
.distance_range(0.0, 1.0)
|
.distance_range(0.0, 1.0)
|
||||||
@@ -1023,7 +1127,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
lower_bound=0.0,
|
lower_bound=0.0,
|
||||||
upper_bound=1.0,
|
upper_bound=1.0,
|
||||||
postfilter=False,
|
postfilter=False,
|
||||||
nprobes=20,
|
minimum_nprobes=20,
|
||||||
|
maximum_nprobes=20,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
bypass_vector_index=False,
|
bypass_vector_index=False,
|
||||||
limit=10,
|
limit=10,
|
||||||
@@ -1035,7 +1140,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
distance_type="cosine",
|
distance_type="cosine",
|
||||||
vector=sample_vector,
|
vector=sample_vector,
|
||||||
postfilter=False,
|
postfilter=False,
|
||||||
nprobes=20,
|
minimum_nprobes=20,
|
||||||
|
maximum_nprobes=20,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
bypass_vector_index=False,
|
bypass_vector_index=False,
|
||||||
limit=10,
|
limit=10,
|
||||||
@@ -1047,7 +1153,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
ef=7,
|
ef=7,
|
||||||
vector=sample_vector,
|
vector=sample_vector,
|
||||||
postfilter=False,
|
postfilter=False,
|
||||||
nprobes=20,
|
minimum_nprobes=20,
|
||||||
|
maximum_nprobes=20,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
bypass_vector_index=False,
|
bypass_vector_index=False,
|
||||||
limit=10,
|
limit=10,
|
||||||
@@ -1059,24 +1166,34 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
|||||||
bypass_vector_index=True,
|
bypass_vector_index=True,
|
||||||
vector=sample_vector,
|
vector=sample_vector,
|
||||||
postfilter=False,
|
postfilter=False,
|
||||||
nprobes=20,
|
minimum_nprobes=20,
|
||||||
|
maximum_nprobes=20,
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
limit=10,
|
limit=10,
|
||||||
)
|
)
|
||||||
|
|
||||||
# FTS queries
|
# FTS queries
|
||||||
q = (await table_async.search("foo")).limit(10).to_query_object()
|
match_query = MatchQuery("foo", "text")
|
||||||
|
q = (await table_async.search(match_query)).limit(10).to_query_object()
|
||||||
check_set_props(
|
check_set_props(
|
||||||
q,
|
q,
|
||||||
limit=10,
|
limit=10,
|
||||||
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
|
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
q = (await table_async.search("foo", query_type="fts")).to_query_object()
|
q = (await table_async.search(match_query)).to_query_object()
|
||||||
check_set_props(
|
check_set_props(
|
||||||
q,
|
q,
|
||||||
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
|
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
|
||||||
|
with_row_id=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
phrase_query = PhraseQuery("foo", "text", slop=1)
|
||||||
|
q = (await table_async.search(phrase_query)).to_query_object()
|
||||||
|
check_set_props(
|
||||||
|
q,
|
||||||
|
full_text_query=FullTextSearchQuery(columns=None, query=phrase_query),
|
||||||
with_row_id=False,
|
with_row_id=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -496,6 +496,8 @@ def test_query_sync_minimal():
|
|||||||
"ef": None,
|
"ef": None,
|
||||||
"vector": [1.0, 2.0, 3.0],
|
"vector": [1.0, 2.0, 3.0],
|
||||||
"nprobes": 20,
|
"nprobes": 20,
|
||||||
|
"minimum_nprobes": 20,
|
||||||
|
"maximum_nprobes": 20,
|
||||||
"version": None,
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -536,6 +538,8 @@ def test_query_sync_maximal():
|
|||||||
"refine_factor": 10,
|
"refine_factor": 10,
|
||||||
"vector": [1.0, 2.0, 3.0],
|
"vector": [1.0, 2.0, 3.0],
|
||||||
"nprobes": 5,
|
"nprobes": 5,
|
||||||
|
"minimum_nprobes": 5,
|
||||||
|
"maximum_nprobes": 5,
|
||||||
"lower_bound": None,
|
"lower_bound": None,
|
||||||
"upper_bound": None,
|
"upper_bound": None,
|
||||||
"ef": None,
|
"ef": None,
|
||||||
@@ -564,6 +568,66 @@ def test_query_sync_maximal():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_sync_nprobes():
|
||||||
|
def handler(body):
|
||||||
|
assert body == {
|
||||||
|
"distance_type": "l2",
|
||||||
|
"k": 10,
|
||||||
|
"prefilter": True,
|
||||||
|
"fast_search": True,
|
||||||
|
"vector_column": "vector2",
|
||||||
|
"refine_factor": None,
|
||||||
|
"lower_bound": None,
|
||||||
|
"upper_bound": None,
|
||||||
|
"ef": None,
|
||||||
|
"vector": [1.0, 2.0, 3.0],
|
||||||
|
"nprobes": 5,
|
||||||
|
"minimum_nprobes": 5,
|
||||||
|
"maximum_nprobes": 15,
|
||||||
|
"version": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
||||||
|
|
||||||
|
with query_test_table(handler) as table:
|
||||||
|
(
|
||||||
|
table.search([1, 2, 3], vector_column_name="vector2", fast_search=True)
|
||||||
|
.minimum_nprobes(5)
|
||||||
|
.maximum_nprobes(15)
|
||||||
|
.to_list()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_sync_no_max_nprobes():
|
||||||
|
def handler(body):
|
||||||
|
assert body == {
|
||||||
|
"distance_type": "l2",
|
||||||
|
"k": 10,
|
||||||
|
"prefilter": True,
|
||||||
|
"fast_search": True,
|
||||||
|
"vector_column": "vector2",
|
||||||
|
"refine_factor": None,
|
||||||
|
"lower_bound": None,
|
||||||
|
"upper_bound": None,
|
||||||
|
"ef": None,
|
||||||
|
"vector": [1.0, 2.0, 3.0],
|
||||||
|
"nprobes": 5,
|
||||||
|
"minimum_nprobes": 5,
|
||||||
|
"maximum_nprobes": 0,
|
||||||
|
"version": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
||||||
|
|
||||||
|
with query_test_table(handler) as table:
|
||||||
|
(
|
||||||
|
table.search([1, 2, 3], vector_column_name="vector2", fast_search=True)
|
||||||
|
.minimum_nprobes(5)
|
||||||
|
.maximum_nprobes(0)
|
||||||
|
.to_list()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("server_version", [Version("0.1.0"), Version("0.2.0")])
|
@pytest.mark.parametrize("server_version", [Version("0.1.0"), Version("0.2.0")])
|
||||||
def test_query_sync_batch_queries(server_version):
|
def test_query_sync_batch_queries(server_version):
|
||||||
def handler(body):
|
def handler(body):
|
||||||
@@ -666,6 +730,8 @@ def test_query_sync_hybrid():
|
|||||||
"refine_factor": None,
|
"refine_factor": None,
|
||||||
"vector": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
"vector": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
||||||
"nprobes": 20,
|
"nprobes": 20,
|
||||||
|
"minimum_nprobes": 20,
|
||||||
|
"maximum_nprobes": 20,
|
||||||
"lower_bound": None,
|
"lower_bound": None,
|
||||||
"upper_bound": None,
|
"upper_bound": None,
|
||||||
"ef": None,
|
"ef": None,
|
||||||
|
|||||||
@@ -245,7 +245,7 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
|
|||||||
NotImplementedError,
|
NotImplementedError,
|
||||||
match="Full-text search is only supported on the local filesystem",
|
match="Full-text search is only supported on the local filesystem",
|
||||||
):
|
):
|
||||||
table.create_fts_index("x")
|
table.create_fts_index("x", use_tantivy=True)
|
||||||
|
|
||||||
# make sure list tables still works
|
# make sure list tables still works
|
||||||
assert db.table_names() == ["test_ddb_sync"]
|
assert db.table_names() == ["test_ddb_sync"]
|
||||||
|
|||||||
@@ -9,15 +9,16 @@ use arrow::array::Array;
|
|||||||
use arrow::array::ArrayData;
|
use arrow::array::ArrayData;
|
||||||
use arrow::pyarrow::FromPyArrow;
|
use arrow::pyarrow::FromPyArrow;
|
||||||
use arrow::pyarrow::IntoPyArrow;
|
use arrow::pyarrow::IntoPyArrow;
|
||||||
use lancedb::index::scalar::{FtsQuery, FullTextSearchQuery, MatchQuery, PhraseQuery};
|
use lancedb::index::scalar::{
|
||||||
|
BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
|
||||||
|
Operator, PhraseQuery,
|
||||||
|
};
|
||||||
use lancedb::query::QueryExecutionOptions;
|
use lancedb::query::QueryExecutionOptions;
|
||||||
use lancedb::query::QueryFilter;
|
use lancedb::query::QueryFilter;
|
||||||
use lancedb::query::{
|
use lancedb::query::{
|
||||||
ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
|
ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
|
||||||
};
|
};
|
||||||
use lancedb::table::AnyQuery;
|
use lancedb::table::AnyQuery;
|
||||||
use pyo3::exceptions::PyRuntimeError;
|
|
||||||
use pyo3::exceptions::{PyNotImplementedError, PyValueError};
|
|
||||||
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
|
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
|
||||||
use pyo3::pymethods;
|
use pyo3::pymethods;
|
||||||
use pyo3::types::PyList;
|
use pyo3::types::PyList;
|
||||||
@@ -27,30 +28,173 @@ use pyo3::IntoPyObject;
|
|||||||
use pyo3::PyAny;
|
use pyo3::PyAny;
|
||||||
use pyo3::PyRef;
|
use pyo3::PyRef;
|
||||||
use pyo3::PyResult;
|
use pyo3::PyResult;
|
||||||
|
use pyo3::{exceptions::PyRuntimeError, FromPyObject};
|
||||||
|
use pyo3::{
|
||||||
|
exceptions::{PyNotImplementedError, PyValueError},
|
||||||
|
intern,
|
||||||
|
};
|
||||||
use pyo3::{pyclass, PyErr};
|
use pyo3::{pyclass, PyErr};
|
||||||
use pyo3_async_runtimes::tokio::future_into_py;
|
use pyo3_async_runtimes::tokio::future_into_py;
|
||||||
|
|
||||||
use crate::arrow::RecordBatchStream;
|
use crate::util::parse_distance_type;
|
||||||
use crate::error::PythonErrorExt;
|
use crate::{arrow::RecordBatchStream, util::PyLanceDB};
|
||||||
use crate::util::{parse_distance_type, parse_fts_query};
|
use crate::{error::PythonErrorExt, index::class_name};
|
||||||
|
|
||||||
// Python representation of full text search parameters
|
impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
|
||||||
#[derive(Clone)]
|
fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
|
||||||
#[pyclass(get_all)]
|
match class_name(ob)?.as_str() {
|
||||||
pub struct PyFullTextSearchQuery {
|
"MatchQuery" => {
|
||||||
pub columns: Vec<String>,
|
let query = ob.getattr("query")?.extract()?;
|
||||||
pub query: String,
|
let column = ob.getattr("column")?.extract()?;
|
||||||
pub limit: Option<i64>,
|
let boost = ob.getattr("boost")?.extract()?;
|
||||||
pub wand_factor: Option<f32>,
|
let fuzziness = ob.getattr("fuzziness")?.extract()?;
|
||||||
|
let max_expansions = ob.getattr("max_expansions")?.extract()?;
|
||||||
|
let operator = ob.getattr("operator")?.extract::<String>()?;
|
||||||
|
let prefix_length = ob.getattr("prefix_length")?.extract()?;
|
||||||
|
|
||||||
|
Ok(PyLanceDB(
|
||||||
|
MatchQuery::new(query)
|
||||||
|
.with_column(Some(column))
|
||||||
|
.with_boost(boost)
|
||||||
|
.with_fuzziness(fuzziness)
|
||||||
|
.with_max_expansions(max_expansions)
|
||||||
|
.with_operator(Operator::try_from(operator.as_str()).map_err(|e| {
|
||||||
|
PyValueError::new_err(format!("Invalid operator: {}", e))
|
||||||
|
})?)
|
||||||
|
.with_prefix_length(prefix_length)
|
||||||
|
.into(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
"PhraseQuery" => {
|
||||||
|
let query = ob.getattr("query")?.extract()?;
|
||||||
|
let column = ob.getattr("column")?.extract()?;
|
||||||
|
let slop = ob.getattr("slop")?.extract()?;
|
||||||
|
|
||||||
|
Ok(PyLanceDB(
|
||||||
|
PhraseQuery::new(query)
|
||||||
|
.with_column(Some(column))
|
||||||
|
.with_slop(slop)
|
||||||
|
.into(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
"BoostQuery" => {
|
||||||
|
let positive: PyLanceDB<FtsQuery> = ob.getattr("positive")?.extract()?;
|
||||||
|
let negative: PyLanceDB<FtsQuery> = ob.getattr("negative")?.extract()?;
|
||||||
|
let negative_boost = ob.getattr("negative_boost")?.extract()?;
|
||||||
|
Ok(PyLanceDB(
|
||||||
|
BoostQuery::new(positive.0, negative.0, negative_boost).into(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
"MultiMatchQuery" => {
|
||||||
|
let query = ob.getattr("query")?.extract()?;
|
||||||
|
let columns = ob.getattr("columns")?.extract()?;
|
||||||
|
let boosts: Option<Vec<f32>> = ob.getattr("boosts")?.extract()?;
|
||||||
|
let operator: String = ob.getattr("operator")?.extract()?;
|
||||||
|
|
||||||
|
let q = MultiMatchQuery::try_new(query, columns)
|
||||||
|
.map_err(|e| PyValueError::new_err(format!("Invalid query: {}", e)))?;
|
||||||
|
let q = if let Some(boosts) = boosts {
|
||||||
|
q.try_with_boosts(boosts)
|
||||||
|
.map_err(|e| PyValueError::new_err(format!("Invalid boosts: {}", e)))?
|
||||||
|
} else {
|
||||||
|
q
|
||||||
|
};
|
||||||
|
|
||||||
|
let op = Operator::try_from(operator.as_str())
|
||||||
|
.map_err(|e| PyValueError::new_err(format!("Invalid operator: {}", e)))?;
|
||||||
|
|
||||||
|
Ok(PyLanceDB(q.with_operator(op).into()))
|
||||||
|
}
|
||||||
|
"BooleanQuery" => {
|
||||||
|
let queries: Vec<(String, PyLanceDB<FtsQuery>)> =
|
||||||
|
ob.getattr("queries")?.extract()?;
|
||||||
|
let mut sub_queries = Vec::with_capacity(queries.len());
|
||||||
|
for (occur, q) in queries {
|
||||||
|
let occur = Occur::try_from(occur.as_str())
|
||||||
|
.map_err(|e| PyValueError::new_err(e.to_string()))?;
|
||||||
|
sub_queries.push((occur, q.0));
|
||||||
|
}
|
||||||
|
Ok(PyLanceDB(BooleanQuery::new(sub_queries).into()))
|
||||||
|
}
|
||||||
|
name => Err(PyValueError::new_err(format!(
|
||||||
|
"Unsupported FTS query type: {}",
|
||||||
|
name
|
||||||
|
))),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<FullTextSearchQuery> for PyFullTextSearchQuery {
|
impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
|
||||||
fn from(query: FullTextSearchQuery) -> Self {
|
type Target = PyAny;
|
||||||
Self {
|
type Output = Bound<'py, Self::Target>;
|
||||||
columns: query.columns().into_iter().collect(),
|
type Error = PyErr;
|
||||||
query: query.query.query().to_owned(),
|
|
||||||
limit: query.limit,
|
fn into_pyobject(self, py: pyo3::Python<'py>) -> PyResult<Self::Output> {
|
||||||
wand_factor: query.wand_factor,
|
let namespace = py
|
||||||
|
.import(intern!(py, "lancedb"))
|
||||||
|
.and_then(|m| m.getattr(intern!(py, "query")))
|
||||||
|
.expect("Failed to import namespace");
|
||||||
|
|
||||||
|
match self.0 {
|
||||||
|
FtsQuery::Match(query) => {
|
||||||
|
let kwargs = PyDict::new(py);
|
||||||
|
kwargs.set_item("boost", query.boost)?;
|
||||||
|
kwargs.set_item("fuzziness", query.fuzziness)?;
|
||||||
|
kwargs.set_item("max_expansions", query.max_expansions)?;
|
||||||
|
kwargs.set_item::<_, &str>("operator", query.operator.into())?;
|
||||||
|
kwargs.set_item("prefix_length", query.prefix_length)?;
|
||||||
|
namespace
|
||||||
|
.getattr(intern!(py, "MatchQuery"))?
|
||||||
|
.call((query.terms, query.column.unwrap()), Some(&kwargs))
|
||||||
|
}
|
||||||
|
FtsQuery::Phrase(query) => {
|
||||||
|
let kwargs = PyDict::new(py);
|
||||||
|
kwargs.set_item("slop", query.slop)?;
|
||||||
|
namespace
|
||||||
|
.getattr(intern!(py, "PhraseQuery"))?
|
||||||
|
.call((query.terms, query.column.unwrap()), Some(&kwargs))
|
||||||
|
}
|
||||||
|
FtsQuery::Boost(query) => {
|
||||||
|
let positive = PyLanceDB(query.positive.as_ref().clone()).into_pyobject(py)?;
|
||||||
|
let negative = PyLanceDB(query.negative.as_ref().clone()).into_pyobject(py)?;
|
||||||
|
let kwargs = PyDict::new(py);
|
||||||
|
kwargs.set_item("negative_boost", query.negative_boost)?;
|
||||||
|
namespace
|
||||||
|
.getattr(intern!(py, "BoostQuery"))?
|
||||||
|
.call((positive, negative), Some(&kwargs))
|
||||||
|
}
|
||||||
|
FtsQuery::MultiMatch(query) => {
|
||||||
|
let first = &query.match_queries[0];
|
||||||
|
let (columns, boosts): (Vec<_>, Vec<_>) = query
|
||||||
|
.match_queries
|
||||||
|
.iter()
|
||||||
|
.map(|q| (q.column.as_ref().unwrap().clone(), q.boost))
|
||||||
|
.unzip();
|
||||||
|
let kwargs = PyDict::new(py);
|
||||||
|
kwargs.set_item("boosts", boosts)?;
|
||||||
|
kwargs.set_item::<_, &str>("operator", first.operator.into())?;
|
||||||
|
namespace
|
||||||
|
.getattr(intern!(py, "MultiMatchQuery"))?
|
||||||
|
.call((first.terms.clone(), columns), Some(&kwargs))
|
||||||
|
}
|
||||||
|
FtsQuery::Boolean(query) => {
|
||||||
|
let mut queries: Vec<(&str, Bound<'py, PyAny>)> = Vec::with_capacity(
|
||||||
|
query.should.len() + query.must.len() + query.must_not.len(),
|
||||||
|
);
|
||||||
|
for q in query.should {
|
||||||
|
queries.push((Occur::Should.into(), PyLanceDB(q).into_pyobject(py)?));
|
||||||
|
}
|
||||||
|
for q in query.must {
|
||||||
|
queries.push((Occur::Must.into(), PyLanceDB(q).into_pyobject(py)?));
|
||||||
|
}
|
||||||
|
for q in query.must_not {
|
||||||
|
queries.push((Occur::MustNot.into(), PyLanceDB(q).into_pyobject(py)?));
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace
|
||||||
|
.getattr(intern!(py, "BooleanQuery"))?
|
||||||
|
.call1((queries,))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -80,13 +224,16 @@ pub struct PyQueryRequest {
|
|||||||
pub limit: Option<usize>,
|
pub limit: Option<usize>,
|
||||||
pub offset: Option<usize>,
|
pub offset: Option<usize>,
|
||||||
pub filter: Option<PyQueryFilter>,
|
pub filter: Option<PyQueryFilter>,
|
||||||
pub full_text_search: Option<PyFullTextSearchQuery>,
|
pub full_text_search: Option<PyLanceDB<FtsQuery>>,
|
||||||
pub select: PySelect,
|
pub select: PySelect,
|
||||||
pub fast_search: Option<bool>,
|
pub fast_search: Option<bool>,
|
||||||
pub with_row_id: Option<bool>,
|
pub with_row_id: Option<bool>,
|
||||||
pub column: Option<String>,
|
pub column: Option<String>,
|
||||||
pub query_vector: Option<PyQueryVectors>,
|
pub query_vector: Option<PyQueryVectors>,
|
||||||
pub nprobes: Option<usize>,
|
pub minimum_nprobes: Option<usize>,
|
||||||
|
// None means user did not set it and default shoud be used (currenty 20)
|
||||||
|
// Some(0) means user set it to None and there is no limit
|
||||||
|
pub maximum_nprobes: Option<usize>,
|
||||||
pub lower_bound: Option<f32>,
|
pub lower_bound: Option<f32>,
|
||||||
pub upper_bound: Option<f32>,
|
pub upper_bound: Option<f32>,
|
||||||
pub ef: Option<usize>,
|
pub ef: Option<usize>,
|
||||||
@@ -106,13 +253,14 @@ impl From<AnyQuery> for PyQueryRequest {
|
|||||||
filter: query_request.filter.map(PyQueryFilter),
|
filter: query_request.filter.map(PyQueryFilter),
|
||||||
full_text_search: query_request
|
full_text_search: query_request
|
||||||
.full_text_search
|
.full_text_search
|
||||||
.map(PyFullTextSearchQuery::from),
|
.map(|fts| PyLanceDB(fts.query)),
|
||||||
select: PySelect(query_request.select),
|
select: PySelect(query_request.select),
|
||||||
fast_search: Some(query_request.fast_search),
|
fast_search: Some(query_request.fast_search),
|
||||||
with_row_id: Some(query_request.with_row_id),
|
with_row_id: Some(query_request.with_row_id),
|
||||||
column: None,
|
column: None,
|
||||||
query_vector: None,
|
query_vector: None,
|
||||||
nprobes: None,
|
minimum_nprobes: None,
|
||||||
|
maximum_nprobes: None,
|
||||||
lower_bound: None,
|
lower_bound: None,
|
||||||
upper_bound: None,
|
upper_bound: None,
|
||||||
ef: None,
|
ef: None,
|
||||||
@@ -132,7 +280,11 @@ impl From<AnyQuery> for PyQueryRequest {
|
|||||||
with_row_id: Some(vector_query.base.with_row_id),
|
with_row_id: Some(vector_query.base.with_row_id),
|
||||||
column: vector_query.column,
|
column: vector_query.column,
|
||||||
query_vector: Some(PyQueryVectors(vector_query.query_vector)),
|
query_vector: Some(PyQueryVectors(vector_query.query_vector)),
|
||||||
nprobes: Some(vector_query.nprobes),
|
minimum_nprobes: Some(vector_query.minimum_nprobes),
|
||||||
|
maximum_nprobes: match vector_query.maximum_nprobes {
|
||||||
|
None => Some(0),
|
||||||
|
Some(value) => Some(value),
|
||||||
|
},
|
||||||
lower_bound: vector_query.lower_bound,
|
lower_bound: vector_query.lower_bound,
|
||||||
upper_bound: vector_query.upper_bound,
|
upper_bound: vector_query.upper_bound,
|
||||||
ef: vector_query.ef,
|
ef: vector_query.ef,
|
||||||
@@ -269,8 +421,8 @@ impl Query {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
let mut query = FullTextSearchQuery::new_query(query);
|
let mut query = FullTextSearchQuery::new_query(query);
|
||||||
if let Some(cols) = columns {
|
match columns {
|
||||||
if !cols.is_empty() {
|
Some(cols) if !cols.is_empty() => {
|
||||||
query = query.with_columns(&cols).map_err(|e| {
|
query = query.with_columns(&cols).map_err(|e| {
|
||||||
PyValueError::new_err(format!(
|
PyValueError::new_err(format!(
|
||||||
"Failed to set full text search columns: {}",
|
"Failed to set full text search columns: {}",
|
||||||
@@ -278,15 +430,12 @@ impl Query {
|
|||||||
))
|
))
|
||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
_ => {}
|
||||||
}
|
}
|
||||||
query
|
query
|
||||||
} else if let Ok(query) = fts_query.downcast::<PyDict>() {
|
|
||||||
let query = parse_fts_query(query)?;
|
|
||||||
FullTextSearchQuery::new_query(query)
|
|
||||||
} else {
|
} else {
|
||||||
return Err(PyValueError::new_err(
|
let query = fts_query.extract::<PyLanceDB<FtsQuery>>()?;
|
||||||
"query must be a string or a Query object",
|
FullTextSearchQuery::new_query(query.0)
|
||||||
));
|
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(FTSQuery {
|
Ok(FTSQuery {
|
||||||
@@ -509,6 +658,29 @@ impl VectorQuery {
|
|||||||
self.inner = self.inner.clone().nprobes(nprobe as usize);
|
self.inner = self.inner.clone().nprobes(nprobe as usize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn minimum_nprobes(&mut self, minimum_nprobes: u32) -> PyResult<()> {
|
||||||
|
self.inner = self
|
||||||
|
.inner
|
||||||
|
.clone()
|
||||||
|
.minimum_nprobes(minimum_nprobes as usize)
|
||||||
|
.infer_error()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn maximum_nprobes(&mut self, maximum_nprobes: u32) -> PyResult<()> {
|
||||||
|
let maximum_nprobes = if maximum_nprobes == 0 {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(maximum_nprobes as usize)
|
||||||
|
};
|
||||||
|
self.inner = self
|
||||||
|
.inner
|
||||||
|
.clone()
|
||||||
|
.maximum_nprobes(maximum_nprobes)
|
||||||
|
.infer_error()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (lower_bound=None, upper_bound=None))]
|
#[pyo3(signature = (lower_bound=None, upper_bound=None))]
|
||||||
pub fn distance_range(&mut self, lower_bound: Option<f32>, upper_bound: Option<f32>) {
|
pub fn distance_range(&mut self, lower_bound: Option<f32>, upper_bound: Option<f32>) {
|
||||||
self.inner = self.inner.clone().distance_range(lower_bound, upper_bound);
|
self.inner = self.inner.clone().distance_range(lower_bound, upper_bound);
|
||||||
|
|||||||
@@ -3,15 +3,11 @@
|
|||||||
|
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
|
|
||||||
use lancedb::index::scalar::{BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, PhraseQuery};
|
|
||||||
use lancedb::DistanceType;
|
use lancedb::DistanceType;
|
||||||
use pyo3::prelude::{PyAnyMethods, PyDictMethods, PyListMethods};
|
|
||||||
use pyo3::types::PyDict;
|
|
||||||
use pyo3::{
|
use pyo3::{
|
||||||
exceptions::{PyRuntimeError, PyValueError},
|
exceptions::{PyRuntimeError, PyValueError},
|
||||||
pyfunction, PyResult,
|
pyfunction, PyResult,
|
||||||
};
|
};
|
||||||
use pyo3::{Bound, PyAny};
|
|
||||||
|
|
||||||
/// A wrapper around a rust builder
|
/// A wrapper around a rust builder
|
||||||
///
|
///
|
||||||
@@ -64,116 +60,6 @@ pub fn validate_table_name(table_name: &str) -> PyResult<()> {
|
|||||||
.map_err(|e| PyValueError::new_err(e.to_string()))
|
.map_err(|e| PyValueError::new_err(e.to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_fts_query(query: &Bound<'_, PyDict>) -> PyResult<FtsQuery> {
|
/// A wrapper around a LanceDB type to allow it to be used in Python
|
||||||
let query_type = query.keys().get_item(0)?.extract::<String>()?;
|
#[derive(Debug, Clone)]
|
||||||
let query_value = query
|
pub struct PyLanceDB<T>(pub T);
|
||||||
.get_item(&query_type)?
|
|
||||||
.ok_or(PyValueError::new_err(format!(
|
|
||||||
"Query type {} not found",
|
|
||||||
query_type
|
|
||||||
)))?;
|
|
||||||
let query_value = query_value.downcast::<PyDict>()?;
|
|
||||||
|
|
||||||
match query_type.as_str() {
|
|
||||||
"match" => {
|
|
||||||
let column = query_value.keys().get_item(0)?.extract::<String>()?;
|
|
||||||
let params = query_value
|
|
||||||
.get_item(&column)?
|
|
||||||
.ok_or(PyValueError::new_err(format!(
|
|
||||||
"column {} not found",
|
|
||||||
column
|
|
||||||
)))?;
|
|
||||||
let params = params.downcast::<PyDict>()?;
|
|
||||||
|
|
||||||
let query = params
|
|
||||||
.get_item("query")?
|
|
||||||
.ok_or(PyValueError::new_err("query not found"))?
|
|
||||||
.extract::<String>()?;
|
|
||||||
let boost = params
|
|
||||||
.get_item("boost")?
|
|
||||||
.ok_or(PyValueError::new_err("boost not found"))?
|
|
||||||
.extract::<f32>()?;
|
|
||||||
let fuzziness = params
|
|
||||||
.get_item("fuzziness")?
|
|
||||||
.ok_or(PyValueError::new_err("fuzziness not found"))?
|
|
||||||
.extract::<Option<u32>>()?;
|
|
||||||
let max_expansions = params
|
|
||||||
.get_item("max_expansions")?
|
|
||||||
.ok_or(PyValueError::new_err("max_expansions not found"))?
|
|
||||||
.extract::<usize>()?;
|
|
||||||
|
|
||||||
let query = MatchQuery::new(query)
|
|
||||||
.with_column(Some(column))
|
|
||||||
.with_boost(boost)
|
|
||||||
.with_fuzziness(fuzziness)
|
|
||||||
.with_max_expansions(max_expansions);
|
|
||||||
Ok(query.into())
|
|
||||||
}
|
|
||||||
|
|
||||||
"match_phrase" => {
|
|
||||||
let column = query_value.keys().get_item(0)?.extract::<String>()?;
|
|
||||||
let query = query_value
|
|
||||||
.get_item(&column)?
|
|
||||||
.ok_or(PyValueError::new_err(format!(
|
|
||||||
"column {} not found",
|
|
||||||
column
|
|
||||||
)))?
|
|
||||||
.extract::<String>()?;
|
|
||||||
|
|
||||||
let query = PhraseQuery::new(query).with_column(Some(column));
|
|
||||||
Ok(query.into())
|
|
||||||
}
|
|
||||||
|
|
||||||
"boost" => {
|
|
||||||
let positive: Bound<'_, PyAny> = query_value
|
|
||||||
.get_item("positive")?
|
|
||||||
.ok_or(PyValueError::new_err("positive not found"))?;
|
|
||||||
let positive = positive.downcast::<PyDict>()?;
|
|
||||||
|
|
||||||
let negative = query_value
|
|
||||||
.get_item("negative")?
|
|
||||||
.ok_or(PyValueError::new_err("negative not found"))?;
|
|
||||||
let negative = negative.downcast::<PyDict>()?;
|
|
||||||
|
|
||||||
let negative_boost = query_value
|
|
||||||
.get_item("negative_boost")?
|
|
||||||
.ok_or(PyValueError::new_err("negative_boost not found"))?
|
|
||||||
.extract::<f32>()?;
|
|
||||||
|
|
||||||
let positive_query = parse_fts_query(positive)?;
|
|
||||||
let negative_query = parse_fts_query(negative)?;
|
|
||||||
let query = BoostQuery::new(positive_query, negative_query, Some(negative_boost));
|
|
||||||
|
|
||||||
Ok(query.into())
|
|
||||||
}
|
|
||||||
|
|
||||||
"multi_match" => {
|
|
||||||
let query = query_value
|
|
||||||
.get_item("query")?
|
|
||||||
.ok_or(PyValueError::new_err("query not found"))?
|
|
||||||
.extract::<String>()?;
|
|
||||||
|
|
||||||
let columns = query_value
|
|
||||||
.get_item("columns")?
|
|
||||||
.ok_or(PyValueError::new_err("columns not found"))?
|
|
||||||
.extract::<Vec<String>>()?;
|
|
||||||
|
|
||||||
let boost = query_value
|
|
||||||
.get_item("boost")?
|
|
||||||
.ok_or(PyValueError::new_err("boost not found"))?
|
|
||||||
.extract::<Vec<f32>>()?;
|
|
||||||
|
|
||||||
let query = MultiMatchQuery::try_new(query, columns)
|
|
||||||
.and_then(|q| q.try_with_boosts(boost))
|
|
||||||
.map_err(|e| {
|
|
||||||
PyValueError::new_err(format!("Error creating MultiMatchQuery: {}", e))
|
|
||||||
})?;
|
|
||||||
Ok(query.into())
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => Err(PyValueError::new_err(format!(
|
|
||||||
"Unsupported query type: {}",
|
|
||||||
query_type
|
|
||||||
))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-node"
|
name = "lancedb-node"
|
||||||
version = "0.20.0-beta.1"
|
version = "0.20.1-beta.2"
|
||||||
description = "Serverless, low-latency vector database for AI applications"
|
description = "Serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.20.0-beta.1"
|
version = "0.20.1-beta.2"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -796,8 +796,10 @@ pub struct VectorQueryRequest {
|
|||||||
pub column: Option<String>,
|
pub column: Option<String>,
|
||||||
/// The vector(s) to search for
|
/// The vector(s) to search for
|
||||||
pub query_vector: Vec<Arc<dyn Array>>,
|
pub query_vector: Vec<Arc<dyn Array>>,
|
||||||
/// The number of partitions to search
|
/// The minimum number of partitions to search
|
||||||
pub nprobes: usize,
|
pub minimum_nprobes: usize,
|
||||||
|
/// The maximum number of partitions to search
|
||||||
|
pub maximum_nprobes: Option<usize>,
|
||||||
/// The lower bound (inclusive) of the distance to search for.
|
/// The lower bound (inclusive) of the distance to search for.
|
||||||
pub lower_bound: Option<f32>,
|
pub lower_bound: Option<f32>,
|
||||||
/// The upper bound (exclusive) of the distance to search for.
|
/// The upper bound (exclusive) of the distance to search for.
|
||||||
@@ -819,7 +821,8 @@ impl Default for VectorQueryRequest {
|
|||||||
base: QueryRequest::default(),
|
base: QueryRequest::default(),
|
||||||
column: None,
|
column: None,
|
||||||
query_vector: Vec::new(),
|
query_vector: Vec::new(),
|
||||||
nprobes: 20,
|
minimum_nprobes: 20,
|
||||||
|
maximum_nprobes: Some(20),
|
||||||
lower_bound: None,
|
lower_bound: None,
|
||||||
upper_bound: None,
|
upper_bound: None,
|
||||||
ef: None,
|
ef: None,
|
||||||
@@ -925,11 +928,75 @@ impl VectorQuery {
|
|||||||
/// For best results we recommend tuning this parameter with a benchmark against
|
/// For best results we recommend tuning this parameter with a benchmark against
|
||||||
/// your actual data to find the smallest possible value that will still give
|
/// your actual data to find the smallest possible value that will still give
|
||||||
/// you the desired recall.
|
/// you the desired recall.
|
||||||
|
///
|
||||||
|
/// This method sets both the minimum and maximum number of partitions to search.
|
||||||
|
/// For more fine-grained control see [`VectorQuery::minimum_nprobes`] and
|
||||||
|
/// [`VectorQuery::maximum_nprobes`].
|
||||||
pub fn nprobes(mut self, nprobes: usize) -> Self {
|
pub fn nprobes(mut self, nprobes: usize) -> Self {
|
||||||
self.request.nprobes = nprobes;
|
self.request.minimum_nprobes = nprobes;
|
||||||
|
self.request.maximum_nprobes = Some(nprobes);
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Set the minimum number of partitions to search
|
||||||
|
///
|
||||||
|
/// This argument is only used when the vector column has an IVF PQ index.
|
||||||
|
/// If there is no index then this value is ignored.
|
||||||
|
///
|
||||||
|
/// See [`VectorQuery::nprobes`] for more details.
|
||||||
|
///
|
||||||
|
/// These partitions will be searched on every indexed vector query.
|
||||||
|
///
|
||||||
|
/// Will return an error if the value is not greater than 0 or if maximum_nprobes
|
||||||
|
/// has been set and is less than the minimum_nprobes.
|
||||||
|
pub fn minimum_nprobes(mut self, minimum_nprobes: usize) -> Result<Self> {
|
||||||
|
if minimum_nprobes == 0 {
|
||||||
|
return Err(Error::InvalidInput {
|
||||||
|
message: "minimum_nprobes must be greater than 0".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if let Some(maximum_nprobes) = self.request.maximum_nprobes {
|
||||||
|
if minimum_nprobes > maximum_nprobes {
|
||||||
|
return Err(Error::InvalidInput {
|
||||||
|
message: "minimum_nprobes must be less or equal to maximum_nprobes".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.request.minimum_nprobes = minimum_nprobes;
|
||||||
|
Ok(self)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the maximum number of partitions to search
|
||||||
|
///
|
||||||
|
/// This argument is only used when the vector column has an IVF PQ index.
|
||||||
|
/// If there is no index then this value is ignored.
|
||||||
|
///
|
||||||
|
/// See [`VectorQuery::nprobes`] for more details.
|
||||||
|
///
|
||||||
|
/// If this value is greater than minimum_nprobes then the excess partitions will
|
||||||
|
/// only be searched if the initial search does not return enough results.
|
||||||
|
///
|
||||||
|
/// This can be useful when there is a narrow filter to allow these queries to
|
||||||
|
/// spend more time searching and avoid potential false negatives.
|
||||||
|
///
|
||||||
|
/// Set to None to search all partitions, if needed, to satisfy the limit
|
||||||
|
pub fn maximum_nprobes(mut self, maximum_nprobes: Option<usize>) -> Result<Self> {
|
||||||
|
if let Some(maximum_nprobes) = maximum_nprobes {
|
||||||
|
if maximum_nprobes == 0 {
|
||||||
|
return Err(Error::InvalidInput {
|
||||||
|
message: "maximum_nprobes must be greater than 0".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if maximum_nprobes < self.request.minimum_nprobes {
|
||||||
|
return Err(Error::InvalidInput {
|
||||||
|
message: "maximum_nprobes must be greater than minimum_nprobes".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.request.maximum_nprobes = maximum_nprobes;
|
||||||
|
Ok(self)
|
||||||
|
}
|
||||||
|
|
||||||
/// Set the distance range for vector search,
|
/// Set the distance range for vector search,
|
||||||
/// only rows with distances in the range [lower_bound, upper_bound) will be returned
|
/// only rows with distances in the range [lower_bound, upper_bound) will be returned
|
||||||
pub fn distance_range(mut self, lower_bound: Option<f32>, upper_bound: Option<f32>) -> Self {
|
pub fn distance_range(mut self, lower_bound: Option<f32>, upper_bound: Option<f32>) -> Self {
|
||||||
@@ -1208,7 +1275,8 @@ mod tests {
|
|||||||
);
|
);
|
||||||
assert_eq!(query.request.base.limit.unwrap(), 100);
|
assert_eq!(query.request.base.limit.unwrap(), 100);
|
||||||
assert_eq!(query.request.base.offset.unwrap(), 1);
|
assert_eq!(query.request.base.offset.unwrap(), 1);
|
||||||
assert_eq!(query.request.nprobes, 1000);
|
assert_eq!(query.request.minimum_nprobes, 1000);
|
||||||
|
assert_eq!(query.request.maximum_nprobes, Some(1000));
|
||||||
assert!(query.request.use_index);
|
assert!(query.request.use_index);
|
||||||
assert_eq!(query.request.distance_type, Some(DistanceType::Cosine));
|
assert_eq!(query.request.distance_type, Some(DistanceType::Cosine));
|
||||||
assert_eq!(query.request.refine_factor, Some(999));
|
assert_eq!(query.request.refine_factor, Some(999));
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
|
|||||||
use lance_datafusion::exec::{execute_plan, OneShotExec};
|
use lance_datafusion::exec::{execute_plan, OneShotExec};
|
||||||
use reqwest::{RequestBuilder, Response};
|
use reqwest::{RequestBuilder, Response};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::Number;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
@@ -438,7 +439,18 @@ impl<S: HttpSend> RemoteTable<S> {
|
|||||||
|
|
||||||
// Apply general parameters, before we dispatch based on number of query vectors.
|
// Apply general parameters, before we dispatch based on number of query vectors.
|
||||||
body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
|
body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
|
||||||
body["nprobes"] = query.nprobes.into();
|
// In 0.23.1 we migrated from `nprobes` to `minimum_nprobes` and `maximum_nprobes`.
|
||||||
|
// Old client / new server: since minimum_nprobes is missing, fall back to nprobes
|
||||||
|
// New client / old server: old server will only see nprobes, make sure to set both
|
||||||
|
// nprobes and minimum_nprobes
|
||||||
|
// New client / new server: since minimum_nprobes is present, server can ignore nprobes
|
||||||
|
body["nprobes"] = query.minimum_nprobes.into();
|
||||||
|
body["minimum_nprobes"] = query.minimum_nprobes.into();
|
||||||
|
if let Some(maximum_nprobes) = query.maximum_nprobes {
|
||||||
|
body["maximum_nprobes"] = maximum_nprobes.into();
|
||||||
|
} else {
|
||||||
|
body["maximum_nprobes"] = serde_json::Value::Number(Number::from_u128(0).unwrap())
|
||||||
|
}
|
||||||
body["lower_bound"] = query.lower_bound.into();
|
body["lower_bound"] = query.lower_bound.into();
|
||||||
body["upper_bound"] = query.upper_bound.into();
|
body["upper_bound"] = query.upper_bound.into();
|
||||||
body["ef"] = query.ef.into();
|
body["ef"] = query.ef.into();
|
||||||
@@ -2075,6 +2087,8 @@ mod tests {
|
|||||||
"prefilter": true,
|
"prefilter": true,
|
||||||
"distance_type": "l2",
|
"distance_type": "l2",
|
||||||
"nprobes": 20,
|
"nprobes": 20,
|
||||||
|
"minimum_nprobes": 20,
|
||||||
|
"maximum_nprobes": 20,
|
||||||
"lower_bound": Option::<f32>::None,
|
"lower_bound": Option::<f32>::None,
|
||||||
"upper_bound": Option::<f32>::None,
|
"upper_bound": Option::<f32>::None,
|
||||||
"k": 10,
|
"k": 10,
|
||||||
@@ -2175,6 +2189,8 @@ mod tests {
|
|||||||
"bypass_vector_index": true,
|
"bypass_vector_index": true,
|
||||||
"columns": ["a", "b"],
|
"columns": ["a", "b"],
|
||||||
"nprobes": 12,
|
"nprobes": 12,
|
||||||
|
"minimum_nprobes": 12,
|
||||||
|
"maximum_nprobes": 12,
|
||||||
"lower_bound": Option::<f32>::None,
|
"lower_bound": Option::<f32>::None,
|
||||||
"upper_bound": Option::<f32>::None,
|
"upper_bound": Option::<f32>::None,
|
||||||
"ef": Option::<usize>::None,
|
"ef": Option::<usize>::None,
|
||||||
@@ -2302,6 +2318,7 @@ mod tests {
|
|||||||
"fuzziness": 0,
|
"fuzziness": 0,
|
||||||
"max_expansions": 50,
|
"max_expansions": 50,
|
||||||
"operator": "Or",
|
"operator": "Or",
|
||||||
|
"prefix_length": 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -2354,12 +2354,15 @@ impl BaseTable for NativeTable {
|
|||||||
query.base.limit.unwrap_or(DEFAULT_TOP_K),
|
query.base.limit.unwrap_or(DEFAULT_TOP_K),
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
scanner.minimum_nprobes(query.minimum_nprobes);
|
||||||
|
if let Some(maximum_nprobes) = query.maximum_nprobes {
|
||||||
|
scanner.maximum_nprobes(maximum_nprobes);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
scanner.limit(
|
scanner.limit(
|
||||||
query.base.limit.map(|limit| limit as i64),
|
query.base.limit.map(|limit| limit as i64),
|
||||||
query.base.offset.map(|offset| offset as i64),
|
query.base.offset.map(|offset| offset as i64),
|
||||||
)?;
|
)?;
|
||||||
scanner.nprobs(query.nprobes);
|
|
||||||
if let Some(ef) = query.ef {
|
if let Some(ef) = query.ef {
|
||||||
scanner.ef(ef);
|
scanner.ef(ef);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user