mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 21:39:57 +00:00
Compare commits
61 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7a15337e03 | ||
|
|
96c66fd087 | ||
|
|
0579303602 | ||
|
|
75edb8756c | ||
|
|
88283110f4 | ||
|
|
b3a637fdeb | ||
|
|
ce24457531 | ||
|
|
087fe6343d | ||
|
|
ab8cbe62dd | ||
|
|
f076bb41f4 | ||
|
|
902fb83d54 | ||
|
|
779118339f | ||
|
|
03b62599d7 | ||
|
|
4c999fb651 | ||
|
|
6d23d32ab5 | ||
|
|
704cec34e1 | ||
|
|
a300a238db | ||
|
|
a41ff1df0a | ||
|
|
77b005d849 | ||
|
|
167fccc427 | ||
|
|
2bffbcefa5 | ||
|
|
905552f993 | ||
|
|
e4898c9313 | ||
|
|
cab36d94b2 | ||
|
|
b64252d4fd | ||
|
|
6fc006072c | ||
|
|
d4bb59b542 | ||
|
|
6b2dd6de51 | ||
|
|
dbccd9e4f1 | ||
|
|
b12ebfed4c | ||
|
|
1dadb2aefa | ||
|
|
eb9784d7f2 | ||
|
|
ba755626cc | ||
|
|
7760799cb8 | ||
|
|
4beb2d2877 | ||
|
|
a00b8595d1 | ||
|
|
9c8314b4fd | ||
|
|
c625b6f2b2 | ||
|
|
bec8fe6547 | ||
|
|
dc1150c011 | ||
|
|
afaefc6264 | ||
|
|
cb70ff8cee | ||
|
|
cbb5a841b1 | ||
|
|
c72f6770fd | ||
|
|
e5a80a5e86 | ||
|
|
8d0a7fad1f | ||
|
|
b80d4d0134 | ||
|
|
9645fe52c2 | ||
|
|
b77314168d | ||
|
|
e08d45e090 | ||
|
|
2e3ddb8382 | ||
|
|
627ca4c810 | ||
|
|
f8dae4ffe9 | ||
|
|
9eb6119468 | ||
|
|
59b57e30ed | ||
|
|
fec8d58f06 | ||
|
|
84ded9d678 | ||
|
|
65696d9713 | ||
|
|
e2f2ea32e4 | ||
|
|
d5f2eca754 | ||
|
|
7fa455a8a5 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.20.0-beta.2"
|
||||
current_version = "0.21.2-beta.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
12
.github/workflows/npm-publish.yml
vendored
12
.github/workflows/npm-publish.yml
vendored
@@ -539,10 +539,20 @@ jobs:
|
||||
# We need to deprecate the old package to avoid confusion.
|
||||
# Each time we publish a new version, it gets undeprecated.
|
||||
run: npm deprecate vectordb "Use @lancedb/lancedb instead."
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
- name: Update package-lock.json
|
||||
run: bash ci/update_lockfiles.sh
|
||||
run: |
|
||||
git config user.name 'Lance Release'
|
||||
git config user.email 'lance-dev@lancedb.com'
|
||||
bash ci/update_lockfiles.sh
|
||||
- name: Push new commit
|
||||
uses: ad-m/github-push-action@master
|
||||
with:
|
||||
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
|
||||
branch: main
|
||||
- name: Notify Slack Action
|
||||
uses: ravsamhq/notify-slack-action@2.3.0
|
||||
if: ${{ always() }}
|
||||
|
||||
994
Cargo.lock
generated
994
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
34
Cargo.toml
34
Cargo.toml
@@ -21,14 +21,16 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }
|
||||
lance-io = "=0.29.0"
|
||||
lance-index = "=0.29.0"
|
||||
lance-linalg = "=0.29.0"
|
||||
lance-table = "=0.29.0"
|
||||
lance-testing = "=0.29.0"
|
||||
lance-datafusion = "=0.29.0"
|
||||
lance-encoding = "=0.29.0"
|
||||
lance = { "version" = "=0.31.2", "features" = [
|
||||
"dynamodb",
|
||||
], "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "55.1", optional = false }
|
||||
arrow-array = "55.1"
|
||||
@@ -39,20 +41,20 @@ arrow-schema = "55.1"
|
||||
arrow-arith = "55.1"
|
||||
arrow-cast = "55.1"
|
||||
async-trait = "0"
|
||||
datafusion = { version = "47.0", default-features = false }
|
||||
datafusion-catalog = "47.0"
|
||||
datafusion-common = { version = "47.0", default-features = false }
|
||||
datafusion-execution = "47.0"
|
||||
datafusion-expr = "47.0"
|
||||
datafusion-physical-plan = "47.0"
|
||||
datafusion = { version = "48.0", default-features = false }
|
||||
datafusion-catalog = "48.0"
|
||||
datafusion-common = { version = "48.0", default-features = false }
|
||||
datafusion-execution = "48.0"
|
||||
datafusion-expr = "48.0"
|
||||
datafusion-physical-plan = "48.0"
|
||||
env_logger = "0.11"
|
||||
half = { "version" = "=2.5.0", default-features = false, features = [
|
||||
half = { "version" = "2.6.0", default-features = false, features = [
|
||||
"num-traits",
|
||||
] }
|
||||
futures = "0"
|
||||
log = "0.4"
|
||||
moka = { version = "0.12", features = ["future"] }
|
||||
object_store = "0.11.0"
|
||||
object_store = "0.12.0"
|
||||
pin-project = "1.0.7"
|
||||
snafu = "0.8"
|
||||
url = "2"
|
||||
|
||||
@@ -47,10 +47,10 @@ def extract_features(line: str) -> list:
|
||||
"""
|
||||
import re
|
||||
|
||||
match = re.search(r'"features"\s*=\s*\[(.*?)\]', line)
|
||||
match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
|
||||
if match:
|
||||
features_str = match.group(1)
|
||||
return [f.strip('"') for f in features_str.split(",")]
|
||||
return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
|
||||
return []
|
||||
|
||||
|
||||
@@ -63,10 +63,24 @@ def update_cargo_toml(line_updater):
|
||||
lines = f.readlines()
|
||||
|
||||
new_lines = []
|
||||
lance_line = ""
|
||||
is_parsing_lance_line = False
|
||||
for line in lines:
|
||||
if line.startswith("lance"):
|
||||
# Update the line using the provided function
|
||||
new_lines.append(line_updater(line))
|
||||
if line.strip().endswith("}"):
|
||||
new_lines.append(line_updater(line))
|
||||
else:
|
||||
lance_line = line
|
||||
is_parsing_lance_line = True
|
||||
elif is_parsing_lance_line:
|
||||
lance_line += line
|
||||
if line.strip().endswith("}"):
|
||||
new_lines.append(line_updater(lance_line))
|
||||
lance_line = ""
|
||||
is_parsing_lance_line = False
|
||||
else:
|
||||
print("doesn't end with }:", line)
|
||||
else:
|
||||
# Keep the line unchanged
|
||||
new_lines.append(line)
|
||||
|
||||
12
docs/package-lock.json
generated
12
docs/package-lock.json
generated
@@ -19,7 +19,7 @@
|
||||
},
|
||||
"../node": {
|
||||
"name": "vectordb",
|
||||
"version": "0.12.0",
|
||||
"version": "0.21.2-beta.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -65,11 +65,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.12.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.12.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.12.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.12.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
# SQL Querying
|
||||
|
||||
You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
|
||||
This guide will show how to query Lance tables them using both.
|
||||
|
||||
We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
|
||||
We will re-use the dataset [created previously](./tables.md):
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
@@ -27,21 +29,17 @@ arrow_table = table.to_lance()
|
||||
duckdb.query("SELECT * FROM arrow_table")
|
||||
```
|
||||
|
||||
```
|
||||
┌─────────────┬─────────┬────────┐
|
||||
│ vector │ item │ price │
|
||||
│ float[] │ varchar │ double │
|
||||
├─────────────┼─────────┼────────┤
|
||||
│ [3.1, 4.1] │ foo │ 10.0 │
|
||||
│ [5.9, 26.5] │ bar │ 20.0 │
|
||||
└─────────────┴─────────┴────────┘
|
||||
```
|
||||
| vector | item | price |
|
||||
| ----------- | ---- | ----- |
|
||||
| [3.1, 4.1] | foo | 10.0 |
|
||||
| [5.9, 26.5] | bar | 20.0 |
|
||||
|
||||
## Querying a LanceDB Table with Apache Datafusion
|
||||
|
||||
Have the required imports before doing any querying.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb"
|
||||
--8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context"
|
||||
@@ -51,16 +49,12 @@ Have the required imports before doing any querying.
|
||||
Register the table created with the Datafusion session context.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
|
||||
```
|
||||
|
||||
```
|
||||
┌─────────────┬─────────┬────────┐
|
||||
│ vector │ item │ price │
|
||||
│ float[] │ varchar │ double │
|
||||
├─────────────┼─────────┼────────┤
|
||||
│ [3.1, 4.1] │ foo │ 10.0 │
|
||||
│ [5.9, 26.5] │ bar │ 20.0 │
|
||||
└─────────────┴─────────┴────────┘
|
||||
```
|
||||
| vector | item | price |
|
||||
| ----------- | ---- | ----- |
|
||||
| [3.1, 4.1] | foo | 10.0 |
|
||||
| [5.9, 26.5] | bar | 20.0 |
|
||||
|
||||
53
docs/src/js/classes/BooleanQuery.md
Normal file
53
docs/src/js/classes/BooleanQuery.md
Normal file
@@ -0,0 +1,53 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / BooleanQuery
|
||||
|
||||
# Class: BooleanQuery
|
||||
|
||||
Represents a full-text query interface.
|
||||
This interface defines the structure and behavior for full-text queries,
|
||||
including methods to retrieve the query type and convert the query to a dictionary format.
|
||||
|
||||
## Implements
|
||||
|
||||
- [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
## Constructors
|
||||
|
||||
### new BooleanQuery()
|
||||
|
||||
```ts
|
||||
new BooleanQuery(queries): BooleanQuery
|
||||
```
|
||||
|
||||
Creates an instance of BooleanQuery.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][]
|
||||
An array of (Occur, FullTextQuery objects) to combine.
|
||||
Occur specifies whether the query must match, or should match.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`BooleanQuery`](BooleanQuery.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### queryType()
|
||||
|
||||
```ts
|
||||
queryType(): FullTextQueryType
|
||||
```
|
||||
|
||||
The type of the full-text query.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
|
||||
@@ -40,6 +40,8 @@ Creates an instance of MatchQuery.
|
||||
- `boost`: The boost factor for the query (default is 1.0).
|
||||
- `fuzziness`: The fuzziness level for the query (default is 0).
|
||||
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
||||
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
- `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
|
||||
|
||||
* **options.boost?**: `number`
|
||||
|
||||
@@ -47,6 +49,10 @@ Creates an instance of MatchQuery.
|
||||
|
||||
* **options.maxExpansions?**: `number`
|
||||
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
|
||||
* **options.prefixLength?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`MatchQuery`](MatchQuery.md)
|
||||
|
||||
@@ -38,9 +38,12 @@ Creates an instance of MultiMatchQuery.
|
||||
* **options?**
|
||||
Optional parameters for the multi-match query.
|
||||
- `boosts`: An array of boost factors for each column (default is 1.0 for all).
|
||||
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
|
||||
* **options.boosts?**: `number`[]
|
||||
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
[`MultiMatchQuery`](MultiMatchQuery.md)
|
||||
|
||||
@@ -19,7 +19,10 @@ including methods to retrieve the query type and convert the query to a dictiona
|
||||
### new PhraseQuery()
|
||||
|
||||
```ts
|
||||
new PhraseQuery(query, column): PhraseQuery
|
||||
new PhraseQuery(
|
||||
query,
|
||||
column,
|
||||
options?): PhraseQuery
|
||||
```
|
||||
|
||||
Creates an instance of `PhraseQuery`.
|
||||
@@ -32,6 +35,12 @@ Creates an instance of `PhraseQuery`.
|
||||
* **column**: `string`
|
||||
The name of the column to search within.
|
||||
|
||||
* **options?**
|
||||
Optional parameters for the phrase query.
|
||||
- `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
|
||||
|
||||
* **options.slop?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PhraseQuery`](PhraseQuery.md)
|
||||
|
||||
@@ -612,7 +612,7 @@ of the given query
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
the query, a vector or string
|
||||
|
||||
* **queryType?**: `string`
|
||||
@@ -799,7 +799,7 @@ by `query`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -386,6 +386,53 @@ called then every valid row from the table will be returned.
|
||||
|
||||
***
|
||||
|
||||
### maximumNprobes()
|
||||
|
||||
```ts
|
||||
maximumNprobes(maximumNprobes): VectorQuery
|
||||
```
|
||||
|
||||
Set the maximum number of probes used.
|
||||
|
||||
This controls the maximum number of partitions that will be searched. If this
|
||||
number is greater than minimumNprobes then the excess partitions will _only_ be
|
||||
searched if we have not found enough results. This can be useful when there is
|
||||
a narrow filter to allow these queries to spend more time searching and avoid
|
||||
potential false negatives.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **maximumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`VectorQuery`](VectorQuery.md)
|
||||
|
||||
***
|
||||
|
||||
### minimumNprobes()
|
||||
|
||||
```ts
|
||||
minimumNprobes(minimumNprobes): VectorQuery
|
||||
```
|
||||
|
||||
Set the minimum number of probes used.
|
||||
|
||||
This controls the minimum number of partitions that will be searched. This
|
||||
parameter will impact every query against a vector index, regardless of the
|
||||
filter. See `nprobes` for more details. Higher values will increase recall
|
||||
but will also increase latency.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **minimumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`VectorQuery`](VectorQuery.md)
|
||||
|
||||
***
|
||||
|
||||
### nprobes()
|
||||
|
||||
```ts
|
||||
@@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against
|
||||
your actual data to find the smallest possible value that will still give
|
||||
you the desired recall.
|
||||
|
||||
For more fine grained control over behavior when you have a very narrow filter
|
||||
you can use `minimumNprobes` and `maximumNprobes`. This method sets both
|
||||
the minimum and maximum to the same value.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **nprobes**: `number`
|
||||
|
||||
@@ -15,6 +15,14 @@ Enum representing the types of full-text queries supported.
|
||||
|
||||
## Enumeration Members
|
||||
|
||||
### Boolean
|
||||
|
||||
```ts
|
||||
Boolean: "boolean";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### Boost
|
||||
|
||||
```ts
|
||||
|
||||
37
docs/src/js/enumerations/Occur.md
Normal file
37
docs/src/js/enumerations/Occur.md
Normal file
@@ -0,0 +1,37 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / Occur
|
||||
|
||||
# Enumeration: Occur
|
||||
|
||||
Enum representing the occurrence of terms in full-text queries.
|
||||
|
||||
- `Must`: The term must be present in the document.
|
||||
- `Should`: The term should contribute to the document score, but is not required.
|
||||
- `MustNot`: The term must not be present in the document.
|
||||
|
||||
## Enumeration Members
|
||||
|
||||
### Must
|
||||
|
||||
```ts
|
||||
Must: "MUST";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### MustNot
|
||||
|
||||
```ts
|
||||
MustNot: "MUST_NOT";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### Should
|
||||
|
||||
```ts
|
||||
Should: "SHOULD";
|
||||
```
|
||||
28
docs/src/js/enumerations/Operator.md
Normal file
28
docs/src/js/enumerations/Operator.md
Normal file
@@ -0,0 +1,28 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / Operator
|
||||
|
||||
# Enumeration: Operator
|
||||
|
||||
Enum representing the logical operators used in full-text queries.
|
||||
|
||||
- `And`: All terms must match.
|
||||
- `Or`: At least one term must match.
|
||||
|
||||
## Enumeration Members
|
||||
|
||||
### And
|
||||
|
||||
```ts
|
||||
And: "AND";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### Or
|
||||
|
||||
```ts
|
||||
Or: "OR";
|
||||
```
|
||||
@@ -12,9 +12,12 @@
|
||||
## Enumerations
|
||||
|
||||
- [FullTextQueryType](enumerations/FullTextQueryType.md)
|
||||
- [Occur](enumerations/Occur.md)
|
||||
- [Operator](enumerations/Operator.md)
|
||||
|
||||
## Classes
|
||||
|
||||
- [BooleanQuery](classes/BooleanQuery.md)
|
||||
- [BoostQuery](classes/BoostQuery.md)
|
||||
- [Connection](classes/Connection.md)
|
||||
- [Index](classes/Index.md)
|
||||
@@ -81,6 +84,7 @@
|
||||
- [FieldLike](type-aliases/FieldLike.md)
|
||||
- [IntoSql](type-aliases/IntoSql.md)
|
||||
- [IntoVector](type-aliases/IntoVector.md)
|
||||
- [MultiVector](type-aliases/MultiVector.md)
|
||||
- [RecordBatchLike](type-aliases/RecordBatchLike.md)
|
||||
- [SchemaLike](type-aliases/SchemaLike.md)
|
||||
- [TableLike](type-aliases/TableLike.md)
|
||||
|
||||
@@ -23,7 +23,7 @@ whether to remove punctuation
|
||||
### baseTokenizer?
|
||||
|
||||
```ts
|
||||
optional baseTokenizer: "raw" | "simple" | "whitespace";
|
||||
optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
|
||||
```
|
||||
|
||||
The tokenizer to use when building the index.
|
||||
@@ -71,6 +71,36 @@ tokens longer than this length will be ignored
|
||||
|
||||
***
|
||||
|
||||
### ngramMaxLength?
|
||||
|
||||
```ts
|
||||
optional ngramMaxLength: number;
|
||||
```
|
||||
|
||||
ngram max length
|
||||
|
||||
***
|
||||
|
||||
### ngramMinLength?
|
||||
|
||||
```ts
|
||||
optional ngramMinLength: number;
|
||||
```
|
||||
|
||||
ngram min length
|
||||
|
||||
***
|
||||
|
||||
### prefixOnly?
|
||||
|
||||
```ts
|
||||
optional prefixOnly: boolean;
|
||||
```
|
||||
|
||||
whether to only index the prefix of the token for ngram tokenizer
|
||||
|
||||
***
|
||||
|
||||
### removeStopWords?
|
||||
|
||||
```ts
|
||||
|
||||
@@ -24,10 +24,10 @@ The default is 7 days
|
||||
// Delete all versions older than 1 day
|
||||
const olderThan = new Date();
|
||||
olderThan.setDate(olderThan.getDate() - 1));
|
||||
tbl.cleanupOlderVersions(olderThan);
|
||||
tbl.optimize({cleanupOlderThan: olderThan});
|
||||
|
||||
// Delete all versions except the current version
|
||||
tbl.cleanupOlderVersions(new Date());
|
||||
tbl.optimize({cleanupOlderThan: new Date()});
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
11
docs/src/js/type-aliases/MultiVector.md
Normal file
11
docs/src/js/type-aliases/MultiVector.md
Normal file
@@ -0,0 +1,11 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / MultiVector
|
||||
|
||||
# Type Alias: MultiVector
|
||||
|
||||
```ts
|
||||
type MultiVector: IntoVector[];
|
||||
```
|
||||
@@ -428,7 +428,7 @@
|
||||
"\n",
|
||||
"**Why?** \n",
|
||||
"Embedding the UFO dataset and ingesting it into LanceDB takes **~2 hours on a T4 GPU**. To save time: \n",
|
||||
"- **Use the pre-prepared table with index created ** (provided below) to proceed directly to step7: search. \n",
|
||||
"- **Use the pre-prepared table with index created** (provided below) to proceed directly to **Step 7**: search. \n",
|
||||
"- **Step 5a** contains the full ingestion code for reference (run it only if necessary). \n",
|
||||
"- **Step 6** contains the details on creating the index on the multivector column"
|
||||
]
|
||||
|
||||
@@ -30,7 +30,8 @@ excluded_globs = [
|
||||
"../src/rag/advanced_techniques/*.md",
|
||||
"../src/guides/scalar_index.md",
|
||||
"../src/guides/storage.md",
|
||||
"../src/search.md"
|
||||
"../src/search.md",
|
||||
"../src/guides/sql_querying.md",
|
||||
]
|
||||
|
||||
python_prefix = "py"
|
||||
|
||||
@@ -7,3 +7,4 @@ tantivy==0.20.1
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch
|
||||
polars>=0.19, <=1.3.0
|
||||
datafusion
|
||||
|
||||
19
java/.mvn/wrapper/maven-wrapper.properties
vendored
Normal file
19
java/.mvn/wrapper/maven-wrapper.properties
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
wrapperVersion=3.3.2
|
||||
distributionType=only-script
|
||||
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.9/apache-maven-3.9.9-bin.zip
|
||||
37
java/README.md
Normal file
37
java/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# LanceDB Java SDK
|
||||
|
||||
## Configuration and Initialization
|
||||
|
||||
### LanceDB Cloud
|
||||
|
||||
For LanceDB Cloud, use the simplified builder API:
|
||||
|
||||
```java
|
||||
import com.lancedb.lance.namespace.LanceRestNamespace;
|
||||
|
||||
// If your DB url is db://example-db, then your database here is example-db
|
||||
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
|
||||
.apiKey("your_lancedb_cloud_api_key")
|
||||
.database("your_database_name")
|
||||
.build();
|
||||
```
|
||||
|
||||
### LanceDB Enterprise
|
||||
|
||||
For Enterprise deployments, use your VPC endpoint:
|
||||
|
||||
```java
|
||||
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
|
||||
.apiKey("your_lancedb_enterprise_api_key")
|
||||
.database("your-top-dir") // Your top level folder under your cloud bucket, e.g. s3://your-bucket/your-top-dir/
|
||||
.hostOverride("http://<vpc_endpoint_dns_name>:80")
|
||||
.build();
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
Build:
|
||||
|
||||
```shell
|
||||
./mvnw install
|
||||
```
|
||||
@@ -8,18 +8,24 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.20.0-beta.2</version>
|
||||
<version>0.21.2-beta.0</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>lancedb-core</artifactId>
|
||||
<name>LanceDB Core</name>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Core</description>
|
||||
<packaging>jar</packaging>
|
||||
<properties>
|
||||
<rust.release.build>false</rust.release.build>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lance-namespace-core</artifactId>
|
||||
<version>0.0.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.arrow</groupId>
|
||||
<artifactId>arrow-vector</artifactId>
|
||||
|
||||
26
java/lance-namespace/pom.xml
Normal file
26
java/lance-namespace/pom.xml
Normal file
@@ -0,0 +1,26 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.21.2-beta.0</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>lancedb-lance-namespace</artifactId>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java Integration with Lance Namespace</description>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lance-namespace-core</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.lancedb.lancedb;
|
||||
|
||||
import com.lancedb.lance.namespace.LanceRestNamespace;
|
||||
import com.lancedb.lance.namespace.client.apache.ApiClient;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
/** Util class to help construct a {@link LanceRestNamespace} for LanceDB. */
|
||||
public class LanceDbRestNamespaces {
|
||||
private static final String DEFAULT_REGION = "us-east-1";
|
||||
private static final String CLOUD_URL_PATTERN = "https://%s.%s.api.lancedb.com";
|
||||
|
||||
private String apiKey;
|
||||
private String database;
|
||||
private Optional<String> hostOverride = Optional.empty();
|
||||
private Optional<String> region = Optional.empty();
|
||||
private Map<String, String> additionalConfig = new HashMap<>();
|
||||
|
||||
private LanceDbRestNamespaces() {}
|
||||
|
||||
/**
|
||||
* Create a new builder instance.
|
||||
*
|
||||
* @return A new LanceRestNamespaceBuilder
|
||||
*/
|
||||
public static LanceDbRestNamespaces builder() {
|
||||
return new LanceDbRestNamespaces();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the API key (required).
|
||||
*
|
||||
* @param apiKey The LanceDB API key
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces apiKey(String apiKey) {
|
||||
if (apiKey == null || apiKey.trim().isEmpty()) {
|
||||
throw new IllegalArgumentException("API key cannot be null or empty");
|
||||
}
|
||||
this.apiKey = apiKey;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the database name (required).
|
||||
*
|
||||
* @param database The database name
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces database(String database) {
|
||||
if (database == null || database.trim().isEmpty()) {
|
||||
throw new IllegalArgumentException("Database cannot be null or empty");
|
||||
}
|
||||
this.database = database;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a custom host override (optional). When set, this overrides the default LanceDB Cloud URL
|
||||
* construction. Use this for LanceDB Enterprise deployments.
|
||||
*
|
||||
* @param hostOverride The complete base URL (e.g., "http://your-vpc-endpoint:80")
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces hostOverride(String hostOverride) {
|
||||
this.hostOverride = Optional.ofNullable(hostOverride);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the region for LanceDB Cloud (optional). Defaults to "us-east-1" if not specified. This is
|
||||
* ignored when hostOverride is set.
|
||||
*
|
||||
* @param region The AWS region (e.g., "us-east-1", "eu-west-1")
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces region(String region) {
|
||||
this.region = Optional.ofNullable(region);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add additional configuration parameters.
|
||||
*
|
||||
* @param key The configuration key
|
||||
* @param value The configuration value
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces config(String key, String value) {
|
||||
this.additionalConfig.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the LanceRestNamespace instance.
|
||||
*
|
||||
* @return A configured LanceRestNamespace
|
||||
* @throws IllegalStateException if required parameters are missing
|
||||
*/
|
||||
public LanceRestNamespace build() {
|
||||
// Validate required fields
|
||||
if (apiKey == null) {
|
||||
throw new IllegalStateException("API key is required");
|
||||
}
|
||||
if (database == null) {
|
||||
throw new IllegalStateException("Database is required");
|
||||
}
|
||||
|
||||
// Build configuration map
|
||||
Map<String, String> config = new HashMap<>(additionalConfig);
|
||||
config.put("headers.x-lancedb-database", database);
|
||||
config.put("headers.x-api-key", apiKey);
|
||||
|
||||
// Determine base URL
|
||||
String baseUrl;
|
||||
if (hostOverride.isPresent()) {
|
||||
baseUrl = hostOverride.get();
|
||||
config.put("host_override", hostOverride.get());
|
||||
} else {
|
||||
String effectiveRegion = region.orElse(DEFAULT_REGION);
|
||||
baseUrl = String.format(CLOUD_URL_PATTERN, database, effectiveRegion);
|
||||
config.put("region", effectiveRegion);
|
||||
}
|
||||
|
||||
// Create and configure ApiClient
|
||||
ApiClient apiClient = new ApiClient();
|
||||
apiClient.setBasePath(baseUrl);
|
||||
|
||||
return new LanceRestNamespace(apiClient, config);
|
||||
}
|
||||
}
|
||||
259
java/mvnw
vendored
Executable file
259
java/mvnw
vendored
Executable file
@@ -0,0 +1,259 @@
|
||||
#!/bin/sh
|
||||
# ----------------------------------------------------------------------------
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Apache Maven Wrapper startup batch script, version 3.3.2
|
||||
#
|
||||
# Optional ENV vars
|
||||
# -----------------
|
||||
# JAVA_HOME - location of a JDK home dir, required when download maven via java source
|
||||
# MVNW_REPOURL - repo url base for downloading maven distribution
|
||||
# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
|
||||
# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
set -euf
|
||||
[ "${MVNW_VERBOSE-}" != debug ] || set -x
|
||||
|
||||
# OS specific support.
|
||||
native_path() { printf %s\\n "$1"; }
|
||||
case "$(uname)" in
|
||||
CYGWIN* | MINGW*)
|
||||
[ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")"
|
||||
native_path() { cygpath --path --windows "$1"; }
|
||||
;;
|
||||
esac
|
||||
|
||||
# set JAVACMD and JAVACCMD
|
||||
set_java_home() {
|
||||
# For Cygwin and MinGW, ensure paths are in Unix format before anything is touched
|
||||
if [ -n "${JAVA_HOME-}" ]; then
|
||||
if [ -x "$JAVA_HOME/jre/sh/java" ]; then
|
||||
# IBM's JDK on AIX uses strange locations for the executables
|
||||
JAVACMD="$JAVA_HOME/jre/sh/java"
|
||||
JAVACCMD="$JAVA_HOME/jre/sh/javac"
|
||||
else
|
||||
JAVACMD="$JAVA_HOME/bin/java"
|
||||
JAVACCMD="$JAVA_HOME/bin/javac"
|
||||
|
||||
if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then
|
||||
echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2
|
||||
echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
else
|
||||
JAVACMD="$(
|
||||
'set' +e
|
||||
'unset' -f command 2>/dev/null
|
||||
'command' -v java
|
||||
)" || :
|
||||
JAVACCMD="$(
|
||||
'set' +e
|
||||
'unset' -f command 2>/dev/null
|
||||
'command' -v javac
|
||||
)" || :
|
||||
|
||||
if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then
|
||||
echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# hash string like Java String::hashCode
|
||||
hash_string() {
|
||||
str="${1:-}" h=0
|
||||
while [ -n "$str" ]; do
|
||||
char="${str%"${str#?}"}"
|
||||
h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296))
|
||||
str="${str#?}"
|
||||
done
|
||||
printf %x\\n $h
|
||||
}
|
||||
|
||||
verbose() { :; }
|
||||
[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; }
|
||||
|
||||
die() {
|
||||
printf %s\\n "$1" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
trim() {
|
||||
# MWRAPPER-139:
|
||||
# Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds.
|
||||
# Needed for removing poorly interpreted newline sequences when running in more
|
||||
# exotic environments such as mingw bash on Windows.
|
||||
printf "%s" "${1}" | tr -d '[:space:]'
|
||||
}
|
||||
|
||||
# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties
|
||||
while IFS="=" read -r key value; do
|
||||
case "${key-}" in
|
||||
distributionUrl) distributionUrl=$(trim "${value-}") ;;
|
||||
distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;;
|
||||
esac
|
||||
done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties"
|
||||
[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties"
|
||||
|
||||
case "${distributionUrl##*/}" in
|
||||
maven-mvnd-*bin.*)
|
||||
MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/
|
||||
case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in
|
||||
*AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;;
|
||||
:Darwin*x86_64) distributionPlatform=darwin-amd64 ;;
|
||||
:Darwin*arm64) distributionPlatform=darwin-aarch64 ;;
|
||||
:Linux*x86_64*) distributionPlatform=linux-amd64 ;;
|
||||
*)
|
||||
echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2
|
||||
distributionPlatform=linux-amd64
|
||||
;;
|
||||
esac
|
||||
distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip"
|
||||
;;
|
||||
maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;;
|
||||
*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;;
|
||||
esac
|
||||
|
||||
# apply MVNW_REPOURL and calculate MAVEN_HOME
|
||||
# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
|
||||
[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}"
|
||||
distributionUrlName="${distributionUrl##*/}"
|
||||
distributionUrlNameMain="${distributionUrlName%.*}"
|
||||
distributionUrlNameMain="${distributionUrlNameMain%-bin}"
|
||||
MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}"
|
||||
MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")"
|
||||
|
||||
exec_maven() {
|
||||
unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || :
|
||||
exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD"
|
||||
}
|
||||
|
||||
if [ -d "$MAVEN_HOME" ]; then
|
||||
verbose "found existing MAVEN_HOME at $MAVEN_HOME"
|
||||
exec_maven "$@"
|
||||
fi
|
||||
|
||||
case "${distributionUrl-}" in
|
||||
*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;;
|
||||
*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;;
|
||||
esac
|
||||
|
||||
# prepare tmp dir
|
||||
if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then
|
||||
clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; }
|
||||
trap clean HUP INT TERM EXIT
|
||||
else
|
||||
die "cannot create temp dir"
|
||||
fi
|
||||
|
||||
mkdir -p -- "${MAVEN_HOME%/*}"
|
||||
|
||||
# Download and Install Apache Maven
|
||||
verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
|
||||
verbose "Downloading from: $distributionUrl"
|
||||
verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"
|
||||
|
||||
# select .zip or .tar.gz
|
||||
if ! command -v unzip >/dev/null; then
|
||||
distributionUrl="${distributionUrl%.zip}.tar.gz"
|
||||
distributionUrlName="${distributionUrl##*/}"
|
||||
fi
|
||||
|
||||
# verbose opt
|
||||
__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR=''
|
||||
[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v
|
||||
|
||||
# normalize http auth
|
||||
case "${MVNW_PASSWORD:+has-password}" in
|
||||
'') MVNW_USERNAME='' MVNW_PASSWORD='' ;;
|
||||
has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;;
|
||||
esac
|
||||
|
||||
if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then
|
||||
verbose "Found wget ... using wget"
|
||||
wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl"
|
||||
elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then
|
||||
verbose "Found curl ... using curl"
|
||||
curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl"
|
||||
elif set_java_home; then
|
||||
verbose "Falling back to use Java to download"
|
||||
javaSource="$TMP_DOWNLOAD_DIR/Downloader.java"
|
||||
targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName"
|
||||
cat >"$javaSource" <<-END
|
||||
public class Downloader extends java.net.Authenticator
|
||||
{
|
||||
protected java.net.PasswordAuthentication getPasswordAuthentication()
|
||||
{
|
||||
return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() );
|
||||
}
|
||||
public static void main( String[] args ) throws Exception
|
||||
{
|
||||
setDefault( new Downloader() );
|
||||
java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() );
|
||||
}
|
||||
}
|
||||
END
|
||||
# For Cygwin/MinGW, switch paths to Windows format before running javac and java
|
||||
verbose " - Compiling Downloader.java ..."
|
||||
"$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java"
|
||||
verbose " - Running Downloader.java ..."
|
||||
"$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")"
|
||||
fi
|
||||
|
||||
# If specified, validate the SHA-256 sum of the Maven distribution zip file
|
||||
if [ -n "${distributionSha256Sum-}" ]; then
|
||||
distributionSha256Result=false
|
||||
if [ "$MVN_CMD" = mvnd.sh ]; then
|
||||
echo "Checksum validation is not supported for maven-mvnd." >&2
|
||||
echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
|
||||
exit 1
|
||||
elif command -v sha256sum >/dev/null; then
|
||||
if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then
|
||||
distributionSha256Result=true
|
||||
fi
|
||||
elif command -v shasum >/dev/null; then
|
||||
if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then
|
||||
distributionSha256Result=true
|
||||
fi
|
||||
else
|
||||
echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
|
||||
echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
|
||||
exit 1
|
||||
fi
|
||||
if [ $distributionSha256Result = false ]; then
|
||||
echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2
|
||||
echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# unzip and move
|
||||
if command -v unzip >/dev/null; then
|
||||
unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip"
|
||||
else
|
||||
tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar"
|
||||
fi
|
||||
printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url"
|
||||
mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME"
|
||||
|
||||
clean || :
|
||||
exec_maven "$@"
|
||||
14
java/pom.xml
14
java/pom.xml
@@ -6,11 +6,10 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.20.0-beta.2</version>
|
||||
<version>0.21.2-beta.0</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>LanceDB Parent</name>
|
||||
<description>LanceDB vector database Java API</description>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
<url>http://lancedb.com/</url>
|
||||
|
||||
<developers>
|
||||
@@ -29,6 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-namespace.verison>0.0.1</lance-namespace.verison>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
@@ -52,6 +52,7 @@
|
||||
|
||||
<modules>
|
||||
<module>core</module>
|
||||
<module>lance-namespace</module>
|
||||
</modules>
|
||||
|
||||
<scm>
|
||||
@@ -62,6 +63,11 @@
|
||||
|
||||
<dependencyManagement>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lance-namespace-core</artifactId>
|
||||
<version>${lance-namespace.verison}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.arrow</groupId>
|
||||
<artifactId>arrow-vector</artifactId>
|
||||
|
||||
49
node/package-lock.json
generated
49
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,11 +52,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-darwin-x64": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.20.0-beta.2"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -327,60 +327,65 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.20.0-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.20.0-beta.2.tgz",
|
||||
"integrity": "sha512-H9PmJ/5KSvstVzR8Q7T22+eHRjJZ2ef3aA3gdFxXvoMi3xQ0MGIxz23HuKHGTRT4tfl1nNnpOPb2W7Na8etK9w==",
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-RiYqpKuq9v8A4wFuHt1iPNFYjWJ1KgGFLJwQO4ajp9Hee84sDHq8mP0ATgMcc24hiaOUQ1lRRTULjGbHn4NIYw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.20.0-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.20.0-beta.2.tgz",
|
||||
"integrity": "sha512-9AQkv4tIys+vg0cplZtSE48o61jd7EnmuMkUht+vLORL5/HAma84eAoU9lXHT7zAtPAQmL+98Bfvcsx7fJ6mVw==",
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-togdP0YIjMYg/hBRMMxW434i5VB789JWU5o3hWrodbX8olEc0Txqw5Dg9CgIOldBIiCti6uTSQiTo6uldZon1w==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.20.0-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.20.0-beta.2.tgz",
|
||||
"integrity": "sha512-eQWoJz2ePml7NyEInTBeakWx56+5c6r2p3F+iHC5tsLuznn6eFX90koXJunRxH1WXHDN48ECUlEmKypgfEmn4w==",
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-ErS4IQDQVTYVATPeOj/dZXQR34eZQ5rAXm3vJdQi5K6X4zCDaIjOhpmnwzPBGT9W1idaBAoDJhtNfsFaJ6/PQQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.20.0-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.20.0-beta.2.tgz",
|
||||
"integrity": "sha512-/+84U+Dt07m8Jk0b8h+SvOzlrynITPP3SDBOlB+OonwmGSxirXhc8gkfNZctgXOJYKMyRIRSsMHP/QNjOp2ajA==",
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-ycDpyBGbfxtnGGa/RQo5+So6dHALiem1pbYc/LDKKluUJpadtXtEwC61o6hZTcejoYjhEE8ET7vA3OCEJfMFaw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.20.0-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.20.0-beta.2.tgz",
|
||||
"integrity": "sha512-bgdunAPnknBh/5oO+vr6RXMr6wb3hHugNPXcIidxYMQvgFa8uhaAKtgYkAKuoyUReOYo8DGtVkZxNUUpZbF7/A==",
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-IgVkAP/LiNIQD5P6n/9x3bgQOt5pGJarjtSF8r+ialD95QHmo6tcxrwTy/DlA+H1uI6B6h+sbN0c1KXTh1rYcg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"private": false,
|
||||
"main": "dist/index.js",
|
||||
@@ -89,10 +89,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-x64": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.20.0-beta.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.20.0-beta.2"
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.20.0-beta.2"
|
||||
version = "0.21.2-beta.0"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import { Schema } from "apache-arrow";
|
||||
import { Bool, Field, Int32, List, Schema, Struct, Utf8 } from "apache-arrow";
|
||||
|
||||
import * as arrow15 from "apache-arrow-15";
|
||||
import * as arrow16 from "apache-arrow-16";
|
||||
@@ -11,10 +11,12 @@ import * as arrow18 from "apache-arrow-18";
|
||||
import {
|
||||
convertToTable,
|
||||
fromBufferToRecordBatch,
|
||||
fromDataToBuffer,
|
||||
fromRecordBatchToBuffer,
|
||||
fromTableToBuffer,
|
||||
makeArrowTable,
|
||||
makeEmptyTable,
|
||||
tableFromIPC,
|
||||
} from "../lancedb/arrow";
|
||||
import {
|
||||
EmbeddingFunction,
|
||||
@@ -375,8 +377,221 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(table2.schema).toEqual(schema);
|
||||
});
|
||||
|
||||
it("will handle missing columns in schema alignment when using embeddings", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("domain", new Utf8(), true),
|
||||
new Field("name", new Utf8(), true),
|
||||
new Field("description", new Utf8(), true),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{ domain: "google.com", name: "Google" },
|
||||
{ domain: "facebook.com", name: "Facebook" },
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(3);
|
||||
expect(table.numRows).toBe(2);
|
||||
|
||||
const descriptionColumn = table.getChild("description");
|
||||
expect(descriptionColumn).toBeDefined();
|
||||
expect(descriptionColumn?.nullCount).toBe(2);
|
||||
expect(descriptionColumn?.toArray()).toEqual([null, null]);
|
||||
|
||||
expect(table.getChild("domain")?.toArray()).toEqual([
|
||||
"google.com",
|
||||
"facebook.com",
|
||||
]);
|
||||
expect(table.getChild("name")?.toArray()).toEqual([
|
||||
"Google",
|
||||
"Facebook",
|
||||
]);
|
||||
});
|
||||
|
||||
it("will handle completely missing nested struct columns", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("id", new Utf8(), true),
|
||||
new Field("name", new Utf8(), true),
|
||||
new Field(
|
||||
"metadata",
|
||||
new Struct([
|
||||
new Field("version", new Int32(), true),
|
||||
new Field("author", new Utf8(), true),
|
||||
new Field(
|
||||
"tags",
|
||||
new List(new Field("item", new Utf8(), true)),
|
||||
true,
|
||||
),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{ id: "doc1", name: "Document 1" },
|
||||
{ id: "doc2", name: "Document 2" },
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(3);
|
||||
expect(table.numRows).toBe(2);
|
||||
|
||||
const buf = await fromTableToBuffer(table);
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
|
||||
const rows = [];
|
||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
||||
rows.push(retrievedTable.get(i));
|
||||
}
|
||||
|
||||
expect(rows[0].metadata.version).toBe(null);
|
||||
expect(rows[0].metadata.author).toBe(null);
|
||||
expect(rows[0].metadata.tags).toBe(null);
|
||||
expect(rows[0].id).toBe("doc1");
|
||||
expect(rows[0].name).toBe("Document 1");
|
||||
});
|
||||
|
||||
it("will handle partially missing nested struct fields", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("id", new Utf8(), true),
|
||||
new Field(
|
||||
"metadata",
|
||||
new Struct([
|
||||
new Field("version", new Int32(), true),
|
||||
new Field("author", new Utf8(), true),
|
||||
new Field("created_at", new Utf8(), true),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{ id: "doc1", metadata: { version: 1, author: "Alice" } },
|
||||
{ id: "doc2", metadata: { version: 2 } },
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(2);
|
||||
expect(table.numRows).toBe(2);
|
||||
|
||||
const metadataColumn = table.getChild("metadata");
|
||||
expect(metadataColumn).toBeDefined();
|
||||
expect(metadataColumn?.type.toString()).toBe(
|
||||
"Struct<{version:Int32, author:Utf8, created_at:Utf8}>",
|
||||
);
|
||||
});
|
||||
|
||||
it("will handle multiple levels of nested structures", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("id", new Utf8(), true),
|
||||
new Field(
|
||||
"config",
|
||||
new Struct([
|
||||
new Field("database", new Utf8(), true),
|
||||
new Field(
|
||||
"connection",
|
||||
new Struct([
|
||||
new Field("host", new Utf8(), true),
|
||||
new Field("port", new Int32(), true),
|
||||
new Field(
|
||||
"ssl",
|
||||
new Struct([
|
||||
new Field("enabled", new Bool(), true),
|
||||
new Field("cert_path", new Utf8(), true),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{
|
||||
id: "config1",
|
||||
config: {
|
||||
database: "postgres",
|
||||
connection: { host: "localhost" },
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "config2",
|
||||
config: { database: "mysql" },
|
||||
},
|
||||
{
|
||||
id: "config3",
|
||||
},
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(2);
|
||||
expect(table.numRows).toBe(3);
|
||||
|
||||
const configColumn = table.getChild("config");
|
||||
expect(configColumn).toBeDefined();
|
||||
expect(configColumn?.type.toString()).toBe(
|
||||
"Struct<{database:Utf8, connection:Struct<{host:Utf8, port:Int32, ssl:Struct<{enabled:Bool, cert_path:Utf8}>}>}>",
|
||||
);
|
||||
});
|
||||
|
||||
it("will handle missing columns in Arrow table input when using embeddings", async function () {
|
||||
const incompleteTable = makeArrowTable([
|
||||
{ domain: "google.com", name: "Google" },
|
||||
{ domain: "facebook.com", name: "Facebook" },
|
||||
]);
|
||||
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("domain", new Utf8(), true),
|
||||
new Field("name", new Utf8(), true),
|
||||
new Field("description", new Utf8(), true),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const buf = await fromDataToBuffer(incompleteTable, undefined, schema);
|
||||
|
||||
expect(buf.byteLength).toBeGreaterThan(0);
|
||||
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
expect(retrievedTable.numCols).toBe(3);
|
||||
expect(retrievedTable.numRows).toBe(2);
|
||||
|
||||
const descriptionColumn = retrievedTable.getChild("description");
|
||||
expect(descriptionColumn).toBeDefined();
|
||||
expect(descriptionColumn?.nullCount).toBe(2);
|
||||
expect(descriptionColumn?.toArray()).toEqual([null, null]);
|
||||
|
||||
expect(retrievedTable.getChild("domain")?.toArray()).toEqual([
|
||||
"google.com",
|
||||
"facebook.com",
|
||||
]);
|
||||
expect(retrievedTable.getChild("name")?.toArray()).toEqual([
|
||||
"Google",
|
||||
"Facebook",
|
||||
]);
|
||||
});
|
||||
|
||||
it("should correctly retain values in nested struct fields", async function () {
|
||||
// Define test data with nested struct
|
||||
const testData = [
|
||||
{
|
||||
id: "doc1",
|
||||
@@ -400,10 +615,8 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
},
|
||||
];
|
||||
|
||||
// Create Arrow table from the data
|
||||
const table = makeArrowTable(testData);
|
||||
|
||||
// Verify schema has the nested struct fields
|
||||
const metadataField = table.schema.fields.find(
|
||||
(f) => f.name === "metadata",
|
||||
);
|
||||
@@ -417,23 +630,17 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
"text",
|
||||
]);
|
||||
|
||||
// Convert to buffer and back (simulating storage and retrieval)
|
||||
const buf = await fromTableToBuffer(table);
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
|
||||
// Verify the retrieved table has the same structure
|
||||
const rows = [];
|
||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
||||
rows.push(retrievedTable.get(i));
|
||||
}
|
||||
|
||||
// Check values in the first row
|
||||
const firstRow = rows[0];
|
||||
expect(firstRow.id).toBe("doc1");
|
||||
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
|
||||
|
||||
// Verify metadata values are preserved (this is where the bug is)
|
||||
expect(firstRow.metadata).toBeDefined();
|
||||
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
|
||||
expect(firstRow.metadata.startLine).toBe(10);
|
||||
expect(firstRow.metadata.endLine).toBe(20);
|
||||
@@ -592,14 +799,14 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
).rejects.toThrow("column vector was missing");
|
||||
});
|
||||
|
||||
it("will provide a nice error if run twice", async function () {
|
||||
it("will skip embedding application if already applied", async function () {
|
||||
const records = sampleRecords();
|
||||
const table = await convertToTable(records, dummyEmbeddingConfig);
|
||||
|
||||
// fromTableToBuffer will try and apply the embeddings again
|
||||
await expect(
|
||||
fromTableToBuffer(table, dummyEmbeddingConfig),
|
||||
).rejects.toThrow("already existed");
|
||||
// but should skip since the column already has non-null values
|
||||
const result = await fromTableToBuffer(table, dummyEmbeddingConfig);
|
||||
expect(result.byteLength).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -33,7 +33,12 @@ import {
|
||||
register,
|
||||
} from "../lancedb/embedding";
|
||||
import { Index } from "../lancedb/indices";
|
||||
import { instanceOfFullTextQuery } from "../lancedb/query";
|
||||
import {
|
||||
BooleanQuery,
|
||||
Occur,
|
||||
Operator,
|
||||
instanceOfFullTextQuery,
|
||||
} from "../lancedb/query";
|
||||
import exp = require("constants");
|
||||
|
||||
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
@@ -363,9 +368,9 @@ describe("merge insert", () => {
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
|
||||
expect(
|
||||
JSON.parse(JSON.stringify((await table.toArrow()).toArray())),
|
||||
).toEqual(expected);
|
||||
const result = (await table.toArrow()).toArray().sort((a, b) => a.a - b.a);
|
||||
|
||||
expect(result.map((row) => ({ ...row }))).toEqual(expected);
|
||||
});
|
||||
test("conditional update", async () => {
|
||||
const newData = [
|
||||
@@ -554,6 +559,32 @@ describe("When creating an index", () => {
|
||||
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
|
||||
expect(rst.numRows).toBe(1);
|
||||
|
||||
// test nprobes
|
||||
rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
|
||||
expect(rst.numRows).toBe(2);
|
||||
rst = await tbl
|
||||
.query()
|
||||
.nearestTo(queryVec)
|
||||
.limit(2)
|
||||
.minimumNprobes(15)
|
||||
.toArrow();
|
||||
expect(rst.numRows).toBe(2);
|
||||
rst = await tbl
|
||||
.query()
|
||||
.nearestTo(queryVec)
|
||||
.limit(2)
|
||||
.minimumNprobes(10)
|
||||
.maximumNprobes(20)
|
||||
.toArrow();
|
||||
expect(rst.numRows).toBe(2);
|
||||
|
||||
expect(() => tbl.query().nearestTo(queryVec).minimumNprobes(0)).toThrow(
|
||||
"Invalid input, minimum_nprobes must be greater than 0",
|
||||
);
|
||||
expect(() => tbl.query().nearestTo(queryVec).maximumNprobes(5)).toThrow(
|
||||
"Invalid input, maximum_nprobes must be greater than minimum_nprobes",
|
||||
);
|
||||
|
||||
await tbl.dropIndex("vec_idx");
|
||||
const indices2 = await tbl.listIndices();
|
||||
expect(indices2.length).toBe(0);
|
||||
@@ -1531,6 +1562,18 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
|
||||
const results = await table.search("hello").toArray();
|
||||
expect(results[0].text).toBe(data[0].text);
|
||||
|
||||
const results2 = await table
|
||||
.search(new MatchQuery("hello world", "text"))
|
||||
.toArray();
|
||||
expect(results2.length).toBe(2);
|
||||
|
||||
const results3 = await table
|
||||
.search(
|
||||
new MatchQuery("hello world", "text", { operator: Operator.And }),
|
||||
)
|
||||
.toArray();
|
||||
expect(results3.length).toBe(1);
|
||||
});
|
||||
|
||||
test("full text search without lowercase", async () => {
|
||||
@@ -1607,6 +1650,114 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(resultSet.has("fob")).toBe(true);
|
||||
expect(resultSet.has("fo")).toBe(true);
|
||||
expect(resultSet.has("food")).toBe(true);
|
||||
|
||||
const prefixResults = await table
|
||||
.search(
|
||||
new MatchQuery("foo", "text", { fuzziness: 3, prefixLength: 3 }),
|
||||
)
|
||||
.toArray();
|
||||
expect(prefixResults.length).toBe(2);
|
||||
const resultSet2 = new Set(prefixResults.map((r) => r.text));
|
||||
expect(resultSet2.has("foo")).toBe(true);
|
||||
expect(resultSet2.has("food")).toBe(true);
|
||||
});
|
||||
|
||||
test("full text search boolean query", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "The cat and dog are playing" },
|
||||
{ text: "The cat is sleeping" },
|
||||
{ text: "The dog is barking" },
|
||||
{ text: "The dog chases the cat" },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts({ withPosition: false }),
|
||||
});
|
||||
|
||||
const shouldResults = await table
|
||||
.search(
|
||||
new BooleanQuery([
|
||||
[Occur.Should, new MatchQuery("cat", "text")],
|
||||
[Occur.Should, new MatchQuery("dog", "text")],
|
||||
]),
|
||||
)
|
||||
.toArray();
|
||||
expect(shouldResults.length).toBe(4);
|
||||
|
||||
const mustResults = await table
|
||||
.search(
|
||||
new BooleanQuery([
|
||||
[Occur.Must, new MatchQuery("cat", "text")],
|
||||
[Occur.Must, new MatchQuery("dog", "text")],
|
||||
]),
|
||||
)
|
||||
.toArray();
|
||||
expect(mustResults.length).toBe(2);
|
||||
|
||||
const mustNotResults = await table
|
||||
.search(
|
||||
new BooleanQuery([
|
||||
[Occur.Must, new MatchQuery("cat", "text")],
|
||||
[Occur.MustNot, new MatchQuery("dog", "text")],
|
||||
]),
|
||||
)
|
||||
.toArray();
|
||||
expect(mustNotResults.length).toBe(1);
|
||||
});
|
||||
|
||||
test("full text search ngram", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
||||
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
|
||||
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts({ baseTokenizer: "ngram" }),
|
||||
});
|
||||
|
||||
const results = await table.search("lan").toArray();
|
||||
expect(results.length).toBe(2);
|
||||
const resultSet = new Set(results.map((r) => r.text));
|
||||
expect(resultSet.has("lance database")).toBe(true);
|
||||
expect(resultSet.has("lance is cool")).toBe(true);
|
||||
|
||||
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
|
||||
expect(results2.length).toBe(2);
|
||||
const resultSet2 = new Set(results2.map((r) => r.text));
|
||||
expect(resultSet2.has("lance database")).toBe(true);
|
||||
expect(resultSet2.has("lance is cool")).toBe(true);
|
||||
|
||||
// the default min_ngram_length is 3, so "la" should not match
|
||||
const results3 = await table.search("la").toArray();
|
||||
expect(results3.length).toBe(0);
|
||||
|
||||
// test setting min_ngram_length and prefix_only
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts({
|
||||
baseTokenizer: "ngram",
|
||||
ngramMinLength: 2,
|
||||
prefixOnly: true,
|
||||
}),
|
||||
replace: true,
|
||||
});
|
||||
|
||||
const results4 = await table.search("lan").toArray();
|
||||
expect(results4.length).toBe(2);
|
||||
const resultSet4 = new Set(results4.map((r) => r.text));
|
||||
expect(resultSet4.has("lance database")).toBe(true);
|
||||
expect(resultSet4.has("lance is cool")).toBe(true);
|
||||
|
||||
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
|
||||
expect(results5.length).toBe(0);
|
||||
|
||||
const results6 = await table.search("la").toArray();
|
||||
expect(results6.length).toBe(2);
|
||||
const resultSet6 = new Set(results6.map((r) => r.text));
|
||||
expect(resultSet6.has("lance database")).toBe(true);
|
||||
expect(resultSet6.has("lance is cool")).toBe(true);
|
||||
});
|
||||
|
||||
test.each([
|
||||
@@ -1712,4 +1863,43 @@ describe("column name options", () => {
|
||||
expect(results[0].query_index).toBe(0);
|
||||
expect(results[1].query_index).toBe(1);
|
||||
});
|
||||
|
||||
test("index and search multivectors", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [];
|
||||
// generate 512 random multivectors
|
||||
for (let i = 0; i < 256; i++) {
|
||||
data.push({
|
||||
multivector: Array.from({ length: 10 }, () =>
|
||||
Array(2).fill(Math.random()),
|
||||
),
|
||||
});
|
||||
}
|
||||
const table = await db.createTable("multivectors", data, {
|
||||
schema: new Schema([
|
||||
new Field(
|
||||
"multivector",
|
||||
new List(
|
||||
new Field(
|
||||
"item",
|
||||
new FixedSizeList(2, new Field("item", new Float32())),
|
||||
),
|
||||
),
|
||||
),
|
||||
]),
|
||||
});
|
||||
|
||||
const results = await table.search(data[0].multivector).limit(10).toArray();
|
||||
expect(results.length).toBe(10);
|
||||
|
||||
await table.createIndex("multivector", {
|
||||
config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
|
||||
});
|
||||
|
||||
const results2 = await table
|
||||
.search(data[0].multivector)
|
||||
.limit(10)
|
||||
.toArray();
|
||||
expect(results2.length).toBe(10);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -107,6 +107,20 @@ export type IntoVector =
|
||||
| number[]
|
||||
| Promise<Float32Array | Float64Array | number[]>;
|
||||
|
||||
export type MultiVector = IntoVector[];
|
||||
|
||||
export function isMultiVector(value: unknown): value is MultiVector {
|
||||
return Array.isArray(value) && isIntoVector(value[0]);
|
||||
}
|
||||
|
||||
export function isIntoVector(value: unknown): value is IntoVector {
|
||||
return (
|
||||
value instanceof Float32Array ||
|
||||
value instanceof Float64Array ||
|
||||
(Array.isArray(value) && !Array.isArray(value[0]))
|
||||
);
|
||||
}
|
||||
|
||||
export function isArrowTable(value: object): value is TableLike {
|
||||
if (value instanceof ArrowTable) return true;
|
||||
return "schema" in value && "batches" in value;
|
||||
@@ -417,7 +431,9 @@ function inferSchema(
|
||||
} else {
|
||||
const inferredType = inferType(value, path, opts);
|
||||
if (inferredType === undefined) {
|
||||
throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
|
||||
throw new Error(`Failed to infer data type for field ${path.join(
|
||||
".",
|
||||
)} at row ${rowI}. \
|
||||
Consider providing an explicit schema.`);
|
||||
}
|
||||
pathTree.set(path, inferredType);
|
||||
@@ -799,11 +815,17 @@ async function applyEmbeddingsFromMetadata(
|
||||
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
|
||||
);
|
||||
}
|
||||
|
||||
// Check if destination column exists and handle accordingly
|
||||
if (columns[destColumn] !== undefined) {
|
||||
throw new Error(
|
||||
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
|
||||
);
|
||||
const existingColumn = columns[destColumn];
|
||||
// If the column exists but is all null, we can fill it with embeddings
|
||||
if (existingColumn.nullCount !== existingColumn.length) {
|
||||
// Column has non-null values, skip embedding application
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (table.batches.length > 1) {
|
||||
throw new Error(
|
||||
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
||||
@@ -831,6 +853,15 @@ async function applyEmbeddingsFromMetadata(
|
||||
const vector = makeVector(vectors, destType);
|
||||
columns[destColumn] = vector;
|
||||
}
|
||||
|
||||
// Add any missing columns from the schema as null vectors
|
||||
for (const field of schema.fields) {
|
||||
if (!(field.name in columns)) {
|
||||
const nullValues = new Array(table.numRows).fill(null);
|
||||
columns[field.name] = makeVector(nullValues, field.type);
|
||||
}
|
||||
}
|
||||
|
||||
const newTable = new ArrowTable(columns);
|
||||
return alignTable(newTable, schema);
|
||||
}
|
||||
@@ -903,11 +934,23 @@ async function applyEmbeddings<T>(
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Check if destination column exists and handle accordingly
|
||||
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
|
||||
throw new Error(
|
||||
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
|
||||
);
|
||||
const existingColumn = newColumns[destColumn];
|
||||
// If the column exists but is all null, we can fill it with embeddings
|
||||
if (existingColumn.nullCount !== existingColumn.length) {
|
||||
// Column has non-null values, skip embedding application and return table as-is
|
||||
let newTable = new ArrowTable(newColumns);
|
||||
if (schema != null) {
|
||||
newTable = alignTable(newTable, schema as Schema);
|
||||
}
|
||||
return new ArrowTable(
|
||||
new Schema(newTable.schema.fields, schemaMetadata),
|
||||
newTable.batches,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (table.batches.length > 1) {
|
||||
throw new Error(
|
||||
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
||||
@@ -967,7 +1010,21 @@ export async function convertToTable(
|
||||
embeddings?: EmbeddingFunctionConfig,
|
||||
makeTableOptions?: Partial<MakeArrowTableOptions>,
|
||||
): Promise<ArrowTable> {
|
||||
const table = makeArrowTable(data, makeTableOptions);
|
||||
let processedData = data;
|
||||
|
||||
// If we have a schema with embedding metadata, we need to preprocess the data
|
||||
// to ensure all nested fields are present
|
||||
if (
|
||||
makeTableOptions?.schema &&
|
||||
makeTableOptions.schema.metadata?.has("embedding_functions")
|
||||
) {
|
||||
processedData = ensureNestedFieldsExist(
|
||||
data,
|
||||
makeTableOptions.schema as Schema,
|
||||
);
|
||||
}
|
||||
|
||||
const table = makeArrowTable(processedData, makeTableOptions);
|
||||
return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
|
||||
}
|
||||
|
||||
@@ -1060,7 +1117,16 @@ export async function fromDataToBuffer(
|
||||
schema = sanitizeSchema(schema);
|
||||
}
|
||||
if (isArrowTable(data)) {
|
||||
return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
|
||||
const table = sanitizeTable(data);
|
||||
// If we have a schema with embedding functions, we need to ensure all columns exist
|
||||
// before applying embeddings, since applyEmbeddingsFromMetadata expects all columns
|
||||
// to be present in the table
|
||||
if (schema && schema.metadata?.has("embedding_functions")) {
|
||||
const alignedTable = alignTableToSchema(table, schema);
|
||||
return fromTableToBuffer(alignedTable, embeddings, schema);
|
||||
} else {
|
||||
return fromTableToBuffer(table, embeddings, schema);
|
||||
}
|
||||
} else {
|
||||
const table = await convertToTable(data, embeddings, { schema });
|
||||
return fromTableToBuffer(table);
|
||||
@@ -1129,7 +1195,7 @@ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
|
||||
type: new Struct(schema.fields),
|
||||
length: batch.numRows,
|
||||
nullCount: batch.nullCount,
|
||||
children: alignedChildren,
|
||||
children: alignedChildren as unknown as ArrowData<DataType>[],
|
||||
});
|
||||
return new RecordBatch(schema, newData);
|
||||
}
|
||||
@@ -1201,6 +1267,79 @@ function validateSchemaEmbeddings(
|
||||
return new Schema(fields, schema.metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that all nested fields defined in the schema exist in the data,
|
||||
* filling missing fields with null values.
|
||||
*/
|
||||
export function ensureNestedFieldsExist(
|
||||
data: Array<Record<string, unknown>>,
|
||||
schema: Schema,
|
||||
): Array<Record<string, unknown>> {
|
||||
return data.map((row) => {
|
||||
const completeRow: Record<string, unknown> = {};
|
||||
|
||||
for (const field of schema.fields) {
|
||||
if (field.name in row) {
|
||||
if (
|
||||
field.type.constructor.name === "Struct" &&
|
||||
row[field.name] !== null &&
|
||||
row[field.name] !== undefined
|
||||
) {
|
||||
// Handle nested struct
|
||||
const nestedValue = row[field.name] as Record<string, unknown>;
|
||||
completeRow[field.name] = ensureStructFieldsExist(
|
||||
nestedValue,
|
||||
field.type,
|
||||
);
|
||||
} else {
|
||||
// Non-struct field or null struct value
|
||||
completeRow[field.name] = row[field.name];
|
||||
}
|
||||
} else {
|
||||
// Field is missing from the data - set to null
|
||||
completeRow[field.name] = null;
|
||||
}
|
||||
}
|
||||
|
||||
return completeRow;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively ensures that all fields in a struct type exist in the data,
|
||||
* filling missing fields with null values.
|
||||
*/
|
||||
function ensureStructFieldsExist(
|
||||
data: Record<string, unknown>,
|
||||
structType: Struct,
|
||||
): Record<string, unknown> {
|
||||
const completeStruct: Record<string, unknown> = {};
|
||||
|
||||
for (const childField of structType.children) {
|
||||
if (childField.name in data) {
|
||||
if (
|
||||
childField.type.constructor.name === "Struct" &&
|
||||
data[childField.name] !== null &&
|
||||
data[childField.name] !== undefined
|
||||
) {
|
||||
// Recursively handle nested struct
|
||||
completeStruct[childField.name] = ensureStructFieldsExist(
|
||||
data[childField.name] as Record<string, unknown>,
|
||||
childField.type,
|
||||
);
|
||||
} else {
|
||||
// Non-struct field or null struct value
|
||||
completeStruct[childField.name] = data[childField.name];
|
||||
}
|
||||
} else {
|
||||
// Field is missing - set to null
|
||||
completeStruct[childField.name] = null;
|
||||
}
|
||||
}
|
||||
|
||||
return completeStruct;
|
||||
}
|
||||
|
||||
interface JsonDataType {
|
||||
type: string;
|
||||
fields?: JsonField[];
|
||||
@@ -1334,3 +1473,64 @@ function fieldToJson(field: Field): JsonField {
|
||||
metadata: field.metadata,
|
||||
};
|
||||
}
|
||||
|
||||
function alignTableToSchema(
|
||||
table: ArrowTable,
|
||||
targetSchema: Schema,
|
||||
): ArrowTable {
|
||||
const existingColumns = new Map<string, Vector>();
|
||||
|
||||
// Map existing columns
|
||||
for (const field of table.schema.fields) {
|
||||
existingColumns.set(field.name, table.getChild(field.name)!);
|
||||
}
|
||||
|
||||
// Create vectors for all fields in target schema
|
||||
const alignedColumns: Record<string, Vector> = {};
|
||||
|
||||
for (const field of targetSchema.fields) {
|
||||
if (existingColumns.has(field.name)) {
|
||||
// Column exists, use it
|
||||
alignedColumns[field.name] = existingColumns.get(field.name)!;
|
||||
} else {
|
||||
// Column missing, create null vector
|
||||
alignedColumns[field.name] = createNullVector(field, table.numRows);
|
||||
}
|
||||
}
|
||||
|
||||
// Create new table with aligned schema and columns
|
||||
return new ArrowTable(targetSchema, alignedColumns);
|
||||
}
|
||||
|
||||
function createNullVector(field: Field, numRows: number): Vector {
|
||||
if (field.type.constructor.name === "Struct") {
|
||||
// For struct types, create a struct with null fields
|
||||
const structType = field.type as Struct;
|
||||
const childVectors = structType.children.map((childField) =>
|
||||
createNullVector(childField, numRows),
|
||||
);
|
||||
|
||||
// Create struct data
|
||||
const structData = makeData({
|
||||
type: structType,
|
||||
length: numRows,
|
||||
nullCount: 0,
|
||||
children: childVectors.map((v) => v.data[0]),
|
||||
});
|
||||
|
||||
return arrowMakeVector(structData);
|
||||
} else {
|
||||
// For other types, create a vector of nulls
|
||||
const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));
|
||||
// All bits are 0, meaning all values are null
|
||||
|
||||
const data = makeData({
|
||||
type: field.type,
|
||||
length: numRows,
|
||||
nullCount: numRows,
|
||||
nullBitmap,
|
||||
});
|
||||
|
||||
return arrowMakeVector(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,7 +64,10 @@ export {
|
||||
PhraseQuery,
|
||||
BoostQuery,
|
||||
MultiMatchQuery,
|
||||
BooleanQuery,
|
||||
FullTextQueryType,
|
||||
Operator,
|
||||
Occur,
|
||||
} from "./query";
|
||||
|
||||
export {
|
||||
@@ -97,6 +100,7 @@ export {
|
||||
RecordBatchLike,
|
||||
DataLike,
|
||||
IntoVector,
|
||||
MultiVector,
|
||||
} from "./arrow";
|
||||
export { IntoSql, packBits } from "./util";
|
||||
|
||||
|
||||
@@ -439,7 +439,7 @@ export interface FtsOptions {
|
||||
*
|
||||
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
|
||||
*/
|
||||
baseTokenizer?: "simple" | "whitespace" | "raw";
|
||||
baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
|
||||
|
||||
/**
|
||||
* language for stemming and stop words
|
||||
@@ -472,6 +472,21 @@ export interface FtsOptions {
|
||||
* whether to remove punctuation
|
||||
*/
|
||||
asciiFolding?: boolean;
|
||||
|
||||
/**
|
||||
* ngram min length
|
||||
*/
|
||||
ngramMinLength?: number;
|
||||
|
||||
/**
|
||||
* ngram max length
|
||||
*/
|
||||
ngramMaxLength?: number;
|
||||
|
||||
/**
|
||||
* whether to only index the prefix of the token for ngram tokenizer
|
||||
*/
|
||||
prefixOnly?: boolean;
|
||||
}
|
||||
|
||||
export class Index {
|
||||
@@ -608,6 +623,9 @@ export class Index {
|
||||
options?.stem,
|
||||
options?.removeStopWords,
|
||||
options?.asciiFolding,
|
||||
options?.ngramMinLength,
|
||||
options?.ngramMaxLength,
|
||||
options?.prefixOnly,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -448,6 +448,10 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
||||
* For best results we recommend tuning this parameter with a benchmark against
|
||||
* your actual data to find the smallest possible value that will still give
|
||||
* you the desired recall.
|
||||
*
|
||||
* For more fine grained control over behavior when you have a very narrow filter
|
||||
* you can use `minimumNprobes` and `maximumNprobes`. This method sets both
|
||||
* the minimum and maximum to the same value.
|
||||
*/
|
||||
nprobes(nprobes: number): VectorQuery {
|
||||
super.doCall((inner) => inner.nprobes(nprobes));
|
||||
@@ -455,6 +459,33 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the minimum number of probes used.
|
||||
*
|
||||
* This controls the minimum number of partitions that will be searched. This
|
||||
* parameter will impact every query against a vector index, regardless of the
|
||||
* filter. See `nprobes` for more details. Higher values will increase recall
|
||||
* but will also increase latency.
|
||||
*/
|
||||
minimumNprobes(minimumNprobes: number): VectorQuery {
|
||||
super.doCall((inner) => inner.minimumNprobes(minimumNprobes));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the maximum number of probes used.
|
||||
*
|
||||
* This controls the maximum number of partitions that will be searched. If this
|
||||
* number is greater than minimumNprobes then the excess partitions will _only_ be
|
||||
* searched if we have not found enough results. This can be useful when there is
|
||||
* a narrow filter to allow these queries to spend more time searching and avoid
|
||||
* potential false negatives.
|
||||
*/
|
||||
maximumNprobes(maximumNprobes: number): VectorQuery {
|
||||
super.doCall((inner) => inner.maximumNprobes(maximumNprobes));
|
||||
return this;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the distance range to use
|
||||
*
|
||||
@@ -762,6 +793,31 @@ export enum FullTextQueryType {
|
||||
MatchPhrase = "match_phrase",
|
||||
Boost = "boost",
|
||||
MultiMatch = "multi_match",
|
||||
Boolean = "boolean",
|
||||
}
|
||||
|
||||
/**
|
||||
* Enum representing the logical operators used in full-text queries.
|
||||
*
|
||||
* - `And`: All terms must match.
|
||||
* - `Or`: At least one term must match.
|
||||
*/
|
||||
export enum Operator {
|
||||
And = "AND",
|
||||
Or = "OR",
|
||||
}
|
||||
|
||||
/**
|
||||
* Enum representing the occurrence of terms in full-text queries.
|
||||
*
|
||||
* - `Must`: The term must be present in the document.
|
||||
* - `Should`: The term should contribute to the document score, but is not required.
|
||||
* - `MustNot`: The term must not be present in the document.
|
||||
*/
|
||||
export enum Occur {
|
||||
Should = "SHOULD",
|
||||
Must = "MUST",
|
||||
MustNot = "MUST_NOT",
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -791,6 +847,7 @@ export function instanceOfFullTextQuery(obj: any): obj is FullTextQuery {
|
||||
export class MatchQuery implements FullTextQuery {
|
||||
/** @ignore */
|
||||
public readonly inner: JsFullTextQuery;
|
||||
|
||||
/**
|
||||
* Creates an instance of MatchQuery.
|
||||
*
|
||||
@@ -800,6 +857,8 @@ export class MatchQuery implements FullTextQuery {
|
||||
* - `boost`: The boost factor for the query (default is 1.0).
|
||||
* - `fuzziness`: The fuzziness level for the query (default is 0).
|
||||
* - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
||||
* - `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
* - `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
|
||||
*/
|
||||
constructor(
|
||||
query: string,
|
||||
@@ -808,6 +867,8 @@ export class MatchQuery implements FullTextQuery {
|
||||
boost?: number;
|
||||
fuzziness?: number;
|
||||
maxExpansions?: number;
|
||||
operator?: Operator;
|
||||
prefixLength?: number;
|
||||
},
|
||||
) {
|
||||
let fuzziness = options?.fuzziness;
|
||||
@@ -820,6 +881,8 @@ export class MatchQuery implements FullTextQuery {
|
||||
options?.boost ?? 1.0,
|
||||
fuzziness,
|
||||
options?.maxExpansions ?? 50,
|
||||
options?.operator ?? Operator.Or,
|
||||
options?.prefixLength ?? 0,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -836,9 +899,11 @@ export class PhraseQuery implements FullTextQuery {
|
||||
*
|
||||
* @param query - The phrase to search for in the specified column.
|
||||
* @param column - The name of the column to search within.
|
||||
* @param options - Optional parameters for the phrase query.
|
||||
* - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
|
||||
*/
|
||||
constructor(query: string, column: string) {
|
||||
this.inner = JsFullTextQuery.phraseQuery(query, column);
|
||||
constructor(query: string, column: string, options?: { slop?: number }) {
|
||||
this.inner = JsFullTextQuery.phraseQuery(query, column, options?.slop ?? 0);
|
||||
}
|
||||
|
||||
queryType(): FullTextQueryType {
|
||||
@@ -889,18 +954,21 @@ export class MultiMatchQuery implements FullTextQuery {
|
||||
* @param columns - An array of column names to search within.
|
||||
* @param options - Optional parameters for the multi-match query.
|
||||
* - `boosts`: An array of boost factors for each column (default is 1.0 for all).
|
||||
* - `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
*/
|
||||
constructor(
|
||||
query: string,
|
||||
columns: string[],
|
||||
options?: {
|
||||
boosts?: number[];
|
||||
operator?: Operator;
|
||||
},
|
||||
) {
|
||||
this.inner = JsFullTextQuery.multiMatchQuery(
|
||||
query,
|
||||
columns,
|
||||
options?.boosts,
|
||||
options?.operator ?? Operator.Or,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -908,3 +976,23 @@ export class MultiMatchQuery implements FullTextQuery {
|
||||
return FullTextQueryType.MultiMatch;
|
||||
}
|
||||
}
|
||||
|
||||
export class BooleanQuery implements FullTextQuery {
|
||||
/** @ignore */
|
||||
public readonly inner: JsFullTextQuery;
|
||||
/**
|
||||
* Creates an instance of BooleanQuery.
|
||||
*
|
||||
* @param queries - An array of (Occur, FullTextQuery objects) to combine.
|
||||
* Occur specifies whether the query must match, or should match.
|
||||
*/
|
||||
constructor(queries: [Occur, FullTextQuery][]) {
|
||||
this.inner = JsFullTextQuery.booleanQuery(
|
||||
queries.map(([occur, query]) => [occur, query.inner]),
|
||||
);
|
||||
}
|
||||
|
||||
queryType(): FullTextQueryType {
|
||||
return FullTextQueryType.Boolean;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,9 +6,11 @@ import {
|
||||
Data,
|
||||
DataType,
|
||||
IntoVector,
|
||||
MultiVector,
|
||||
Schema,
|
||||
dataTypeToJson,
|
||||
fromDataToBuffer,
|
||||
isMultiVector,
|
||||
tableFromIPC,
|
||||
} from "./arrow";
|
||||
|
||||
@@ -75,10 +77,10 @@ export interface OptimizeOptions {
|
||||
* // Delete all versions older than 1 day
|
||||
* const olderThan = new Date();
|
||||
* olderThan.setDate(olderThan.getDate() - 1));
|
||||
* tbl.cleanupOlderVersions(olderThan);
|
||||
* tbl.optimize({cleanupOlderThan: olderThan});
|
||||
*
|
||||
* // Delete all versions except the current version
|
||||
* tbl.cleanupOlderVersions(new Date());
|
||||
* tbl.optimize({cleanupOlderThan: new Date()});
|
||||
*/
|
||||
cleanupOlderThan: Date;
|
||||
deleteUnverified: boolean;
|
||||
@@ -346,7 +348,7 @@ export abstract class Table {
|
||||
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
|
||||
*/
|
||||
abstract search(
|
||||
query: string | IntoVector | FullTextQuery,
|
||||
query: string | IntoVector | MultiVector | FullTextQuery,
|
||||
queryType?: string,
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query;
|
||||
@@ -357,7 +359,7 @@ export abstract class Table {
|
||||
* is the same thing as calling `nearestTo` on the builder returned
|
||||
* by `query`. @see {@link Query#nearestTo} for more details.
|
||||
*/
|
||||
abstract vectorSearch(vector: IntoVector): VectorQuery;
|
||||
abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
|
||||
/**
|
||||
* Add new columns with defined values.
|
||||
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
|
||||
@@ -668,7 +670,7 @@ export class LocalTable extends Table {
|
||||
}
|
||||
|
||||
search(
|
||||
query: string | IntoVector | FullTextQuery,
|
||||
query: string | IntoVector | MultiVector | FullTextQuery,
|
||||
queryType: string = "auto",
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query {
|
||||
@@ -715,7 +717,15 @@ export class LocalTable extends Table {
|
||||
return this.query().nearestTo(queryPromise);
|
||||
}
|
||||
|
||||
vectorSearch(vector: IntoVector): VectorQuery {
|
||||
vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
|
||||
if (isMultiVector(vector)) {
|
||||
const query = this.query().nearestTo(vector[0]);
|
||||
for (const v of vector.slice(1)) {
|
||||
query.addQueryVector(v);
|
||||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
return this.query().nearestTo(vector);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.20.0-beta.2",
|
||||
"version": "0.21.2-beta.0",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -123,6 +123,9 @@ impl Index {
|
||||
stem: Option<bool>,
|
||||
remove_stop_words: Option<bool>,
|
||||
ascii_folding: Option<bool>,
|
||||
ngram_min_length: Option<u32>,
|
||||
ngram_max_length: Option<u32>,
|
||||
prefix_only: Option<bool>,
|
||||
) -> Self {
|
||||
let mut opts = FtsIndexBuilder::default();
|
||||
if let Some(with_position) = with_position {
|
||||
@@ -149,6 +152,15 @@ impl Index {
|
||||
if let Some(ascii_folding) = ascii_folding {
|
||||
opts = opts.ascii_folding(ascii_folding);
|
||||
}
|
||||
if let Some(ngram_min_length) = ngram_min_length {
|
||||
opts = opts.ngram_min_length(ngram_min_length);
|
||||
}
|
||||
if let Some(ngram_max_length) = ngram_max_length {
|
||||
opts = opts.ngram_max_length(ngram_max_length);
|
||||
}
|
||||
if let Some(prefix_only) = prefix_only {
|
||||
opts = opts.ngram_prefix_only(prefix_only);
|
||||
}
|
||||
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||
|
||||
@@ -4,7 +4,8 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use lancedb::index::scalar::{
|
||||
BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, PhraseQuery,
|
||||
BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
|
||||
Operator, PhraseQuery,
|
||||
};
|
||||
use lancedb::query::ExecutableQuery;
|
||||
use lancedb::query::Query as LanceDbQuery;
|
||||
@@ -177,6 +178,31 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().nprobes(nprobe as usize);
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn minimum_nprobes(&mut self, minimum_nprobe: u32) -> napi::Result<()> {
|
||||
self.inner = self
|
||||
.inner
|
||||
.clone()
|
||||
.minimum_nprobes(minimum_nprobe as usize)
|
||||
.default_error()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn maximum_nprobes(&mut self, maximum_nprobes: u32) -> napi::Result<()> {
|
||||
let maximum_nprobes = if maximum_nprobes == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(maximum_nprobes as usize)
|
||||
};
|
||||
self.inner = self
|
||||
.inner
|
||||
.clone()
|
||||
.maximum_nprobes(maximum_nprobes)
|
||||
.default_error()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn distance_range(&mut self, lower_bound: Option<f64>, upper_bound: Option<f64>) {
|
||||
// napi doesn't support f32, so we have to convert to f32
|
||||
@@ -308,6 +334,8 @@ impl JsFullTextQuery {
|
||||
boost: f64,
|
||||
fuzziness: Option<u32>,
|
||||
max_expansions: u32,
|
||||
operator: String,
|
||||
prefix_length: u32,
|
||||
) -> napi::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: MatchQuery::new(query)
|
||||
@@ -315,14 +343,23 @@ impl JsFullTextQuery {
|
||||
.with_boost(boost as f32)
|
||||
.with_fuzziness(fuzziness)
|
||||
.with_max_expansions(max_expansions as usize)
|
||||
.with_operator(
|
||||
Operator::try_from(operator.as_str()).map_err(|e| {
|
||||
napi::Error::from_reason(format!("Invalid operator: {}", e))
|
||||
})?,
|
||||
)
|
||||
.with_prefix_length(prefix_length)
|
||||
.into(),
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn phrase_query(query: String, column: String) -> napi::Result<Self> {
|
||||
pub fn phrase_query(query: String, column: String, slop: u32) -> napi::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: PhraseQuery::new(query).with_column(Some(column)).into(),
|
||||
inner: PhraseQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_slop(slop)
|
||||
.into(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -348,6 +385,7 @@ impl JsFullTextQuery {
|
||||
query: String,
|
||||
columns: Vec<String>,
|
||||
boosts: Option<Vec<f64>>,
|
||||
operator: String,
|
||||
) -> napi::Result<Self> {
|
||||
let q = match boosts {
|
||||
Some(boosts) => MultiMatchQuery::try_new(query, columns)
|
||||
@@ -358,7 +396,37 @@ impl JsFullTextQuery {
|
||||
napi::Error::from_reason(format!("Failed to create multi match query: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(Self { inner: q.into() })
|
||||
let operator = Operator::try_from(operator.as_str()).map_err(|e| {
|
||||
napi::Error::from_reason(format!("Invalid operator for multi match query: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(Self {
|
||||
inner: q.with_operator(operator).into(),
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn boolean_query(queries: Vec<(String, &JsFullTextQuery)>) -> napi::Result<Self> {
|
||||
let mut sub_queries = Vec::with_capacity(queries.len());
|
||||
for (occur, q) in queries {
|
||||
let occur = Occur::try_from(occur.as_str())
|
||||
.map_err(|e| napi::Error::from_reason(e.to_string()))?;
|
||||
sub_queries.push((occur, q.inner.clone()));
|
||||
}
|
||||
Ok(Self {
|
||||
inner: BooleanQuery::new(sub_queries).into(),
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(getter)]
|
||||
pub fn query_type(&self) -> String {
|
||||
match self.inner {
|
||||
FtsQuery::Match(_) => "match".to_string(),
|
||||
FtsQuery::Phrase(_) => "phrase".to_string(),
|
||||
FtsQuery::Boost(_) => "boost".to_string(),
|
||||
FtsQuery::MultiMatch(_) => "multi_match".to_string(),
|
||||
FtsQuery::Boolean(_) => "boolean".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.23.0"
|
||||
current_version = "0.24.2-beta.1"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.23.0"
|
||||
version = "0.24.2-beta.1"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -85,7 +85,7 @@ embeddings = [
|
||||
"boto3>=1.28.57",
|
||||
"awscli>=1.29.57",
|
||||
"botocore>=1.31.57",
|
||||
"ollama",
|
||||
"ollama>=0.3.0",
|
||||
"ibm-watsonx-ai>=1.1.2",
|
||||
]
|
||||
azure = ["adlfs>=2024.2.0"]
|
||||
|
||||
@@ -143,6 +143,8 @@ class VectorQuery:
|
||||
def postfilter(self): ...
|
||||
def refine_factor(self, refine_factor: int): ...
|
||||
def nprobes(self, nprobes: int): ...
|
||||
def minimum_nprobes(self, minimum_nprobes: int): ...
|
||||
def maximum_nprobes(self, maximum_nprobes: int): ...
|
||||
def bypass_vector_index(self): ...
|
||||
def nearest_to_text(self, query: dict) -> HybridQuery: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
@@ -158,6 +160,8 @@ class HybridQuery:
|
||||
def distance_type(self, distance_type: str): ...
|
||||
def refine_factor(self, refine_factor: int): ...
|
||||
def nprobes(self, nprobes: int): ...
|
||||
def minimum_nprobes(self, minimum_nprobes: int): ...
|
||||
def maximum_nprobes(self, maximum_nprobes: int): ...
|
||||
def bypass_vector_index(self): ...
|
||||
def to_vector_query(self) -> VectorQuery: ...
|
||||
def to_fts_query(self) -> FTSQuery: ...
|
||||
@@ -165,23 +169,21 @@ class HybridQuery:
|
||||
def get_with_row_id(self) -> bool: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
|
||||
class PyFullTextSearchQuery:
|
||||
columns: Optional[List[str]]
|
||||
query: str
|
||||
limit: Optional[int]
|
||||
wand_factor: Optional[float]
|
||||
class FullTextQuery:
|
||||
pass
|
||||
|
||||
class PyQueryRequest:
|
||||
limit: Optional[int]
|
||||
offset: Optional[int]
|
||||
filter: Optional[Union[str, bytes]]
|
||||
full_text_search: Optional[PyFullTextSearchQuery]
|
||||
full_text_search: Optional[FullTextQuery]
|
||||
select: Optional[Union[str, List[str]]]
|
||||
fast_search: Optional[bool]
|
||||
with_row_id: Optional[bool]
|
||||
column: Optional[str]
|
||||
query_vector: Optional[List[pa.Array]]
|
||||
nprobes: Optional[int]
|
||||
minimum_nprobes: Optional[int]
|
||||
maximum_nprobes: Optional[int]
|
||||
lower_bound: Optional[float]
|
||||
upper_bound: Optional[float]
|
||||
ef: Optional[int]
|
||||
|
||||
@@ -2,14 +2,15 @@
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, List, Optional, Sequence, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..util import attempt_import_or_raise
|
||||
from .base import TextEmbeddingFunction
|
||||
from .registry import register
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
import ollama
|
||||
|
||||
|
||||
@@ -28,23 +29,21 @@ class OllamaEmbeddings(TextEmbeddingFunction):
|
||||
keep_alive: Optional[Union[float, str]] = None
|
||||
ollama_client_kwargs: Optional[dict] = {}
|
||||
|
||||
def ndims(self):
|
||||
def ndims(self) -> int:
|
||||
return len(self.generate_embeddings(["foo"])[0])
|
||||
|
||||
def _compute_embedding(self, text) -> Union["np.array", None]:
|
||||
return (
|
||||
self._ollama_client.embeddings(
|
||||
model=self.name,
|
||||
prompt=text,
|
||||
options=self.options,
|
||||
keep_alive=self.keep_alive,
|
||||
)["embedding"]
|
||||
or None
|
||||
def _compute_embedding(self, text: Sequence[str]) -> Sequence[Sequence[float]]:
|
||||
response = self._ollama_client.embed(
|
||||
model=self.name,
|
||||
input=text,
|
||||
options=self.options,
|
||||
keep_alive=self.keep_alive,
|
||||
)
|
||||
return response.embeddings
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], "np.ndarray"]
|
||||
) -> list[Union["np.array", None]]:
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> list[Union[np.array, None]]:
|
||||
"""
|
||||
Get the embeddings for the given texts
|
||||
|
||||
@@ -54,8 +53,8 @@ class OllamaEmbeddings(TextEmbeddingFunction):
|
||||
The texts to embed
|
||||
"""
|
||||
# TODO retry, rate limit, token limit
|
||||
embeddings = [self._compute_embedding(text) for text in texts]
|
||||
return embeddings
|
||||
embeddings = self._compute_embedding(texts)
|
||||
return list(embeddings)
|
||||
|
||||
@cached_property
|
||||
def _ollama_client(self) -> "ollama.Client":
|
||||
|
||||
@@ -137,6 +137,9 @@ class FTS:
|
||||
stem: bool = True
|
||||
remove_stop_words: bool = True
|
||||
ascii_folding: bool = True
|
||||
ngram_min_length: int = 3
|
||||
ngram_max_length: int = 3
|
||||
prefix_only: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import abc
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from enum import Enum
|
||||
from datetime import timedelta
|
||||
@@ -88,15 +87,28 @@ def ensure_vector_query(
|
||||
return val
|
||||
|
||||
|
||||
class FullTextQueryType(Enum):
|
||||
class FullTextQueryType(str, Enum):
|
||||
MATCH = "match"
|
||||
MATCH_PHRASE = "match_phrase"
|
||||
BOOST = "boost"
|
||||
MULTI_MATCH = "multi_match"
|
||||
BOOLEAN = "boolean"
|
||||
|
||||
|
||||
class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
||||
@abc.abstractmethod
|
||||
class FullTextOperator(str, Enum):
|
||||
AND = "AND"
|
||||
OR = "OR"
|
||||
|
||||
|
||||
class Occur(str, Enum):
|
||||
SHOULD = "SHOULD"
|
||||
MUST = "MUST"
|
||||
MUST_NOT = "MUST_NOT"
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class FullTextQuery(ABC):
|
||||
@abstractmethod
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
"""
|
||||
Get the query type of the query.
|
||||
@@ -106,193 +118,178 @@ class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
||||
str
|
||||
The type of the query.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def to_dict(self) -> dict:
|
||||
def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
|
||||
"""
|
||||
Convert the query to a dictionary.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
The query as a dictionary.
|
||||
"""
|
||||
|
||||
|
||||
class MatchQuery(FullTextQuery):
|
||||
query: str
|
||||
column: str
|
||||
boost: float = 1.0
|
||||
fuzziness: int = 0
|
||||
max_expansions: int = 50
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query: str,
|
||||
column: str,
|
||||
*,
|
||||
boost: float = 1.0,
|
||||
fuzziness: int = 0,
|
||||
max_expansions: int = 50,
|
||||
):
|
||||
"""
|
||||
Match query for full-text search.
|
||||
Combine two queries with a logical AND operation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
boost : float, default 1.0
|
||||
The boost factor for the query.
|
||||
The score of each matching document is multiplied by this value.
|
||||
fuzziness : int, optional
|
||||
The maximum edit distance for each term in the match query.
|
||||
Defaults to 0 (exact match).
|
||||
If None, fuzziness is applied automatically by the rules:
|
||||
- 0 for terms with length <= 2
|
||||
- 1 for terms with length <= 5
|
||||
- 2 for terms with length > 5
|
||||
max_expansions : int, optional
|
||||
The maximum number of terms to consider for fuzzy matching.
|
||||
Defaults to 50.
|
||||
other : FullTextQuery
|
||||
The other query to combine with.
|
||||
|
||||
Returns
|
||||
-------
|
||||
FullTextQuery
|
||||
A new query that combines both queries with AND.
|
||||
"""
|
||||
super().__init__(
|
||||
query=query,
|
||||
column=column,
|
||||
boost=boost,
|
||||
fuzziness=fuzziness,
|
||||
max_expansions=max_expansions,
|
||||
)
|
||||
return BooleanQuery([(Occur.MUST, self), (Occur.MUST, other)])
|
||||
|
||||
def __or__(self, other: "FullTextQuery") -> "FullTextQuery":
|
||||
"""
|
||||
Combine two queries with a logical OR operation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
other : FullTextQuery
|
||||
The other query to combine with.
|
||||
|
||||
Returns
|
||||
-------
|
||||
FullTextQuery
|
||||
A new query that combines both queries with OR.
|
||||
"""
|
||||
return BooleanQuery([(Occur.SHOULD, self), (Occur.SHOULD, other)])
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class MatchQuery(FullTextQuery):
|
||||
"""
|
||||
Match query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
boost : float, default 1.0
|
||||
The boost factor for the query.
|
||||
The score of each matching document is multiplied by this value.
|
||||
fuzziness : int, optional
|
||||
The maximum edit distance for each term in the match query.
|
||||
Defaults to 0 (exact match).
|
||||
If None, fuzziness is applied automatically by the rules:
|
||||
- 0 for terms with length <= 2
|
||||
- 1 for terms with length <= 5
|
||||
- 2 for terms with length > 5
|
||||
max_expansions : int, optional
|
||||
The maximum number of terms to consider for fuzzy matching.
|
||||
Defaults to 50.
|
||||
operator : FullTextOperator, default OR
|
||||
The operator to use for combining the query results.
|
||||
Can be either `AND` or `OR`.
|
||||
If `AND`, all terms in the query must match.
|
||||
If `OR`, at least one term in the query must match.
|
||||
prefix_length : int, optional
|
||||
The number of beginning characters being unchanged for fuzzy matching.
|
||||
This is useful to achieve prefix matching.
|
||||
"""
|
||||
|
||||
query: str
|
||||
column: str
|
||||
boost: float = pydantic.Field(1.0, kw_only=True)
|
||||
fuzziness: int = pydantic.Field(0, kw_only=True)
|
||||
max_expansions: int = pydantic.Field(50, kw_only=True)
|
||||
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
||||
prefix_length: int = pydantic.Field(0, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MATCH
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"match": {
|
||||
self.column: {
|
||||
"query": self.query,
|
||||
"boost": self.boost,
|
||||
"fuzziness": self.fuzziness,
|
||||
"max_expansions": self.max_expansions,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class PhraseQuery(FullTextQuery):
|
||||
"""
|
||||
Phrase query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
"""
|
||||
|
||||
query: str
|
||||
column: str
|
||||
|
||||
def __init__(self, query: str, column: str):
|
||||
"""
|
||||
Phrase query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
"""
|
||||
super().__init__(query=query, column=column)
|
||||
slop: int = pydantic.Field(0, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MATCH_PHRASE
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"match_phrase": {
|
||||
self.column: self.query,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class BoostQuery(FullTextQuery):
|
||||
"""
|
||||
Boost query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
positive : dict
|
||||
The positive query object.
|
||||
negative : dict
|
||||
The negative query object.
|
||||
negative_boost : float, default 0.5
|
||||
The boost factor for the negative query.
|
||||
"""
|
||||
|
||||
positive: FullTextQuery
|
||||
negative: FullTextQuery
|
||||
negative_boost: float = 0.5
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
positive: FullTextQuery,
|
||||
negative: FullTextQuery,
|
||||
*,
|
||||
negative_boost: float = 0.5,
|
||||
):
|
||||
"""
|
||||
Boost query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
positive : dict
|
||||
The positive query object.
|
||||
negative : dict
|
||||
The negative query object.
|
||||
negative_boost : float
|
||||
The boost factor for the negative query.
|
||||
"""
|
||||
super().__init__(
|
||||
positive=positive, negative=negative, negative_boost=negative_boost
|
||||
)
|
||||
negative_boost: float = pydantic.Field(0.5, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.BOOST
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"boost": {
|
||||
"positive": self.positive.to_dict(),
|
||||
"negative": self.negative.to_dict(),
|
||||
"negative_boost": self.negative_boost,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class MultiMatchQuery(FullTextQuery):
|
||||
"""
|
||||
Multi-match query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str | list[Query]
|
||||
If a string, the query string to match against.
|
||||
columns : list[str]
|
||||
The list of columns to match against.
|
||||
boosts : list[float], optional
|
||||
The list of boost factors for each column. If not provided,
|
||||
all columns will have the same boost factor.
|
||||
operator : FullTextOperator, default OR
|
||||
The operator to use for combining the query results.
|
||||
Can be either `AND` or `OR`.
|
||||
It would be applied to all columns individually.
|
||||
For example, if the operator is `AND`,
|
||||
then the query "hello world" is equal to
|
||||
`match("hello AND world", column1) OR match("hello AND world", column2)`.
|
||||
"""
|
||||
|
||||
query: str
|
||||
columns: list[str]
|
||||
boosts: list[float]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query: str,
|
||||
columns: list[str],
|
||||
*,
|
||||
boosts: Optional[list[float]] = None,
|
||||
):
|
||||
"""
|
||||
Multi-match query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
|
||||
columns : list[str]
|
||||
The list of columns to match against.
|
||||
|
||||
boosts : list[float], optional
|
||||
The list of boost factors for each column. If not provided,
|
||||
all columns will have the same boost factor.
|
||||
"""
|
||||
if boosts is None:
|
||||
boosts = [1.0] * len(columns)
|
||||
super().__init__(query=query, columns=columns, boosts=boosts)
|
||||
boosts: Optional[list[float]] = pydantic.Field(None, kw_only=True)
|
||||
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MULTI_MATCH
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"multi_match": {
|
||||
"query": self.query,
|
||||
"columns": self.columns,
|
||||
"boost": self.boosts,
|
||||
}
|
||||
}
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class BooleanQuery(FullTextQuery):
|
||||
"""
|
||||
Boolean query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
queries : list[tuple(Occur, FullTextQuery)]
|
||||
The list of queries with their occurrence requirements.
|
||||
"""
|
||||
|
||||
queries: list[tuple[Occur, FullTextQuery]]
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.BOOLEAN
|
||||
|
||||
|
||||
class FullTextSearchQuery(pydantic.BaseModel):
|
||||
@@ -445,8 +442,18 @@ class Query(pydantic.BaseModel):
|
||||
# which columns to return in the results
|
||||
columns: Optional[Union[List[str], Dict[str, str]]] = None
|
||||
|
||||
# number of IVF partitions to search
|
||||
nprobes: Optional[int] = None
|
||||
# minimum number of IVF partitions to search
|
||||
#
|
||||
# If None then a default value (20) will be used.
|
||||
minimum_nprobes: Optional[int] = None
|
||||
|
||||
# maximum number of IVF partitions to search
|
||||
#
|
||||
# If None then a default value (20) will be used.
|
||||
#
|
||||
# If 0 then no limit will be applied and all partitions could be searched
|
||||
# if needed to satisfy the limit.
|
||||
maximum_nprobes: Optional[int] = None
|
||||
|
||||
# lower bound for distance search
|
||||
lower_bound: Optional[float] = None
|
||||
@@ -484,7 +491,8 @@ class Query(pydantic.BaseModel):
|
||||
query.vector_column = req.column
|
||||
query.vector = req.query_vector
|
||||
query.distance_type = req.distance_type
|
||||
query.nprobes = req.nprobes
|
||||
query.minimum_nprobes = req.minimum_nprobes
|
||||
query.maximum_nprobes = req.maximum_nprobes
|
||||
query.lower_bound = req.lower_bound
|
||||
query.upper_bound = req.upper_bound
|
||||
query.ef = req.ef
|
||||
@@ -493,10 +501,8 @@ class Query(pydantic.BaseModel):
|
||||
query.postfilter = req.postfilter
|
||||
if req.full_text_search is not None:
|
||||
query.full_text_query = FullTextSearchQuery(
|
||||
columns=req.full_text_search.columns,
|
||||
query=req.full_text_search.query,
|
||||
limit=req.full_text_search.limit,
|
||||
wand_factor=req.full_text_search.wand_factor,
|
||||
columns=None,
|
||||
query=req.full_text_search,
|
||||
)
|
||||
return query
|
||||
|
||||
@@ -1047,7 +1053,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
super().__init__(table)
|
||||
self._query = query
|
||||
self._distance_type = None
|
||||
self._nprobes = None
|
||||
self._minimum_nprobes = None
|
||||
self._maximum_nprobes = None
|
||||
self._lower_bound = None
|
||||
self._upper_bound = None
|
||||
self._refine_factor = None
|
||||
@@ -1110,6 +1117,10 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
||||
tuning advice.
|
||||
|
||||
This method sets both the minimum and maximum number of probes to the same
|
||||
value. See `minimum_nprobes` and `maximum_nprobes` for more fine-grained
|
||||
control.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nprobes: int
|
||||
@@ -1120,7 +1131,36 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
LanceVectorQueryBuilder
|
||||
The LanceQueryBuilder object.
|
||||
"""
|
||||
self._nprobes = nprobes
|
||||
self._minimum_nprobes = nprobes
|
||||
self._maximum_nprobes = nprobes
|
||||
return self
|
||||
|
||||
def minimum_nprobes(self, minimum_nprobes: int) -> LanceVectorQueryBuilder:
|
||||
"""Set the minimum number of probes to use.
|
||||
|
||||
See `nprobes` for more details.
|
||||
|
||||
These partitions will be searched on every vector query and will increase recall
|
||||
at the expense of latency.
|
||||
"""
|
||||
self._minimum_nprobes = minimum_nprobes
|
||||
return self
|
||||
|
||||
def maximum_nprobes(self, maximum_nprobes: int) -> LanceVectorQueryBuilder:
|
||||
"""Set the maximum number of probes to use.
|
||||
|
||||
See `nprobes` for more details.
|
||||
|
||||
If this value is greater than `minimum_nprobes` then the excess partitions
|
||||
will be searched only if we have not found enough results.
|
||||
|
||||
This can be useful when there is a narrow filter to allow these queries to
|
||||
spend more time searching and avoid potential false negatives.
|
||||
|
||||
If this value is 0 then no limit will be applied and all partitions could be
|
||||
searched if needed to satisfy the limit.
|
||||
"""
|
||||
self._maximum_nprobes = maximum_nprobes
|
||||
return self
|
||||
|
||||
def distance_range(
|
||||
@@ -1224,7 +1264,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
limit=self._limit,
|
||||
distance_type=self._distance_type,
|
||||
columns=self._columns,
|
||||
nprobes=self._nprobes,
|
||||
minimum_nprobes=self._minimum_nprobes,
|
||||
maximum_nprobes=self._maximum_nprobes,
|
||||
lower_bound=self._lower_bound,
|
||||
upper_bound=self._upper_bound,
|
||||
refine_factor=self._refine_factor,
|
||||
@@ -1333,6 +1374,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
if query_string is not None and not isinstance(query_string, str):
|
||||
raise ValueError("Reranking currently only supports string queries")
|
||||
self._str_query = query_string if query_string is not None else self._str_query
|
||||
if reranker.score == "all":
|
||||
self.with_row_id(True)
|
||||
return self
|
||||
|
||||
def bypass_vector_index(self) -> LanceVectorQueryBuilder:
|
||||
@@ -1410,10 +1453,13 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
|
||||
query = self._query
|
||||
if self._phrase_query:
|
||||
raise NotImplementedError(
|
||||
"Phrase query is not yet supported in Lance FTS. "
|
||||
"Use tantivy-based index instead for now."
|
||||
)
|
||||
if isinstance(query, str):
|
||||
if not query.startswith('"') or not query.endswith('"'):
|
||||
query = f'"{query}"'
|
||||
elif isinstance(query, FullTextQuery) and not isinstance(
|
||||
query, PhraseQuery
|
||||
):
|
||||
raise TypeError("Please use PhraseQuery for phrase queries.")
|
||||
query = self.to_query_object()
|
||||
results = self._table._execute_query(query, timeout=timeout)
|
||||
results = results.read_all()
|
||||
@@ -1525,6 +1571,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
The LanceQueryBuilder object.
|
||||
"""
|
||||
self._reranker = reranker
|
||||
if reranker.score == "all":
|
||||
self.with_row_id(True)
|
||||
return self
|
||||
|
||||
|
||||
@@ -1588,7 +1636,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
self._fts_columns = fts_columns
|
||||
self._norm = None
|
||||
self._reranker = None
|
||||
self._nprobes = None
|
||||
self._minimum_nprobes = None
|
||||
self._maximum_nprobes = None
|
||||
self._refine_factor = None
|
||||
self._distance_type = None
|
||||
self._phrase_query = None
|
||||
@@ -1800,6 +1849,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
|
||||
self._norm = normalize
|
||||
self._reranker = reranker
|
||||
if reranker.score == "all":
|
||||
self.with_row_id(True)
|
||||
|
||||
return self
|
||||
|
||||
@@ -1820,7 +1871,24 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
LanceHybridQueryBuilder
|
||||
The LanceHybridQueryBuilder object.
|
||||
"""
|
||||
self._nprobes = nprobes
|
||||
self._minimum_nprobes = nprobes
|
||||
self._maximum_nprobes = nprobes
|
||||
return self
|
||||
|
||||
def minimum_nprobes(self, minimum_nprobes: int) -> LanceHybridQueryBuilder:
|
||||
"""Set the minimum number of probes to use.
|
||||
|
||||
See `nprobes` for more details.
|
||||
"""
|
||||
self._minimum_nprobes = minimum_nprobes
|
||||
return self
|
||||
|
||||
def maximum_nprobes(self, maximum_nprobes: int) -> LanceHybridQueryBuilder:
|
||||
"""Set the maximum number of probes to use.
|
||||
|
||||
See `nprobes` for more details.
|
||||
"""
|
||||
self._maximum_nprobes = maximum_nprobes
|
||||
return self
|
||||
|
||||
def distance_range(
|
||||
@@ -2049,8 +2117,10 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
self._fts_query.phrase_query(True)
|
||||
if self._distance_type:
|
||||
self._vector_query.metric(self._distance_type)
|
||||
if self._nprobes:
|
||||
self._vector_query.nprobes(self._nprobes)
|
||||
if self._minimum_nprobes:
|
||||
self._vector_query.minimum_nprobes(self._minimum_nprobes)
|
||||
if self._maximum_nprobes is not None:
|
||||
self._vector_query.maximum_nprobes(self._maximum_nprobes)
|
||||
if self._refine_factor:
|
||||
self._vector_query.refine_factor(self._refine_factor)
|
||||
if self._ef:
|
||||
@@ -2513,7 +2583,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
# FullTextQuery object
|
||||
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))
|
||||
|
||||
|
||||
class AsyncFTSQuery(AsyncQueryBase):
|
||||
@@ -2661,6 +2731,34 @@ class AsyncVectorQueryBase:
|
||||
self._inner.nprobes(nprobes)
|
||||
return self
|
||||
|
||||
def minimum_nprobes(self, minimum_nprobes: int) -> Self:
|
||||
"""Set the minimum number of probes to use.
|
||||
|
||||
See `nprobes` for more details.
|
||||
|
||||
These partitions will be searched on every indexed vector query and will
|
||||
increase recall at the expense of latency.
|
||||
"""
|
||||
self._inner.minimum_nprobes(minimum_nprobes)
|
||||
return self
|
||||
|
||||
def maximum_nprobes(self, maximum_nprobes: int) -> Self:
|
||||
"""Set the maximum number of probes to use.
|
||||
|
||||
See `nprobes` for more details.
|
||||
|
||||
If this value is greater than `minimum_nprobes` then the excess partitions
|
||||
will be searched only if we have not found enough results.
|
||||
|
||||
This can be useful when there is a narrow filter to allow these queries to
|
||||
spend more time searching and avoid potential false negatives.
|
||||
|
||||
If this value is 0 then no limit will be applied and all partitions could be
|
||||
searched if needed to satisfy the limit.
|
||||
"""
|
||||
self._inner.maximum_nprobes(maximum_nprobes)
|
||||
return self
|
||||
|
||||
def distance_range(
|
||||
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
|
||||
) -> Self:
|
||||
@@ -2835,7 +2933,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
# FullTextQuery object
|
||||
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query}))
|
||||
|
||||
async def to_batches(
|
||||
self,
|
||||
@@ -2950,15 +3048,21 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||
Vector Search Plan:
|
||||
ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
|
||||
Take: columns="vector, _rowid, _distance, (text)"
|
||||
CoalesceBatchesExec: target_batch_size=1024
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||
Take: columns="vector, _rowid, _distance, (text)"
|
||||
CoalesceBatchesExec: target_batch_size=1024
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||
<BLANKLINE>
|
||||
FTS Search Plan:
|
||||
LanceScan: uri=..., projection=[vector, text], row_id=false, row_addr=false, ordered=true
|
||||
ProjectionExec: expr=[vector@2 as vector, text@3 as text, _score@1 as _score]
|
||||
Take: columns="_rowid, _score, (vector), (text)"
|
||||
CoalesceBatchesExec: target_batch_size=1024
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
MatchQuery: query=hello
|
||||
<BLANKLINE>
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
@@ -18,7 +18,7 @@ from lancedb._lancedb import (
|
||||
UpdateResult,
|
||||
)
|
||||
from lancedb.embeddings.base import EmbeddingFunctionConfig
|
||||
from lancedb.index import FTS, BTree, Bitmap, HnswPq, HnswSq, IvfFlat, IvfPq, LabelList
|
||||
from lancedb.index import FTS, BTree, Bitmap, HnswSq, IvfFlat, IvfPq, LabelList
|
||||
from lancedb.remote.db import LOOP
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -89,7 +89,7 @@ class RemoteTable(Table):
|
||||
|
||||
def to_pandas(self):
|
||||
"""to_pandas() is not yet supported on LanceDB cloud."""
|
||||
return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||
raise NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||
|
||||
def checkout(self, version: Union[int, str]):
|
||||
return LOOP.run(self._table.checkout(version))
|
||||
@@ -158,6 +158,9 @@ class RemoteTable(Table):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
):
|
||||
config = FTS(
|
||||
with_position=with_position,
|
||||
@@ -168,6 +171,9 @@ class RemoteTable(Table):
|
||||
stem=stem,
|
||||
remove_stop_words=remove_stop_words,
|
||||
ascii_folding=ascii_folding,
|
||||
ngram_min_length=ngram_min_length,
|
||||
ngram_max_length=ngram_max_length,
|
||||
prefix_only=prefix_only,
|
||||
)
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
@@ -186,6 +192,8 @@ class RemoteTable(Table):
|
||||
accelerator: Optional[str] = None,
|
||||
index_type="vector",
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
*,
|
||||
num_bits: int = 8,
|
||||
):
|
||||
"""Create an index on the table.
|
||||
Currently, the only parameters that matter are
|
||||
@@ -220,11 +228,6 @@ class RemoteTable(Table):
|
||||
>>> table.create_index("l2", "vector") # doctest: +SKIP
|
||||
"""
|
||||
|
||||
if num_partitions is not None:
|
||||
logging.warning(
|
||||
"num_partitions is not supported on LanceDB cloud."
|
||||
"This parameter will be tuned automatically."
|
||||
)
|
||||
if num_sub_vectors is not None:
|
||||
logging.warning(
|
||||
"num_sub_vectors is not supported on LanceDB cloud."
|
||||
@@ -244,13 +247,21 @@ class RemoteTable(Table):
|
||||
|
||||
index_type = index_type.upper()
|
||||
if index_type == "VECTOR" or index_type == "IVF_PQ":
|
||||
config = IvfPq(distance_type=metric)
|
||||
config = IvfPq(
|
||||
distance_type=metric,
|
||||
num_partitions=num_partitions,
|
||||
num_sub_vectors=num_sub_vectors,
|
||||
num_bits=num_bits,
|
||||
)
|
||||
elif index_type == "IVF_HNSW_PQ":
|
||||
config = HnswPq(distance_type=metric)
|
||||
raise ValueError(
|
||||
"IVF_HNSW_PQ is not supported on LanceDB cloud."
|
||||
"Please use IVF_HNSW_SQ instead."
|
||||
)
|
||||
elif index_type == "IVF_HNSW_SQ":
|
||||
config = HnswSq(distance_type=metric)
|
||||
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_FLAT":
|
||||
config = IvfFlat(distance_type=metric)
|
||||
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown vector index type: {index_type}. Valid options are"
|
||||
|
||||
@@ -74,9 +74,7 @@ class AnswerdotaiRerankers(Reranker):
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"Answerdotai Reranker does not support score='all' yet"
|
||||
)
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
)
|
||||
|
||||
@@ -232,6 +232,39 @@ class Reranker(ABC):
|
||||
|
||||
return deduped_table
|
||||
|
||||
def _merge_and_keep_scores(self, vector_results: pa.Table, fts_results: pa.Table):
|
||||
"""
|
||||
Merge the results from the vector and FTS search and keep the scores.
|
||||
This op is slower than just keeping relevance score but can be useful
|
||||
for debugging.
|
||||
"""
|
||||
# add nulls to fts results for _distance
|
||||
if "_distance" not in fts_results.column_names:
|
||||
fts_results = fts_results.append_column(
|
||||
"_distance",
|
||||
pa.array([None] * len(fts_results), type=pa.float32()),
|
||||
)
|
||||
# add nulls to vector results for _score
|
||||
if "_score" not in vector_results.column_names:
|
||||
vector_results = vector_results.append_column(
|
||||
"_score",
|
||||
pa.array([None] * len(vector_results), type=pa.float32()),
|
||||
)
|
||||
|
||||
# combine them and fill the scores
|
||||
vector_results_dict = {row["_rowid"]: row for row in vector_results.to_pylist()}
|
||||
fts_results_dict = {row["_rowid"]: row for row in fts_results.to_pylist()}
|
||||
|
||||
# merge them into vector_results
|
||||
for key, value in fts_results_dict.items():
|
||||
if key in vector_results_dict:
|
||||
vector_results_dict[key]["_score"] = value["_score"]
|
||||
else:
|
||||
vector_results_dict[key] = value
|
||||
|
||||
combined = pa.Table.from_pylist(list(vector_results_dict.values()))
|
||||
return combined
|
||||
|
||||
def _keep_relevance_score(self, combined_results: pa.Table):
|
||||
if self.score == "relevance":
|
||||
if "_score" in combined_results.column_names:
|
||||
|
||||
@@ -92,14 +92,14 @@ class CohereReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for cohere reranker"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||
|
||||
@@ -81,15 +81,15 @@ class CrossEncoderReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
# sort the results by _score
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for CrossEncoderReranker"
|
||||
)
|
||||
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
)
|
||||
|
||||
@@ -97,14 +97,14 @@ class JinaReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for JinaReranker"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||
|
||||
@@ -88,14 +88,13 @@ class OpenaiReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"OpenAI Reranker does not support score='all' yet"
|
||||
)
|
||||
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
|
||||
@@ -94,14 +94,14 @@ class VoyageAIReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for voyageai reranker"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||
|
||||
@@ -827,7 +827,7 @@ class Table(ABC):
|
||||
ordering_field_names: Optional[Union[str, List[str]]] = None,
|
||||
replace: bool = False,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
use_tantivy: bool = True,
|
||||
use_tantivy: bool = False,
|
||||
tokenizer_name: Optional[str] = None,
|
||||
with_position: bool = False,
|
||||
# tokenizer configs:
|
||||
@@ -838,6 +838,9 @@ class Table(ABC):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
):
|
||||
"""Create a full-text search index on the table.
|
||||
@@ -864,7 +867,7 @@ class Table(ABC):
|
||||
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
|
||||
language code followed by "_stem". So for english it would be "en_stem".
|
||||
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
|
||||
use_tantivy: bool, default True
|
||||
use_tantivy: bool, default False
|
||||
If True, use the legacy full-text search implementation based on tantivy.
|
||||
If False, use the new full-text search implementation based on lance-index.
|
||||
with_position: bool, default False
|
||||
@@ -877,6 +880,7 @@ class Table(ABC):
|
||||
- "simple": Splits text by whitespace and punctuation.
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-Gram tokenizer.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
max_token_length : int, default 40
|
||||
@@ -894,6 +898,12 @@ class Table(ABC):
|
||||
ascii_folding : bool, default True
|
||||
Whether to fold ASCII characters. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
ngram_min_length: int, default 3
|
||||
The minimum length of an n-gram.
|
||||
ngram_max_length: int, default 3
|
||||
The maximum length of an n-gram.
|
||||
prefix_only: bool, default False
|
||||
Whether to only index the prefix of the token for ngram tokenizer.
|
||||
wait_timeout: timedelta, optional
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
"""
|
||||
@@ -1970,7 +1980,7 @@ class LanceTable(Table):
|
||||
ordering_field_names: Optional[Union[str, List[str]]] = None,
|
||||
replace: bool = False,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
use_tantivy: bool = True,
|
||||
use_tantivy: bool = False,
|
||||
tokenizer_name: Optional[str] = None,
|
||||
with_position: bool = False,
|
||||
# tokenizer configs:
|
||||
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
):
|
||||
if not use_tantivy:
|
||||
if not isinstance(field_names, str):
|
||||
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
"ngram_min_length": ngram_min_length,
|
||||
"ngram_max_length": ngram_max_length,
|
||||
"prefix_only": prefix_only,
|
||||
}
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
elif tokenizer_name == "raw":
|
||||
return {
|
||||
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
elif tokenizer_name == "whitespace":
|
||||
return {
|
||||
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
|
||||
# or it's with language stemming with pattern like "en_stem"
|
||||
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
|
||||
"stem": True,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
|
||||
def add(
|
||||
@@ -3637,8 +3665,10 @@ class AsyncTable:
|
||||
)
|
||||
if query.distance_type is not None:
|
||||
async_query = async_query.distance_type(query.distance_type)
|
||||
if query.nprobes is not None:
|
||||
async_query = async_query.nprobes(query.nprobes)
|
||||
if query.minimum_nprobes is not None:
|
||||
async_query = async_query.minimum_nprobes(query.minimum_nprobes)
|
||||
if query.maximum_nprobes is not None:
|
||||
async_query = async_query.maximum_nprobes(query.maximum_nprobes)
|
||||
if query.refine_factor is not None:
|
||||
async_query = async_query.refine_factor(query.refine_factor)
|
||||
if query.vector_column:
|
||||
|
||||
@@ -25,4 +25,4 @@ IndexType = Literal[
|
||||
]
|
||||
|
||||
# Tokenizer literals
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace"]
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
|
||||
@@ -6,7 +6,7 @@ import lancedb
|
||||
|
||||
# --8<-- [end:import-lancedb]
|
||||
# --8<-- [start:import-numpy]
|
||||
from lancedb.query import BoostQuery, MatchQuery
|
||||
from lancedb.query import BooleanQuery, BoostQuery, MatchQuery, Occur
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -191,6 +191,15 @@ def test_fts_fuzzy_query():
|
||||
"food", # 1 insertion
|
||||
}
|
||||
|
||||
results = table.search(
|
||||
MatchQuery("foo", "text", fuzziness=1, prefix_length=3)
|
||||
).to_pandas()
|
||||
assert len(results) == 2
|
||||
assert set(results["text"].to_list()) == {
|
||||
"foo",
|
||||
"food",
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||
@@ -240,6 +249,60 @@ def test_fts_boost_query():
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||
)
|
||||
def test_fts_boolean_query(tmp_path):
|
||||
uri = tmp_path / "boolean-example"
|
||||
db = lancedb.connect(uri)
|
||||
table = db.create_table(
|
||||
"my_table_fts_boolean",
|
||||
data=[
|
||||
{"text": "The cat and dog are playing"},
|
||||
{"text": "The cat is sleeping"},
|
||||
{"text": "The dog is barking"},
|
||||
{"text": "The dog chases the cat"},
|
||||
],
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
|
||||
# SHOULD
|
||||
results = table.search(
|
||||
MatchQuery("cat", "text") | MatchQuery("dog", "text")
|
||||
).to_pandas()
|
||||
assert len(results) == 4
|
||||
assert set(results["text"].to_list()) == {
|
||||
"The cat and dog are playing",
|
||||
"The cat is sleeping",
|
||||
"The dog is barking",
|
||||
"The dog chases the cat",
|
||||
}
|
||||
# MUST
|
||||
results = table.search(
|
||||
MatchQuery("cat", "text") & MatchQuery("dog", "text")
|
||||
).to_pandas()
|
||||
assert len(results) == 2
|
||||
assert set(results["text"].to_list()) == {
|
||||
"The cat and dog are playing",
|
||||
"The dog chases the cat",
|
||||
}
|
||||
|
||||
# MUST NOT
|
||||
results = table.search(
|
||||
BooleanQuery(
|
||||
[
|
||||
(Occur.MUST, MatchQuery("cat", "text")),
|
||||
(Occur.MUST_NOT, MatchQuery("dog", "text")),
|
||||
]
|
||||
)
|
||||
).to_pandas()
|
||||
assert len(results) == 1
|
||||
assert set(results["text"].to_list()) == {
|
||||
"The cat is sleeping",
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||
)
|
||||
|
||||
@@ -215,6 +215,19 @@ def test_search_fts(table, use_tantivy):
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fts_select_async(async_table):
|
||||
@@ -656,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
|
||||
res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
|
||||
assert len(res) == 2
|
||||
|
||||
|
||||
def test_fts_ngram(mem_db: DBConnection):
|
||||
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
results = (
|
||||
table.search("nce", query_type="fts").limit(10).to_list()
|
||||
) # spellchecker:disable-line
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
# the default min_ngram_length is 3, so "la" should not match
|
||||
results = table.search("la", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 0
|
||||
|
||||
# test setting min_ngram_length and prefix_only
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
base_tokenizer="ngram",
|
||||
replace=True,
|
||||
ngram_min_length=2,
|
||||
prefix_only=True,
|
||||
)
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
results = (
|
||||
table.search("nce", query_type="fts").limit(10).to_list()
|
||||
) # spellchecker:disable-line
|
||||
assert len(results) == 0
|
||||
|
||||
results = table.search("la", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
@@ -25,6 +25,8 @@ from lancedb.query import (
|
||||
AsyncQueryBase,
|
||||
AsyncVectorQuery,
|
||||
LanceVectorQueryBuilder,
|
||||
MatchQuery,
|
||||
PhraseQuery,
|
||||
Query,
|
||||
FullTextSearchQuery,
|
||||
)
|
||||
@@ -270,7 +272,9 @@ async def test_distance_range_with_new_rows_async():
|
||||
# append more rows so that execution plan would be mixed with ANN & Flat KNN
|
||||
new_data = pa.table(
|
||||
{
|
||||
"vector": pa.FixedShapeTensorArray.from_numpy_ndarray(np.random.rand(4, 2)),
|
||||
"vector": pa.FixedShapeTensorArray.from_numpy_ndarray(
|
||||
np.random.rand(4, 2) + 1
|
||||
),
|
||||
}
|
||||
)
|
||||
await table.add(new_data)
|
||||
@@ -437,6 +441,33 @@ def test_query_builder_with_filter(table):
|
||||
assert all(np.array(rs[0]["vector"]) == [3, 4])
|
||||
|
||||
|
||||
def test_invalid_nprobes_sync(table):
|
||||
with pytest.raises(ValueError, match="minimum_nprobes must be greater than 0"):
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector").minimum_nprobes(0).to_list()
|
||||
with pytest.raises(
|
||||
ValueError, match="maximum_nprobes must be greater than minimum_nprobes"
|
||||
):
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector").maximum_nprobes(5).to_list()
|
||||
with pytest.raises(
|
||||
ValueError, match="minimum_nprobes must be less or equal to maximum_nprobes"
|
||||
):
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector").minimum_nprobes(100).to_list()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_invalid_nprobes_async(table_async: AsyncTable):
|
||||
with pytest.raises(ValueError, match="minimum_nprobes must be greater than 0"):
|
||||
await table_async.vector_search([0, 0]).minimum_nprobes(0).to_list()
|
||||
with pytest.raises(
|
||||
ValueError, match="maximum_nprobes must be greater than minimum_nprobes"
|
||||
):
|
||||
await table_async.vector_search([0, 0]).maximum_nprobes(5).to_list()
|
||||
with pytest.raises(
|
||||
ValueError, match="minimum_nprobes must be less or equal to maximum_nprobes"
|
||||
):
|
||||
await table_async.vector_search([0, 0]).minimum_nprobes(100).to_list()
|
||||
|
||||
|
||||
def test_query_builder_with_prefilter(table):
|
||||
df = (
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||
@@ -583,6 +614,21 @@ async def test_query_async(table_async: AsyncTable):
|
||||
table_async.query().nearest_to(pa.array([1, 2])).nprobes(10),
|
||||
expected_num_rows=2,
|
||||
)
|
||||
await check_query(
|
||||
table_async.query().nearest_to(pa.array([1, 2])).minimum_nprobes(10),
|
||||
expected_num_rows=2,
|
||||
)
|
||||
await check_query(
|
||||
table_async.query().nearest_to(pa.array([1, 2])).maximum_nprobes(30),
|
||||
expected_num_rows=2,
|
||||
)
|
||||
await check_query(
|
||||
table_async.query()
|
||||
.nearest_to(pa.array([1, 2]))
|
||||
.minimum_nprobes(10)
|
||||
.maximum_nprobes(20),
|
||||
expected_num_rows=2,
|
||||
)
|
||||
await check_query(
|
||||
table_async.query().nearest_to(pa.array([1, 2])).bypass_vector_index(),
|
||||
expected_num_rows=2,
|
||||
@@ -731,6 +777,82 @@ async def test_explain_plan_async(table_async: AsyncTable):
|
||||
assert "KNN" in plan
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan_fts(table_async: AsyncTable):
|
||||
"""Test explain plan for FTS queries"""
|
||||
# Create FTS index
|
||||
from lancedb.index import FTS
|
||||
|
||||
await table_async.create_index("text", config=FTS())
|
||||
|
||||
# Test pure FTS query
|
||||
query = await table_async.search("dog", query_type="fts", fts_columns="text")
|
||||
plan = await query.explain_plan()
|
||||
# Should show FTS details (issue #2465 is now fixed)
|
||||
assert "MatchQuery: query=dog" in plan
|
||||
assert "GlobalLimitExec" in plan # Default limit
|
||||
|
||||
# Test FTS query with limit
|
||||
query_with_limit = await table_async.search(
|
||||
"dog", query_type="fts", fts_columns="text"
|
||||
)
|
||||
plan_with_limit = await query_with_limit.limit(1).explain_plan()
|
||||
assert "MatchQuery: query=dog" in plan_with_limit
|
||||
assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
|
||||
|
||||
# Test FTS query with offset and limit
|
||||
query_with_offset = await table_async.search(
|
||||
"dog", query_type="fts", fts_columns="text"
|
||||
)
|
||||
plan_with_offset = await query_with_offset.offset(1).limit(1).explain_plan()
|
||||
assert "MatchQuery: query=dog" in plan_with_offset
|
||||
assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan_vector_with_limit_offset(table_async: AsyncTable):
|
||||
"""Test explain plan for vector queries with limit and offset"""
|
||||
# Test vector query with limit
|
||||
plan_with_limit = await (
|
||||
table_async.query().nearest_to(pa.array([1, 2])).limit(1).explain_plan()
|
||||
)
|
||||
assert "KNN" in plan_with_limit
|
||||
assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
|
||||
|
||||
# Test vector query with offset and limit
|
||||
plan_with_offset = await (
|
||||
table_async.query()
|
||||
.nearest_to(pa.array([1, 2]))
|
||||
.offset(1)
|
||||
.limit(1)
|
||||
.explain_plan()
|
||||
)
|
||||
assert "KNN" in plan_with_offset
|
||||
assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan_with_filters(table_async: AsyncTable):
|
||||
"""Test explain plan for queries with filters"""
|
||||
# Test vector query with filter
|
||||
plan_with_filter = await (
|
||||
table_async.query().nearest_to(pa.array([1, 2])).where("id = 1").explain_plan()
|
||||
)
|
||||
assert "KNN" in plan_with_filter
|
||||
assert "FilterExec" in plan_with_filter
|
||||
|
||||
# Test FTS query with filter
|
||||
from lancedb.index import FTS
|
||||
|
||||
await table_async.create_index("text", config=FTS())
|
||||
query_fts_filter = await table_async.search(
|
||||
"dog", query_type="fts", fts_columns="text"
|
||||
)
|
||||
plan_fts_filter = await query_fts_filter.where("id = 1").explain_plan()
|
||||
assert "MatchQuery: query=dog" in plan_fts_filter
|
||||
assert "FilterExec: id@" in plan_fts_filter # Should show filter details
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_query_camelcase_async(tmp_path):
|
||||
db = await lancedb.connect_async(tmp_path)
|
||||
@@ -909,7 +1031,39 @@ def test_query_serialization_sync(table: lancedb.table.Table):
|
||||
|
||||
q = table.search([5.0, 6.0]).nprobes(10).refine_factor(5).to_query_object()
|
||||
check_set_props(
|
||||
q, vector_column="vector", vector=[5.0, 6.0], nprobes=10, refine_factor=5
|
||||
q,
|
||||
vector_column="vector",
|
||||
vector=[5.0, 6.0],
|
||||
minimum_nprobes=10,
|
||||
maximum_nprobes=10,
|
||||
refine_factor=5,
|
||||
)
|
||||
|
||||
q = table.search([5.0, 6.0]).minimum_nprobes(10).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
vector_column="vector",
|
||||
vector=[5.0, 6.0],
|
||||
minimum_nprobes=10,
|
||||
maximum_nprobes=None,
|
||||
)
|
||||
|
||||
q = table.search([5.0, 6.0]).nprobes(50).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
vector_column="vector",
|
||||
vector=[5.0, 6.0],
|
||||
minimum_nprobes=50,
|
||||
maximum_nprobes=50,
|
||||
)
|
||||
|
||||
q = table.search([5.0, 6.0]).maximum_nprobes(10).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
vector_column="vector",
|
||||
vector=[5.0, 6.0],
|
||||
maximum_nprobes=10,
|
||||
minimum_nprobes=None,
|
||||
)
|
||||
|
||||
q = table.search([5.0, 6.0]).distance_range(0.0, 1.0).to_query_object()
|
||||
@@ -961,7 +1115,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
limit=10,
|
||||
vector=sample_vector,
|
||||
postfilter=False,
|
||||
nprobes=20,
|
||||
minimum_nprobes=20,
|
||||
maximum_nprobes=20,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
)
|
||||
@@ -971,7 +1126,20 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
q,
|
||||
vector=sample_vector,
|
||||
postfilter=False,
|
||||
nprobes=20,
|
||||
minimum_nprobes=20,
|
||||
maximum_nprobes=20,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
limit=10,
|
||||
)
|
||||
|
||||
q = (await table_async.search([5.0, 6.0])).nprobes(50).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
vector=sample_vector,
|
||||
postfilter=False,
|
||||
minimum_nprobes=50,
|
||||
maximum_nprobes=50,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
limit=10,
|
||||
@@ -990,7 +1158,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
filter="id = 1",
|
||||
postfilter=True,
|
||||
vector=sample_vector,
|
||||
nprobes=20,
|
||||
minimum_nprobes=20,
|
||||
maximum_nprobes=20,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
)
|
||||
@@ -1004,7 +1173,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
check_set_props(
|
||||
q,
|
||||
vector=sample_vector,
|
||||
nprobes=10,
|
||||
minimum_nprobes=10,
|
||||
maximum_nprobes=10,
|
||||
refine_factor=5,
|
||||
postfilter=False,
|
||||
with_row_id=False,
|
||||
@@ -1012,6 +1182,18 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
limit=10,
|
||||
)
|
||||
|
||||
q = (await table_async.search([5.0, 6.0])).minimum_nprobes(5).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
vector=sample_vector,
|
||||
minimum_nprobes=5,
|
||||
maximum_nprobes=20,
|
||||
postfilter=False,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
limit=10,
|
||||
)
|
||||
|
||||
q = (
|
||||
(await table_async.search([5.0, 6.0]))
|
||||
.distance_range(0.0, 1.0)
|
||||
@@ -1023,7 +1205,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
lower_bound=0.0,
|
||||
upper_bound=1.0,
|
||||
postfilter=False,
|
||||
nprobes=20,
|
||||
minimum_nprobes=20,
|
||||
maximum_nprobes=20,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
limit=10,
|
||||
@@ -1035,7 +1218,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
distance_type="cosine",
|
||||
vector=sample_vector,
|
||||
postfilter=False,
|
||||
nprobes=20,
|
||||
minimum_nprobes=20,
|
||||
maximum_nprobes=20,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
limit=10,
|
||||
@@ -1047,7 +1231,8 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
ef=7,
|
||||
vector=sample_vector,
|
||||
postfilter=False,
|
||||
nprobes=20,
|
||||
minimum_nprobes=20,
|
||||
maximum_nprobes=20,
|
||||
with_row_id=False,
|
||||
bypass_vector_index=False,
|
||||
limit=10,
|
||||
@@ -1059,24 +1244,34 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
bypass_vector_index=True,
|
||||
vector=sample_vector,
|
||||
postfilter=False,
|
||||
nprobes=20,
|
||||
minimum_nprobes=20,
|
||||
maximum_nprobes=20,
|
||||
with_row_id=False,
|
||||
limit=10,
|
||||
)
|
||||
|
||||
# FTS queries
|
||||
q = (await table_async.search("foo")).limit(10).to_query_object()
|
||||
match_query = MatchQuery("foo", "text")
|
||||
q = (await table_async.search(match_query)).limit(10).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
limit=10,
|
||||
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
|
||||
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
|
||||
with_row_id=False,
|
||||
)
|
||||
|
||||
q = (await table_async.search("foo", query_type="fts")).to_query_object()
|
||||
q = (await table_async.search(match_query)).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
|
||||
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
|
||||
with_row_id=False,
|
||||
)
|
||||
|
||||
phrase_query = PhraseQuery("foo", "text", slop=1)
|
||||
q = (await table_async.search(phrase_query)).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
full_text_query=FullTextSearchQuery(columns=None, query=phrase_query),
|
||||
with_row_id=False,
|
||||
)
|
||||
|
||||
|
||||
@@ -210,6 +210,25 @@ async def test_retry_error():
|
||||
assert cause.status_code == 429
|
||||
|
||||
|
||||
def test_table_unimplemented_functions():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/create/?mode=create":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(b"{}")
|
||||
else:
|
||||
request.send_response(404)
|
||||
request.end_headers()
|
||||
|
||||
with mock_lancedb_connection(handler) as db:
|
||||
table = db.create_table("test", [{"id": 1}])
|
||||
with pytest.raises(NotImplementedError):
|
||||
table.to_arrow()
|
||||
with pytest.raises(NotImplementedError):
|
||||
table.to_pandas()
|
||||
|
||||
|
||||
def test_table_add_in_threadpool():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/insert/":
|
||||
@@ -496,6 +515,8 @@ def test_query_sync_minimal():
|
||||
"ef": None,
|
||||
"vector": [1.0, 2.0, 3.0],
|
||||
"nprobes": 20,
|
||||
"minimum_nprobes": 20,
|
||||
"maximum_nprobes": 20,
|
||||
"version": None,
|
||||
}
|
||||
|
||||
@@ -536,6 +557,8 @@ def test_query_sync_maximal():
|
||||
"refine_factor": 10,
|
||||
"vector": [1.0, 2.0, 3.0],
|
||||
"nprobes": 5,
|
||||
"minimum_nprobes": 5,
|
||||
"maximum_nprobes": 5,
|
||||
"lower_bound": None,
|
||||
"upper_bound": None,
|
||||
"ef": None,
|
||||
@@ -564,6 +587,66 @@ def test_query_sync_maximal():
|
||||
)
|
||||
|
||||
|
||||
def test_query_sync_nprobes():
|
||||
def handler(body):
|
||||
assert body == {
|
||||
"distance_type": "l2",
|
||||
"k": 10,
|
||||
"prefilter": True,
|
||||
"fast_search": True,
|
||||
"vector_column": "vector2",
|
||||
"refine_factor": None,
|
||||
"lower_bound": None,
|
||||
"upper_bound": None,
|
||||
"ef": None,
|
||||
"vector": [1.0, 2.0, 3.0],
|
||||
"nprobes": 5,
|
||||
"minimum_nprobes": 5,
|
||||
"maximum_nprobes": 15,
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
||||
|
||||
with query_test_table(handler) as table:
|
||||
(
|
||||
table.search([1, 2, 3], vector_column_name="vector2", fast_search=True)
|
||||
.minimum_nprobes(5)
|
||||
.maximum_nprobes(15)
|
||||
.to_list()
|
||||
)
|
||||
|
||||
|
||||
def test_query_sync_no_max_nprobes():
|
||||
def handler(body):
|
||||
assert body == {
|
||||
"distance_type": "l2",
|
||||
"k": 10,
|
||||
"prefilter": True,
|
||||
"fast_search": True,
|
||||
"vector_column": "vector2",
|
||||
"refine_factor": None,
|
||||
"lower_bound": None,
|
||||
"upper_bound": None,
|
||||
"ef": None,
|
||||
"vector": [1.0, 2.0, 3.0],
|
||||
"nprobes": 5,
|
||||
"minimum_nprobes": 5,
|
||||
"maximum_nprobes": 0,
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
||||
|
||||
with query_test_table(handler) as table:
|
||||
(
|
||||
table.search([1, 2, 3], vector_column_name="vector2", fast_search=True)
|
||||
.minimum_nprobes(5)
|
||||
.maximum_nprobes(0)
|
||||
.to_list()
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("server_version", [Version("0.1.0"), Version("0.2.0")])
|
||||
def test_query_sync_batch_queries(server_version):
|
||||
def handler(body):
|
||||
@@ -666,6 +749,8 @@ def test_query_sync_hybrid():
|
||||
"refine_factor": None,
|
||||
"vector": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
||||
"nprobes": 20,
|
||||
"minimum_nprobes": 20,
|
||||
"maximum_nprobes": 20,
|
||||
"lower_bound": None,
|
||||
"upper_bound": None,
|
||||
"ef": None,
|
||||
|
||||
@@ -499,3 +499,19 @@ def test_empty_result_reranker():
|
||||
.rerank(reranker)
|
||||
.to_arrow()
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
|
||||
pytest.importorskip("sentence_transformers")
|
||||
reranker = CrossEncoderReranker(return_score="all")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
query = "single player experience"
|
||||
result = (
|
||||
table.search(query, query_type="hybrid", vector_column_name="vector")
|
||||
.rerank(reranker=reranker)
|
||||
.to_arrow()
|
||||
)
|
||||
assert "_relevance_score" in result.column_names
|
||||
assert "_score" in result.column_names
|
||||
assert "_distance" in result.column_names
|
||||
|
||||
@@ -245,7 +245,7 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
|
||||
NotImplementedError,
|
||||
match="Full-text search is only supported on the local filesystem",
|
||||
):
|
||||
table.create_fts_index("x")
|
||||
table.create_fts_index("x", use_tantivy=True)
|
||||
|
||||
# make sure list tables still works
|
||||
assert db.table_names() == ["test_ddb_sync"]
|
||||
|
||||
@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
.max_token_length(params.max_token_length)
|
||||
.remove_stop_words(params.remove_stop_words)
|
||||
.stem(params.stem)
|
||||
.ascii_folding(params.ascii_folding);
|
||||
.ascii_folding(params.ascii_folding)
|
||||
.ngram_min_length(params.ngram_min_length)
|
||||
.ngram_max_length(params.ngram_max_length)
|
||||
.ngram_prefix_only(params.prefix_only);
|
||||
Ok(LanceDbIndex::FTS(inner_opts))
|
||||
},
|
||||
"IvfFlat" => {
|
||||
@@ -130,6 +133,9 @@ struct FtsParams {
|
||||
stem: bool,
|
||||
remove_stop_words: bool,
|
||||
ascii_folding: bool,
|
||||
ngram_min_length: u32,
|
||||
ngram_max_length: u32,
|
||||
prefix_only: bool,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
|
||||
@@ -9,15 +9,16 @@ use arrow::array::Array;
|
||||
use arrow::array::ArrayData;
|
||||
use arrow::pyarrow::FromPyArrow;
|
||||
use arrow::pyarrow::IntoPyArrow;
|
||||
use lancedb::index::scalar::{FtsQuery, FullTextSearchQuery, MatchQuery, PhraseQuery};
|
||||
use lancedb::index::scalar::{
|
||||
BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
|
||||
Operator, PhraseQuery,
|
||||
};
|
||||
use lancedb::query::QueryExecutionOptions;
|
||||
use lancedb::query::QueryFilter;
|
||||
use lancedb::query::{
|
||||
ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
|
||||
};
|
||||
use lancedb::table::AnyQuery;
|
||||
use pyo3::exceptions::PyRuntimeError;
|
||||
use pyo3::exceptions::{PyNotImplementedError, PyValueError};
|
||||
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
|
||||
use pyo3::pymethods;
|
||||
use pyo3::types::PyList;
|
||||
@@ -27,30 +28,172 @@ use pyo3::IntoPyObject;
|
||||
use pyo3::PyAny;
|
||||
use pyo3::PyRef;
|
||||
use pyo3::PyResult;
|
||||
use pyo3::{exceptions::PyRuntimeError, FromPyObject};
|
||||
use pyo3::{
|
||||
exceptions::{PyNotImplementedError, PyValueError},
|
||||
intern,
|
||||
};
|
||||
use pyo3::{pyclass, PyErr};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::arrow::RecordBatchStream;
|
||||
use crate::error::PythonErrorExt;
|
||||
use crate::util::{parse_distance_type, parse_fts_query};
|
||||
use crate::util::parse_distance_type;
|
||||
use crate::{arrow::RecordBatchStream, util::PyLanceDB};
|
||||
use crate::{error::PythonErrorExt, index::class_name};
|
||||
|
||||
// Python representation of full text search parameters
|
||||
#[derive(Clone)]
|
||||
#[pyclass(get_all)]
|
||||
pub struct PyFullTextSearchQuery {
|
||||
pub columns: Vec<String>,
|
||||
pub query: String,
|
||||
pub limit: Option<i64>,
|
||||
pub wand_factor: Option<f32>,
|
||||
impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
|
||||
fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
|
||||
match class_name(ob)?.as_str() {
|
||||
"MatchQuery" => {
|
||||
let query = ob.getattr("query")?.extract()?;
|
||||
let column = ob.getattr("column")?.extract()?;
|
||||
let boost = ob.getattr("boost")?.extract()?;
|
||||
let fuzziness = ob.getattr("fuzziness")?.extract()?;
|
||||
let max_expansions = ob.getattr("max_expansions")?.extract()?;
|
||||
let operator = ob.getattr("operator")?.extract::<String>()?;
|
||||
let prefix_length = ob.getattr("prefix_length")?.extract()?;
|
||||
|
||||
Ok(Self(
|
||||
MatchQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_boost(boost)
|
||||
.with_fuzziness(fuzziness)
|
||||
.with_max_expansions(max_expansions)
|
||||
.with_operator(Operator::try_from(operator.as_str()).map_err(|e| {
|
||||
PyValueError::new_err(format!("Invalid operator: {}", e))
|
||||
})?)
|
||||
.with_prefix_length(prefix_length)
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
"PhraseQuery" => {
|
||||
let query = ob.getattr("query")?.extract()?;
|
||||
let column = ob.getattr("column")?.extract()?;
|
||||
let slop = ob.getattr("slop")?.extract()?;
|
||||
|
||||
Ok(Self(
|
||||
PhraseQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_slop(slop)
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
"BoostQuery" => {
|
||||
let positive: Self = ob.getattr("positive")?.extract()?;
|
||||
let negative: Self = ob.getattr("negative")?.extract()?;
|
||||
let negative_boost = ob.getattr("negative_boost")?.extract()?;
|
||||
Ok(Self(
|
||||
BoostQuery::new(positive.0, negative.0, negative_boost).into(),
|
||||
))
|
||||
}
|
||||
"MultiMatchQuery" => {
|
||||
let query = ob.getattr("query")?.extract()?;
|
||||
let columns = ob.getattr("columns")?.extract()?;
|
||||
let boosts: Option<Vec<f32>> = ob.getattr("boosts")?.extract()?;
|
||||
let operator: String = ob.getattr("operator")?.extract()?;
|
||||
|
||||
let q = MultiMatchQuery::try_new(query, columns)
|
||||
.map_err(|e| PyValueError::new_err(format!("Invalid query: {}", e)))?;
|
||||
let q = if let Some(boosts) = boosts {
|
||||
q.try_with_boosts(boosts)
|
||||
.map_err(|e| PyValueError::new_err(format!("Invalid boosts: {}", e)))?
|
||||
} else {
|
||||
q
|
||||
};
|
||||
|
||||
let op = Operator::try_from(operator.as_str())
|
||||
.map_err(|e| PyValueError::new_err(format!("Invalid operator: {}", e)))?;
|
||||
|
||||
Ok(Self(q.with_operator(op).into()))
|
||||
}
|
||||
"BooleanQuery" => {
|
||||
let queries: Vec<(String, Self)> = ob.getattr("queries")?.extract()?;
|
||||
let mut sub_queries = Vec::with_capacity(queries.len());
|
||||
for (occur, q) in queries {
|
||||
let occur = Occur::try_from(occur.as_str())
|
||||
.map_err(|e| PyValueError::new_err(e.to_string()))?;
|
||||
sub_queries.push((occur, q.0));
|
||||
}
|
||||
Ok(Self(BooleanQuery::new(sub_queries).into()))
|
||||
}
|
||||
name => Err(PyValueError::new_err(format!(
|
||||
"Unsupported FTS query type: {}",
|
||||
name
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FullTextSearchQuery> for PyFullTextSearchQuery {
|
||||
fn from(query: FullTextSearchQuery) -> Self {
|
||||
Self {
|
||||
columns: query.columns().into_iter().collect(),
|
||||
query: query.query.query().to_owned(),
|
||||
limit: query.limit,
|
||||
wand_factor: query.wand_factor,
|
||||
impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
|
||||
type Target = PyAny;
|
||||
type Output = Bound<'py, Self::Target>;
|
||||
type Error = PyErr;
|
||||
|
||||
fn into_pyobject(self, py: pyo3::Python<'py>) -> PyResult<Self::Output> {
|
||||
let namespace = py
|
||||
.import(intern!(py, "lancedb"))
|
||||
.and_then(|m| m.getattr(intern!(py, "query")))
|
||||
.expect("Failed to import namespace");
|
||||
|
||||
match self.0 {
|
||||
FtsQuery::Match(query) => {
|
||||
let kwargs = PyDict::new(py);
|
||||
kwargs.set_item("boost", query.boost)?;
|
||||
kwargs.set_item("fuzziness", query.fuzziness)?;
|
||||
kwargs.set_item("max_expansions", query.max_expansions)?;
|
||||
kwargs.set_item::<_, &str>("operator", query.operator.into())?;
|
||||
kwargs.set_item("prefix_length", query.prefix_length)?;
|
||||
namespace
|
||||
.getattr(intern!(py, "MatchQuery"))?
|
||||
.call((query.terms, query.column.unwrap()), Some(&kwargs))
|
||||
}
|
||||
FtsQuery::Phrase(query) => {
|
||||
let kwargs = PyDict::new(py);
|
||||
kwargs.set_item("slop", query.slop)?;
|
||||
namespace
|
||||
.getattr(intern!(py, "PhraseQuery"))?
|
||||
.call((query.terms, query.column.unwrap()), Some(&kwargs))
|
||||
}
|
||||
FtsQuery::Boost(query) => {
|
||||
let positive = Self(query.positive.as_ref().clone()).into_pyobject(py)?;
|
||||
let negative = Self(query.negative.as_ref().clone()).into_pyobject(py)?;
|
||||
let kwargs = PyDict::new(py);
|
||||
kwargs.set_item("negative_boost", query.negative_boost)?;
|
||||
namespace
|
||||
.getattr(intern!(py, "BoostQuery"))?
|
||||
.call((positive, negative), Some(&kwargs))
|
||||
}
|
||||
FtsQuery::MultiMatch(query) => {
|
||||
let first = &query.match_queries[0];
|
||||
let (columns, boosts): (Vec<_>, Vec<_>) = query
|
||||
.match_queries
|
||||
.iter()
|
||||
.map(|q| (q.column.as_ref().unwrap().clone(), q.boost))
|
||||
.unzip();
|
||||
let kwargs = PyDict::new(py);
|
||||
kwargs.set_item("boosts", boosts)?;
|
||||
kwargs.set_item::<_, &str>("operator", first.operator.into())?;
|
||||
namespace
|
||||
.getattr(intern!(py, "MultiMatchQuery"))?
|
||||
.call((first.terms.clone(), columns), Some(&kwargs))
|
||||
}
|
||||
FtsQuery::Boolean(query) => {
|
||||
let mut queries: Vec<(&str, Bound<'py, PyAny>)> = Vec::with_capacity(
|
||||
query.should.len() + query.must.len() + query.must_not.len(),
|
||||
);
|
||||
for q in query.should {
|
||||
queries.push((Occur::Should.into(), Self(q).into_pyobject(py)?));
|
||||
}
|
||||
for q in query.must {
|
||||
queries.push((Occur::Must.into(), Self(q).into_pyobject(py)?));
|
||||
}
|
||||
for q in query.must_not {
|
||||
queries.push((Occur::MustNot.into(), Self(q).into_pyobject(py)?));
|
||||
}
|
||||
|
||||
namespace
|
||||
.getattr(intern!(py, "BooleanQuery"))?
|
||||
.call1((queries,))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -80,13 +223,16 @@ pub struct PyQueryRequest {
|
||||
pub limit: Option<usize>,
|
||||
pub offset: Option<usize>,
|
||||
pub filter: Option<PyQueryFilter>,
|
||||
pub full_text_search: Option<PyFullTextSearchQuery>,
|
||||
pub full_text_search: Option<PyLanceDB<FtsQuery>>,
|
||||
pub select: PySelect,
|
||||
pub fast_search: Option<bool>,
|
||||
pub with_row_id: Option<bool>,
|
||||
pub column: Option<String>,
|
||||
pub query_vector: Option<PyQueryVectors>,
|
||||
pub nprobes: Option<usize>,
|
||||
pub minimum_nprobes: Option<usize>,
|
||||
// None means user did not set it and default shoud be used (currenty 20)
|
||||
// Some(0) means user set it to None and there is no limit
|
||||
pub maximum_nprobes: Option<usize>,
|
||||
pub lower_bound: Option<f32>,
|
||||
pub upper_bound: Option<f32>,
|
||||
pub ef: Option<usize>,
|
||||
@@ -106,13 +252,14 @@ impl From<AnyQuery> for PyQueryRequest {
|
||||
filter: query_request.filter.map(PyQueryFilter),
|
||||
full_text_search: query_request
|
||||
.full_text_search
|
||||
.map(PyFullTextSearchQuery::from),
|
||||
.map(|fts| PyLanceDB(fts.query)),
|
||||
select: PySelect(query_request.select),
|
||||
fast_search: Some(query_request.fast_search),
|
||||
with_row_id: Some(query_request.with_row_id),
|
||||
column: None,
|
||||
query_vector: None,
|
||||
nprobes: None,
|
||||
minimum_nprobes: None,
|
||||
maximum_nprobes: None,
|
||||
lower_bound: None,
|
||||
upper_bound: None,
|
||||
ef: None,
|
||||
@@ -132,7 +279,11 @@ impl From<AnyQuery> for PyQueryRequest {
|
||||
with_row_id: Some(vector_query.base.with_row_id),
|
||||
column: vector_query.column,
|
||||
query_vector: Some(PyQueryVectors(vector_query.query_vector)),
|
||||
nprobes: Some(vector_query.nprobes),
|
||||
minimum_nprobes: Some(vector_query.minimum_nprobes),
|
||||
maximum_nprobes: match vector_query.maximum_nprobes {
|
||||
None => Some(0),
|
||||
Some(value) => Some(value),
|
||||
},
|
||||
lower_bound: vector_query.lower_bound,
|
||||
upper_bound: vector_query.upper_bound,
|
||||
ef: vector_query.ef,
|
||||
@@ -269,8 +420,8 @@ impl Query {
|
||||
}
|
||||
};
|
||||
let mut query = FullTextSearchQuery::new_query(query);
|
||||
if let Some(cols) = columns {
|
||||
if !cols.is_empty() {
|
||||
match columns {
|
||||
Some(cols) if !cols.is_empty() => {
|
||||
query = query.with_columns(&cols).map_err(|e| {
|
||||
PyValueError::new_err(format!(
|
||||
"Failed to set full text search columns: {}",
|
||||
@@ -278,15 +429,12 @@ impl Query {
|
||||
))
|
||||
})?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
query
|
||||
} else if let Ok(query) = fts_query.downcast::<PyDict>() {
|
||||
let query = parse_fts_query(query)?;
|
||||
FullTextSearchQuery::new_query(query)
|
||||
} else {
|
||||
return Err(PyValueError::new_err(
|
||||
"query must be a string or a Query object",
|
||||
));
|
||||
let query = fts_query.extract::<PyLanceDB<FtsQuery>>()?;
|
||||
FullTextSearchQuery::new_query(query.0)
|
||||
};
|
||||
|
||||
Ok(FTSQuery {
|
||||
@@ -414,7 +562,10 @@ impl FTSQuery {
|
||||
}
|
||||
|
||||
pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
let inner = self_
|
||||
.inner
|
||||
.clone()
|
||||
.full_text_search(self_.fts_query.clone());
|
||||
future_into_py(self_.py(), async move {
|
||||
inner
|
||||
.explain_plan(verbose)
|
||||
@@ -424,7 +575,10 @@ impl FTSQuery {
|
||||
}
|
||||
|
||||
pub fn analyze_plan(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
let inner = self_
|
||||
.inner
|
||||
.clone()
|
||||
.full_text_search(self_.fts_query.clone());
|
||||
future_into_py(self_.py(), async move {
|
||||
inner
|
||||
.analyze_plan()
|
||||
@@ -509,6 +663,29 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().nprobes(nprobe as usize);
|
||||
}
|
||||
|
||||
pub fn minimum_nprobes(&mut self, minimum_nprobes: u32) -> PyResult<()> {
|
||||
self.inner = self
|
||||
.inner
|
||||
.clone()
|
||||
.minimum_nprobes(minimum_nprobes as usize)
|
||||
.infer_error()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn maximum_nprobes(&mut self, maximum_nprobes: u32) -> PyResult<()> {
|
||||
let maximum_nprobes = if maximum_nprobes == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(maximum_nprobes as usize)
|
||||
};
|
||||
self.inner = self
|
||||
.inner
|
||||
.clone()
|
||||
.maximum_nprobes(maximum_nprobes)
|
||||
.infer_error()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[pyo3(signature = (lower_bound=None, upper_bound=None))]
|
||||
pub fn distance_range(&mut self, lower_bound: Option<f32>, upper_bound: Option<f32>) {
|
||||
self.inner = self.inner.clone().distance_range(lower_bound, upper_bound);
|
||||
|
||||
@@ -3,15 +3,11 @@
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
use lancedb::index::scalar::{BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, PhraseQuery};
|
||||
use lancedb::DistanceType;
|
||||
use pyo3::prelude::{PyAnyMethods, PyDictMethods, PyListMethods};
|
||||
use pyo3::types::PyDict;
|
||||
use pyo3::{
|
||||
exceptions::{PyRuntimeError, PyValueError},
|
||||
pyfunction, PyResult,
|
||||
};
|
||||
use pyo3::{Bound, PyAny};
|
||||
|
||||
/// A wrapper around a rust builder
|
||||
///
|
||||
@@ -64,116 +60,6 @@ pub fn validate_table_name(table_name: &str) -> PyResult<()> {
|
||||
.map_err(|e| PyValueError::new_err(e.to_string()))
|
||||
}
|
||||
|
||||
pub fn parse_fts_query(query: &Bound<'_, PyDict>) -> PyResult<FtsQuery> {
|
||||
let query_type = query.keys().get_item(0)?.extract::<String>()?;
|
||||
let query_value = query
|
||||
.get_item(&query_type)?
|
||||
.ok_or(PyValueError::new_err(format!(
|
||||
"Query type {} not found",
|
||||
query_type
|
||||
)))?;
|
||||
let query_value = query_value.downcast::<PyDict>()?;
|
||||
|
||||
match query_type.as_str() {
|
||||
"match" => {
|
||||
let column = query_value.keys().get_item(0)?.extract::<String>()?;
|
||||
let params = query_value
|
||||
.get_item(&column)?
|
||||
.ok_or(PyValueError::new_err(format!(
|
||||
"column {} not found",
|
||||
column
|
||||
)))?;
|
||||
let params = params.downcast::<PyDict>()?;
|
||||
|
||||
let query = params
|
||||
.get_item("query")?
|
||||
.ok_or(PyValueError::new_err("query not found"))?
|
||||
.extract::<String>()?;
|
||||
let boost = params
|
||||
.get_item("boost")?
|
||||
.ok_or(PyValueError::new_err("boost not found"))?
|
||||
.extract::<f32>()?;
|
||||
let fuzziness = params
|
||||
.get_item("fuzziness")?
|
||||
.ok_or(PyValueError::new_err("fuzziness not found"))?
|
||||
.extract::<Option<u32>>()?;
|
||||
let max_expansions = params
|
||||
.get_item("max_expansions")?
|
||||
.ok_or(PyValueError::new_err("max_expansions not found"))?
|
||||
.extract::<usize>()?;
|
||||
|
||||
let query = MatchQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_boost(boost)
|
||||
.with_fuzziness(fuzziness)
|
||||
.with_max_expansions(max_expansions);
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"match_phrase" => {
|
||||
let column = query_value.keys().get_item(0)?.extract::<String>()?;
|
||||
let query = query_value
|
||||
.get_item(&column)?
|
||||
.ok_or(PyValueError::new_err(format!(
|
||||
"column {} not found",
|
||||
column
|
||||
)))?
|
||||
.extract::<String>()?;
|
||||
|
||||
let query = PhraseQuery::new(query).with_column(Some(column));
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"boost" => {
|
||||
let positive: Bound<'_, PyAny> = query_value
|
||||
.get_item("positive")?
|
||||
.ok_or(PyValueError::new_err("positive not found"))?;
|
||||
let positive = positive.downcast::<PyDict>()?;
|
||||
|
||||
let negative = query_value
|
||||
.get_item("negative")?
|
||||
.ok_or(PyValueError::new_err("negative not found"))?;
|
||||
let negative = negative.downcast::<PyDict>()?;
|
||||
|
||||
let negative_boost = query_value
|
||||
.get_item("negative_boost")?
|
||||
.ok_or(PyValueError::new_err("negative_boost not found"))?
|
||||
.extract::<f32>()?;
|
||||
|
||||
let positive_query = parse_fts_query(positive)?;
|
||||
let negative_query = parse_fts_query(negative)?;
|
||||
let query = BoostQuery::new(positive_query, negative_query, Some(negative_boost));
|
||||
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"multi_match" => {
|
||||
let query = query_value
|
||||
.get_item("query")?
|
||||
.ok_or(PyValueError::new_err("query not found"))?
|
||||
.extract::<String>()?;
|
||||
|
||||
let columns = query_value
|
||||
.get_item("columns")?
|
||||
.ok_or(PyValueError::new_err("columns not found"))?
|
||||
.extract::<Vec<String>>()?;
|
||||
|
||||
let boost = query_value
|
||||
.get_item("boost")?
|
||||
.ok_or(PyValueError::new_err("boost not found"))?
|
||||
.extract::<Vec<f32>>()?;
|
||||
|
||||
let query = MultiMatchQuery::try_new(query, columns)
|
||||
.and_then(|q| q.try_with_boosts(boost))
|
||||
.map_err(|e| {
|
||||
PyValueError::new_err(format!("Error creating MultiMatchQuery: {}", e))
|
||||
})?;
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
_ => Err(PyValueError::new_err(format!(
|
||||
"Unsupported query type: {}",
|
||||
query_type
|
||||
))),
|
||||
}
|
||||
}
|
||||
/// A wrapper around a LanceDB type to allow it to be used in Python
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PyLanceDB<T>(pub T);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-node"
|
||||
version = "0.20.0-beta.2"
|
||||
version = "0.21.2-beta.0"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.20.0-beta.2"
|
||||
version = "0.21.2-beta.0"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
@@ -105,7 +105,7 @@ impl ListingCatalog {
|
||||
}
|
||||
|
||||
async fn open_path(path: &str) -> Result<Self> {
|
||||
let (object_store, base_path) = ObjectStore::from_uri(path).await.unwrap();
|
||||
let (object_store, base_path) = ObjectStore::from_uri(path).await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(path).context(CreateDirSnafu { path })?;
|
||||
}
|
||||
@@ -216,6 +216,7 @@ impl Catalog for ListingCatalog {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: Default::default(),
|
||||
session: None,
|
||||
};
|
||||
|
||||
// Add the db options to the connect request
|
||||
@@ -243,6 +244,7 @@ impl Catalog for ListingCatalog {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: Default::default(),
|
||||
session: None,
|
||||
};
|
||||
|
||||
// Add the db options to the connect request
|
||||
@@ -312,6 +314,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let catalog = ListingCatalog::connect(&request).await.unwrap();
|
||||
@@ -573,6 +576,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let catalog = ListingCatalog::connect(&request).await.unwrap();
|
||||
@@ -592,6 +596,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let catalog = ListingCatalog::connect(&request).await.unwrap();
|
||||
@@ -608,6 +613,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let result = ListingCatalog::connect(&request).await;
|
||||
|
||||
@@ -627,6 +627,12 @@ pub struct ConnectRequest {
|
||||
/// consistency only applies to read operations. Write operations are
|
||||
/// always consistent.
|
||||
pub read_consistency_interval: Option<std::time::Duration>,
|
||||
|
||||
/// Optional session for object stores and caching
|
||||
///
|
||||
/// If provided, this session will be used instead of creating a default one.
|
||||
/// This allows for custom configuration of object store registries, caching, etc.
|
||||
pub session: Option<Arc<lance::session::Session>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -645,6 +651,7 @@ impl ConnectBuilder {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: HashMap::new(),
|
||||
session: None,
|
||||
},
|
||||
embedding_registry: None,
|
||||
}
|
||||
@@ -802,6 +809,20 @@ impl ConnectBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Set a custom session for object stores and caching.
|
||||
///
|
||||
/// By default, a new session with default configuration will be created.
|
||||
/// This method allows you to provide a custom session with your own
|
||||
/// configuration for object store registries, caching, etc.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `session` - A custom session to use for this connection
|
||||
pub fn session(mut self, session: Arc<lance::session::Session>) -> Self {
|
||||
self.request.session = Some(session);
|
||||
self
|
||||
}
|
||||
|
||||
#[cfg(feature = "remote")]
|
||||
fn execute_remote(self) -> Result<Connection> {
|
||||
use crate::remote::db::RemoteDatabaseOptions;
|
||||
@@ -884,6 +905,7 @@ impl CatalogConnectBuilder {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: HashMap::new(),
|
||||
session: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::path::Path;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use lance::dataset::{ReadParams, WriteMode};
|
||||
use lance::io::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore};
|
||||
use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore};
|
||||
use lance_datafusion::utils::StreamingWriteSource;
|
||||
use lance_encoding::version::LanceFileVersion;
|
||||
use lance_table::io::commit::commit_handler_from_url;
|
||||
@@ -217,6 +217,9 @@ pub struct ListingDatabase {
|
||||
|
||||
// Options for tables created by this connection
|
||||
new_table_config: NewTableConfig,
|
||||
|
||||
// Session for object stores and caching
|
||||
session: Arc<lance::session::Session>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ListingDatabase {
|
||||
@@ -262,6 +265,7 @@ impl ListingDatabase {
|
||||
uri,
|
||||
request.read_consistency_interval,
|
||||
options.new_table_config,
|
||||
request.session.clone(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -313,13 +317,20 @@ impl ListingDatabase {
|
||||
|
||||
let plain_uri = url.to_string();
|
||||
|
||||
let registry = Arc::new(ObjectStoreRegistry::default());
|
||||
let session = request
|
||||
.session
|
||||
.clone()
|
||||
.unwrap_or_else(|| Arc::new(lance::session::Session::default()));
|
||||
let os_params = ObjectStoreParams {
|
||||
storage_options: Some(options.storage_options.clone()),
|
||||
..Default::default()
|
||||
};
|
||||
let (object_store, base_path) =
|
||||
ObjectStore::from_uri_and_params(registry, &plain_uri, &os_params).await?;
|
||||
let (object_store, base_path) = ObjectStore::from_uri_and_params(
|
||||
session.store_registry(),
|
||||
&plain_uri,
|
||||
&os_params,
|
||||
)
|
||||
.await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
|
||||
}
|
||||
@@ -342,6 +353,7 @@ impl ListingDatabase {
|
||||
read_consistency_interval: request.read_consistency_interval,
|
||||
storage_options: options.storage_options,
|
||||
new_table_config: options.new_table_config,
|
||||
session,
|
||||
})
|
||||
}
|
||||
Err(_) => {
|
||||
@@ -349,6 +361,7 @@ impl ListingDatabase {
|
||||
uri,
|
||||
request.read_consistency_interval,
|
||||
options.new_table_config,
|
||||
request.session.clone(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -359,8 +372,15 @@ impl ListingDatabase {
|
||||
path: &str,
|
||||
read_consistency_interval: Option<std::time::Duration>,
|
||||
new_table_config: NewTableConfig,
|
||||
session: Option<Arc<lance::session::Session>>,
|
||||
) -> Result<Self> {
|
||||
let (object_store, base_path) = ObjectStore::from_uri(path).await?;
|
||||
let session = session.unwrap_or_else(|| Arc::new(lance::session::Session::default()));
|
||||
let (object_store, base_path) = ObjectStore::from_uri_and_params(
|
||||
session.store_registry(),
|
||||
path,
|
||||
&ObjectStoreParams::default(),
|
||||
)
|
||||
.await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(path).context(CreateDirSnafu { path })?;
|
||||
}
|
||||
@@ -374,6 +394,7 @@ impl ListingDatabase {
|
||||
read_consistency_interval,
|
||||
storage_options: HashMap::new(),
|
||||
new_table_config,
|
||||
session,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -441,6 +462,128 @@ impl ListingDatabase {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Inherit storage options from the connection into the target map
|
||||
fn inherit_storage_options(&self, target: &mut HashMap<String, String>) {
|
||||
for (key, value) in self.storage_options.iter() {
|
||||
if !target.contains_key(key) {
|
||||
target.insert(key.clone(), value.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract storage option overrides from the request
|
||||
fn extract_storage_overrides(
|
||||
&self,
|
||||
request: &CreateTableRequest,
|
||||
) -> Result<(Option<LanceFileVersion>, Option<bool>)> {
|
||||
let storage_options = request
|
||||
.write_options
|
||||
.lance_write_params
|
||||
.as_ref()
|
||||
.and_then(|p| p.store_params.as_ref())
|
||||
.and_then(|sp| sp.storage_options.as_ref());
|
||||
|
||||
let storage_version_override = storage_options
|
||||
.and_then(|opts| opts.get(OPT_NEW_TABLE_STORAGE_VERSION))
|
||||
.map(|s| s.parse::<LanceFileVersion>())
|
||||
.transpose()?;
|
||||
|
||||
let v2_manifest_override = storage_options
|
||||
.and_then(|opts| opts.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS))
|
||||
.map(|s| s.parse::<bool>())
|
||||
.transpose()
|
||||
.map_err(|_| Error::InvalidInput {
|
||||
message: "enable_v2_manifest_paths must be a boolean".to_string(),
|
||||
})?;
|
||||
|
||||
Ok((storage_version_override, v2_manifest_override))
|
||||
}
|
||||
|
||||
/// Prepare write parameters for table creation
|
||||
fn prepare_write_params(
|
||||
&self,
|
||||
request: &CreateTableRequest,
|
||||
storage_version_override: Option<LanceFileVersion>,
|
||||
v2_manifest_override: Option<bool>,
|
||||
) -> lance::dataset::WriteParams {
|
||||
let mut write_params = request
|
||||
.write_options
|
||||
.lance_write_params
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
|
||||
// Only modify the storage options if we actually have something to
|
||||
// inherit. There is a difference between storage_options=None and
|
||||
// storage_options=Some({}). Using storage_options=None will cause the
|
||||
// connection's session store registry to be used. Supplying Some({})
|
||||
// will cause a new connection to be created, and that connection will
|
||||
// be dropped from the cache when python GCs the table object, which
|
||||
// confounds reuse across tables.
|
||||
if !self.storage_options.is_empty() {
|
||||
let storage_options = write_params
|
||||
.store_params
|
||||
.get_or_insert_with(Default::default)
|
||||
.storage_options
|
||||
.get_or_insert_with(Default::default);
|
||||
self.inherit_storage_options(storage_options);
|
||||
}
|
||||
|
||||
write_params.data_storage_version = self
|
||||
.new_table_config
|
||||
.data_storage_version
|
||||
.or(storage_version_override);
|
||||
|
||||
if let Some(enable_v2_manifest_paths) = self
|
||||
.new_table_config
|
||||
.enable_v2_manifest_paths
|
||||
.or(v2_manifest_override)
|
||||
{
|
||||
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
|
||||
}
|
||||
|
||||
if matches!(&request.mode, CreateTableMode::Overwrite) {
|
||||
write_params.mode = WriteMode::Overwrite;
|
||||
}
|
||||
|
||||
write_params.session = Some(self.session.clone());
|
||||
|
||||
write_params
|
||||
}
|
||||
|
||||
/// Handle the case where table already exists based on the create mode
|
||||
async fn handle_table_exists(
|
||||
&self,
|
||||
table_name: &str,
|
||||
mode: CreateTableMode,
|
||||
data_schema: &arrow_schema::Schema,
|
||||
) -> Result<Arc<dyn BaseTable>> {
|
||||
match mode {
|
||||
CreateTableMode::Create => Err(Error::TableAlreadyExists {
|
||||
name: table_name.to_string(),
|
||||
}),
|
||||
CreateTableMode::ExistOk(callback) => {
|
||||
let req = OpenTableRequest {
|
||||
name: table_name.to_string(),
|
||||
index_cache_size: None,
|
||||
lance_read_params: None,
|
||||
};
|
||||
let req = (callback)(req);
|
||||
let table = self.open_table(req).await?;
|
||||
|
||||
let table_schema = table.schema().await?;
|
||||
|
||||
if table_schema.as_ref() != data_schema {
|
||||
return Err(Error::Schema {
|
||||
message: "Provided schema does not match existing table schema".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(table)
|
||||
}
|
||||
CreateTableMode::Overwrite => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -475,50 +618,14 @@ impl Database for ListingDatabase {
|
||||
Ok(f)
|
||||
}
|
||||
|
||||
async fn create_table(&self, mut request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
let table_uri = self.table_uri(&request.name)?;
|
||||
// Inherit storage options from the connection
|
||||
let storage_options = request
|
||||
.write_options
|
||||
.lance_write_params
|
||||
.get_or_insert_with(Default::default)
|
||||
.store_params
|
||||
.get_or_insert_with(Default::default)
|
||||
.storage_options
|
||||
.get_or_insert_with(Default::default);
|
||||
for (key, value) in self.storage_options.iter() {
|
||||
if !storage_options.contains_key(key) {
|
||||
storage_options.insert(key.clone(), value.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let storage_options = storage_options.clone();
|
||||
let (storage_version_override, v2_manifest_override) =
|
||||
self.extract_storage_overrides(&request)?;
|
||||
|
||||
let mut write_params = request.write_options.lance_write_params.unwrap_or_default();
|
||||
|
||||
if let Some(storage_version) = &self.new_table_config.data_storage_version {
|
||||
write_params.data_storage_version = Some(*storage_version);
|
||||
} else {
|
||||
// Allow the user to override the storage version via storage options (backwards compatibility)
|
||||
if let Some(data_storage_version) = storage_options.get(OPT_NEW_TABLE_STORAGE_VERSION) {
|
||||
write_params.data_storage_version = Some(data_storage_version.parse()?);
|
||||
}
|
||||
}
|
||||
if let Some(enable_v2_manifest_paths) = self.new_table_config.enable_v2_manifest_paths {
|
||||
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
|
||||
} else {
|
||||
// Allow the user to override the storage version via storage options (backwards compatibility)
|
||||
if let Some(enable_v2_manifest_paths) = storage_options
|
||||
.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS)
|
||||
.map(|s| s.parse::<bool>().unwrap())
|
||||
{
|
||||
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
|
||||
}
|
||||
}
|
||||
|
||||
if matches!(&request.mode, CreateTableMode::Overwrite) {
|
||||
write_params.mode = WriteMode::Overwrite;
|
||||
}
|
||||
let write_params =
|
||||
self.prepare_write_params(&request, storage_version_override, v2_manifest_override);
|
||||
|
||||
let data_schema = request.data.arrow_schema();
|
||||
|
||||
@@ -533,30 +640,10 @@ impl Database for ListingDatabase {
|
||||
.await
|
||||
{
|
||||
Ok(table) => Ok(Arc::new(table)),
|
||||
Err(Error::TableAlreadyExists { name }) => match request.mode {
|
||||
CreateTableMode::Create => Err(Error::TableAlreadyExists { name }),
|
||||
CreateTableMode::ExistOk(callback) => {
|
||||
let req = OpenTableRequest {
|
||||
name: request.name.clone(),
|
||||
index_cache_size: None,
|
||||
lance_read_params: None,
|
||||
};
|
||||
let req = (callback)(req);
|
||||
let table = self.open_table(req).await?;
|
||||
|
||||
let table_schema = table.schema().await?;
|
||||
|
||||
if table_schema != data_schema {
|
||||
return Err(Error::Schema {
|
||||
message: "Provided schema does not match existing table schema"
|
||||
.to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(table)
|
||||
}
|
||||
CreateTableMode::Overwrite => unreachable!(),
|
||||
},
|
||||
Err(Error::TableAlreadyExists { .. }) => {
|
||||
self.handle_table_exists(&request.name, request.mode, &data_schema)
|
||||
.await
|
||||
}
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
}
|
||||
@@ -564,18 +651,22 @@ impl Database for ListingDatabase {
|
||||
async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
let table_uri = self.table_uri(&request.name)?;
|
||||
|
||||
// Inherit storage options from the connection
|
||||
let storage_options = request
|
||||
.lance_read_params
|
||||
.get_or_insert_with(Default::default)
|
||||
.store_options
|
||||
.get_or_insert_with(Default::default)
|
||||
.storage_options
|
||||
.get_or_insert_with(Default::default);
|
||||
for (key, value) in self.storage_options.iter() {
|
||||
if !storage_options.contains_key(key) {
|
||||
storage_options.insert(key.clone(), value.clone());
|
||||
}
|
||||
// Only modify the storage options if we actually have something to
|
||||
// inherit. There is a difference between storage_options=None and
|
||||
// storage_options=Some({}). Using storage_options=None will cause the
|
||||
// connection's session store registry to be used. Supplying Some({})
|
||||
// will cause a new connection to be created, and that connection will
|
||||
// be dropped from the cache when python GCs the table object, which
|
||||
// confounds reuse across tables.
|
||||
if !self.storage_options.is_empty() {
|
||||
let storage_options = request
|
||||
.lance_read_params
|
||||
.get_or_insert_with(Default::default)
|
||||
.store_options
|
||||
.get_or_insert_with(Default::default)
|
||||
.storage_options
|
||||
.get_or_insert_with(Default::default);
|
||||
self.inherit_storage_options(storage_options);
|
||||
}
|
||||
|
||||
// Some ReadParams are exposed in the OpenTableBuilder, but we also
|
||||
@@ -584,13 +675,14 @@ impl Database for ListingDatabase {
|
||||
// If we have a user provided ReadParams use that
|
||||
// If we don't then start with the default ReadParams and customize it with
|
||||
// the options from the OpenTableBuilder
|
||||
let read_params = request.lance_read_params.unwrap_or_else(|| {
|
||||
let mut read_params = request.lance_read_params.unwrap_or_else(|| {
|
||||
let mut default_params = ReadParams::default();
|
||||
if let Some(index_cache_size) = request.index_cache_size {
|
||||
default_params.index_cache_size = index_cache_size as usize;
|
||||
}
|
||||
default_params
|
||||
});
|
||||
read_params.session(self.session.clone());
|
||||
|
||||
let native_table = Arc::new(
|
||||
NativeTable::open_with_params(
|
||||
|
||||
@@ -107,7 +107,7 @@ impl ObjectStore for MirroringObjectStore {
|
||||
self.primary.delete(location).await
|
||||
}
|
||||
|
||||
fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result<ObjectMeta>> {
|
||||
fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
|
||||
self.primary.list(prefix)
|
||||
}
|
||||
|
||||
|
||||
@@ -119,7 +119,7 @@ impl ObjectStore for IoTrackingStore {
|
||||
let result = self.target.get(location).await;
|
||||
if let Ok(result) = &result {
|
||||
let num_bytes = result.range.end - result.range.start;
|
||||
self.record_read(num_bytes as u64);
|
||||
self.record_read(num_bytes);
|
||||
}
|
||||
result
|
||||
}
|
||||
@@ -128,12 +128,12 @@ impl ObjectStore for IoTrackingStore {
|
||||
let result = self.target.get_opts(location, options).await;
|
||||
if let Ok(result) = &result {
|
||||
let num_bytes = result.range.end - result.range.start;
|
||||
self.record_read(num_bytes as u64);
|
||||
self.record_read(num_bytes);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn get_range(&self, location: &Path, range: std::ops::Range<usize>) -> OSResult<Bytes> {
|
||||
async fn get_range(&self, location: &Path, range: std::ops::Range<u64>) -> OSResult<Bytes> {
|
||||
let result = self.target.get_range(location, range).await;
|
||||
if let Ok(result) = &result {
|
||||
self.record_read(result.len() as u64);
|
||||
@@ -144,7 +144,7 @@ impl ObjectStore for IoTrackingStore {
|
||||
async fn get_ranges(
|
||||
&self,
|
||||
location: &Path,
|
||||
ranges: &[std::ops::Range<usize>],
|
||||
ranges: &[std::ops::Range<u64>],
|
||||
) -> OSResult<Vec<Bytes>> {
|
||||
let result = self.target.get_ranges(location, ranges).await;
|
||||
if let Ok(result) = &result {
|
||||
@@ -170,7 +170,7 @@ impl ObjectStore for IoTrackingStore {
|
||||
self.target.delete_stream(locations)
|
||||
}
|
||||
|
||||
fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, OSResult<ObjectMeta>> {
|
||||
fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, OSResult<ObjectMeta>> {
|
||||
self.record_read(0);
|
||||
self.target.list(prefix)
|
||||
}
|
||||
@@ -179,7 +179,7 @@ impl ObjectStore for IoTrackingStore {
|
||||
&self,
|
||||
prefix: Option<&Path>,
|
||||
offset: &Path,
|
||||
) -> BoxStream<'_, OSResult<ObjectMeta>> {
|
||||
) -> BoxStream<'static, OSResult<ObjectMeta>> {
|
||||
self.record_read(0);
|
||||
self.target.list_with_offset(prefix, offset)
|
||||
}
|
||||
|
||||
@@ -796,8 +796,10 @@ pub struct VectorQueryRequest {
|
||||
pub column: Option<String>,
|
||||
/// The vector(s) to search for
|
||||
pub query_vector: Vec<Arc<dyn Array>>,
|
||||
/// The number of partitions to search
|
||||
pub nprobes: usize,
|
||||
/// The minimum number of partitions to search
|
||||
pub minimum_nprobes: usize,
|
||||
/// The maximum number of partitions to search
|
||||
pub maximum_nprobes: Option<usize>,
|
||||
/// The lower bound (inclusive) of the distance to search for.
|
||||
pub lower_bound: Option<f32>,
|
||||
/// The upper bound (exclusive) of the distance to search for.
|
||||
@@ -819,7 +821,8 @@ impl Default for VectorQueryRequest {
|
||||
base: QueryRequest::default(),
|
||||
column: None,
|
||||
query_vector: Vec::new(),
|
||||
nprobes: 20,
|
||||
minimum_nprobes: 20,
|
||||
maximum_nprobes: Some(20),
|
||||
lower_bound: None,
|
||||
upper_bound: None,
|
||||
ef: None,
|
||||
@@ -925,11 +928,75 @@ impl VectorQuery {
|
||||
/// For best results we recommend tuning this parameter with a benchmark against
|
||||
/// your actual data to find the smallest possible value that will still give
|
||||
/// you the desired recall.
|
||||
///
|
||||
/// This method sets both the minimum and maximum number of partitions to search.
|
||||
/// For more fine-grained control see [`VectorQuery::minimum_nprobes`] and
|
||||
/// [`VectorQuery::maximum_nprobes`].
|
||||
pub fn nprobes(mut self, nprobes: usize) -> Self {
|
||||
self.request.nprobes = nprobes;
|
||||
self.request.minimum_nprobes = nprobes;
|
||||
self.request.maximum_nprobes = Some(nprobes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the minimum number of partitions to search
|
||||
///
|
||||
/// This argument is only used when the vector column has an IVF PQ index.
|
||||
/// If there is no index then this value is ignored.
|
||||
///
|
||||
/// See [`VectorQuery::nprobes`] for more details.
|
||||
///
|
||||
/// These partitions will be searched on every indexed vector query.
|
||||
///
|
||||
/// Will return an error if the value is not greater than 0 or if maximum_nprobes
|
||||
/// has been set and is less than the minimum_nprobes.
|
||||
pub fn minimum_nprobes(mut self, minimum_nprobes: usize) -> Result<Self> {
|
||||
if minimum_nprobes == 0 {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "minimum_nprobes must be greater than 0".to_string(),
|
||||
});
|
||||
}
|
||||
if let Some(maximum_nprobes) = self.request.maximum_nprobes {
|
||||
if minimum_nprobes > maximum_nprobes {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "minimum_nprobes must be less or equal to maximum_nprobes".to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
self.request.minimum_nprobes = minimum_nprobes;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Set the maximum number of partitions to search
|
||||
///
|
||||
/// This argument is only used when the vector column has an IVF PQ index.
|
||||
/// If there is no index then this value is ignored.
|
||||
///
|
||||
/// See [`VectorQuery::nprobes`] for more details.
|
||||
///
|
||||
/// If this value is greater than minimum_nprobes then the excess partitions will
|
||||
/// only be searched if the initial search does not return enough results.
|
||||
///
|
||||
/// This can be useful when there is a narrow filter to allow these queries to
|
||||
/// spend more time searching and avoid potential false negatives.
|
||||
///
|
||||
/// Set to None to search all partitions, if needed, to satsify the limit
|
||||
pub fn maximum_nprobes(mut self, maximum_nprobes: Option<usize>) -> Result<Self> {
|
||||
if let Some(maximum_nprobes) = maximum_nprobes {
|
||||
if maximum_nprobes == 0 {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "maximum_nprobes must be greater than 0".to_string(),
|
||||
});
|
||||
}
|
||||
if maximum_nprobes < self.request.minimum_nprobes {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "maximum_nprobes must be greater than minimum_nprobes".to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
self.request.maximum_nprobes = maximum_nprobes;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Set the distance range for vector search,
|
||||
/// only rows with distances in the range [lower_bound, upper_bound) will be returned
|
||||
pub fn distance_range(mut self, lower_bound: Option<f32>, upper_bound: Option<f32>) -> Self {
|
||||
@@ -1208,7 +1275,8 @@ mod tests {
|
||||
);
|
||||
assert_eq!(query.request.base.limit.unwrap(), 100);
|
||||
assert_eq!(query.request.base.offset.unwrap(), 1);
|
||||
assert_eq!(query.request.nprobes, 1000);
|
||||
assert_eq!(query.request.minimum_nprobes, 1000);
|
||||
assert_eq!(query.request.maximum_nprobes, Some(1000));
|
||||
assert!(query.request.use_index);
|
||||
assert_eq!(query.request.distance_type, Some(DistanceType::Cosine));
|
||||
assert_eq!(query.request.refine_factor, Some(999));
|
||||
|
||||
@@ -32,6 +32,7 @@ use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
|
||||
use lance_datafusion::exec::{execute_plan, OneShotExec};
|
||||
use reqwest::{RequestBuilder, Response};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Number;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Cursor;
|
||||
use std::pin::Pin;
|
||||
@@ -56,6 +57,8 @@ use crate::{
|
||||
};
|
||||
|
||||
const REQUEST_TIMEOUT_HEADER: HeaderName = HeaderName::from_static("x-request-timeout-ms");
|
||||
const METRIC_TYPE_KEY: &str = "metric_type";
|
||||
const INDEX_TYPE_KEY: &str = "index_type";
|
||||
|
||||
pub struct RemoteTags<'a, S: HttpSend = Sender> {
|
||||
inner: &'a RemoteTable<S>,
|
||||
@@ -438,7 +441,18 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
|
||||
// Apply general parameters, before we dispatch based on number of query vectors.
|
||||
body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
|
||||
body["nprobes"] = query.nprobes.into();
|
||||
// In 0.23.1 we migrated from `nprobes` to `minimum_nprobes` and `maximum_nprobes`.
|
||||
// Old client / new server: since minimum_nprobes is missing, fallback to nprobes
|
||||
// New client / old server: old server will only see nprobes, make sure to set both
|
||||
// nprobes and minimum_nprobes
|
||||
// New client / new server: since minimum_nprobes is present, server can ignore nprobes
|
||||
body["nprobes"] = query.minimum_nprobes.into();
|
||||
body["minimum_nprobes"] = query.minimum_nprobes.into();
|
||||
if let Some(maximum_nprobes) = query.maximum_nprobes {
|
||||
body["maximum_nprobes"] = maximum_nprobes.into();
|
||||
} else {
|
||||
body["maximum_nprobes"] = serde_json::Value::Number(Number::from_u128(0).unwrap())
|
||||
}
|
||||
body["lower_bound"] = query.lower_bound.into();
|
||||
body["upper_bound"] = query.upper_bound.into();
|
||||
body["ef"] = query.ef.into();
|
||||
@@ -985,23 +999,53 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
"column": column
|
||||
});
|
||||
|
||||
let (index_type, distance_type) = match index.index {
|
||||
match index.index {
|
||||
// TODO: Should we pass the actual index parameters? SaaS does not
|
||||
// yet support them.
|
||||
Index::IvfFlat(index) => ("IVF_FLAT", Some(index.distance_type)),
|
||||
Index::IvfPq(index) => ("IVF_PQ", Some(index.distance_type)),
|
||||
Index::IvfHnswSq(index) => ("IVF_HNSW_SQ", Some(index.distance_type)),
|
||||
Index::BTree(_) => ("BTREE", None),
|
||||
Index::Bitmap(_) => ("BITMAP", None),
|
||||
Index::LabelList(_) => ("LABEL_LIST", None),
|
||||
Index::IvfFlat(index) => {
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_FLAT".to_string());
|
||||
body[METRIC_TYPE_KEY] =
|
||||
serde_json::Value::String(index.distance_type.to_string().to_lowercase());
|
||||
if let Some(num_partitions) = index.num_partitions {
|
||||
body["num_partitions"] = serde_json::Value::Number(num_partitions.into());
|
||||
}
|
||||
}
|
||||
Index::IvfPq(index) => {
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_PQ".to_string());
|
||||
body[METRIC_TYPE_KEY] =
|
||||
serde_json::Value::String(index.distance_type.to_string().to_lowercase());
|
||||
if let Some(num_partitions) = index.num_partitions {
|
||||
body["num_partitions"] = serde_json::Value::Number(num_partitions.into());
|
||||
}
|
||||
if let Some(num_bits) = index.num_bits {
|
||||
body["num_bits"] = serde_json::Value::Number(num_bits.into());
|
||||
}
|
||||
}
|
||||
Index::IvfHnswSq(index) => {
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_HNSW_SQ".to_string());
|
||||
body[METRIC_TYPE_KEY] =
|
||||
serde_json::Value::String(index.distance_type.to_string().to_lowercase());
|
||||
if let Some(num_partitions) = index.num_partitions {
|
||||
body["num_partitions"] = serde_json::Value::Number(num_partitions.into());
|
||||
}
|
||||
}
|
||||
Index::BTree(_) => {
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("BTREE".to_string());
|
||||
}
|
||||
Index::Bitmap(_) => {
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("BITMAP".to_string());
|
||||
}
|
||||
Index::LabelList(_) => {
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("LABEL_LIST".to_string());
|
||||
}
|
||||
Index::FTS(fts) => {
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("FTS".to_string());
|
||||
let params = serde_json::to_value(&fts).map_err(|e| Error::InvalidInput {
|
||||
message: format!("failed to serialize FTS index params {:?}", e),
|
||||
})?;
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
body[key] = value.clone();
|
||||
}
|
||||
("FTS", None)
|
||||
}
|
||||
Index::Auto => {
|
||||
let schema = self.schema().await?;
|
||||
@@ -1011,9 +1055,11 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
message: format!("Column {} not found in schema", column),
|
||||
})?;
|
||||
if supported_vector_data_type(field.data_type()) {
|
||||
("IVF_PQ", Some(DistanceType::L2))
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_PQ".to_string());
|
||||
body[METRIC_TYPE_KEY] =
|
||||
serde_json::Value::String(DistanceType::L2.to_string().to_lowercase());
|
||||
} else if supported_btree_data_type(field.data_type()) {
|
||||
("BTREE", None)
|
||||
body[INDEX_TYPE_KEY] = serde_json::Value::String("BTREE".to_string());
|
||||
} else {
|
||||
return Err(Error::NotSupported {
|
||||
message: format!(
|
||||
@@ -1030,12 +1076,6 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
})
|
||||
}
|
||||
};
|
||||
body["index_type"] = serde_json::Value::String(index_type.into());
|
||||
if let Some(distance_type) = distance_type {
|
||||
// Phalanx expects this to be lowercase right now.
|
||||
body["metric_type"] =
|
||||
serde_json::Value::String(distance_type.to_string().to_lowercase());
|
||||
}
|
||||
|
||||
let request = request.json(&body);
|
||||
|
||||
@@ -1417,11 +1457,12 @@ mod tests {
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
||||
use lance_index::scalar::inverted::query::MatchQuery;
|
||||
use lance_index::scalar::FullTextSearchQuery;
|
||||
use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams};
|
||||
use reqwest::Body;
|
||||
use rstest::rstest;
|
||||
use serde_json::json;
|
||||
|
||||
use crate::index::vector::IvfFlatIndexBuilder;
|
||||
use crate::index::vector::{IvfFlatIndexBuilder, IvfHnswSqIndexBuilder};
|
||||
use crate::remote::db::DEFAULT_SERVER_VERSION;
|
||||
use crate::remote::JSON_CONTENT_TYPE;
|
||||
use crate::{
|
||||
@@ -2075,6 +2116,8 @@ mod tests {
|
||||
"prefilter": true,
|
||||
"distance_type": "l2",
|
||||
"nprobes": 20,
|
||||
"minimum_nprobes": 20,
|
||||
"maximum_nprobes": 20,
|
||||
"lower_bound": Option::<f32>::None,
|
||||
"upper_bound": Option::<f32>::None,
|
||||
"k": 10,
|
||||
@@ -2175,6 +2218,8 @@ mod tests {
|
||||
"bypass_vector_index": true,
|
||||
"columns": ["a", "b"],
|
||||
"nprobes": 12,
|
||||
"minimum_nprobes": 12,
|
||||
"maximum_nprobes": 12,
|
||||
"lower_bound": Option::<f32>::None,
|
||||
"upper_bound": Option::<f32>::None,
|
||||
"ef": Option::<usize>::None,
|
||||
@@ -2302,6 +2347,7 @@ mod tests {
|
||||
"fuzziness": 0,
|
||||
"max_expansions": 50,
|
||||
"operator": "Or",
|
||||
"prefix_length": 0,
|
||||
},
|
||||
}
|
||||
},
|
||||
@@ -2416,29 +2462,79 @@ mod tests {
|
||||
let cases = [
|
||||
(
|
||||
"IVF_FLAT",
|
||||
Some("hamming"),
|
||||
json!({
|
||||
"metric_type": "hamming",
|
||||
}),
|
||||
Index::IvfFlat(IvfFlatIndexBuilder::default().distance_type(DistanceType::Hamming)),
|
||||
),
|
||||
("IVF_PQ", Some("l2"), Index::IvfPq(Default::default())),
|
||||
(
|
||||
"IVF_FLAT",
|
||||
json!({
|
||||
"metric_type": "hamming",
|
||||
"num_partitions": 128,
|
||||
}),
|
||||
Index::IvfFlat(
|
||||
IvfFlatIndexBuilder::default()
|
||||
.distance_type(DistanceType::Hamming)
|
||||
.num_partitions(128),
|
||||
),
|
||||
),
|
||||
(
|
||||
"IVF_PQ",
|
||||
Some("cosine"),
|
||||
Index::IvfPq(IvfPqIndexBuilder::default().distance_type(DistanceType::Cosine)),
|
||||
json!({
|
||||
"metric_type": "l2",
|
||||
}),
|
||||
Index::IvfPq(Default::default()),
|
||||
),
|
||||
(
|
||||
"IVF_PQ",
|
||||
json!({
|
||||
"metric_type": "cosine",
|
||||
"num_partitions": 128,
|
||||
"num_bits": 4,
|
||||
}),
|
||||
Index::IvfPq(
|
||||
IvfPqIndexBuilder::default()
|
||||
.distance_type(DistanceType::Cosine)
|
||||
.num_partitions(128)
|
||||
.num_bits(4),
|
||||
),
|
||||
),
|
||||
(
|
||||
"IVF_HNSW_SQ",
|
||||
Some("l2"),
|
||||
json!({
|
||||
"metric_type": "l2",
|
||||
}),
|
||||
Index::IvfHnswSq(Default::default()),
|
||||
),
|
||||
(
|
||||
"IVF_HNSW_SQ",
|
||||
json!({
|
||||
"metric_type": "l2",
|
||||
"num_partitions": 128,
|
||||
}),
|
||||
Index::IvfHnswSq(
|
||||
IvfHnswSqIndexBuilder::default()
|
||||
.distance_type(DistanceType::L2)
|
||||
.num_partitions(128),
|
||||
),
|
||||
),
|
||||
// HNSW_PQ isn't yet supported on SaaS
|
||||
("BTREE", None, Index::BTree(Default::default())),
|
||||
("BITMAP", None, Index::Bitmap(Default::default())),
|
||||
("LABEL_LIST", None, Index::LabelList(Default::default())),
|
||||
("FTS", None, Index::FTS(Default::default())),
|
||||
("BTREE", json!({}), Index::BTree(Default::default())),
|
||||
("BITMAP", json!({}), Index::Bitmap(Default::default())),
|
||||
(
|
||||
"LABEL_LIST",
|
||||
json!({}),
|
||||
Index::LabelList(Default::default()),
|
||||
),
|
||||
(
|
||||
"FTS",
|
||||
serde_json::to_value(InvertedIndexParams::default()).unwrap(),
|
||||
Index::FTS(Default::default()),
|
||||
),
|
||||
];
|
||||
|
||||
for (index_type, distance_type, index) in cases {
|
||||
let params = index.clone();
|
||||
for (index_type, expected_body, index) in cases {
|
||||
let table = Table::new_with_handler("my_table", move |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(request.url().path(), "/v1/table/my_table/create_index/");
|
||||
@@ -2448,19 +2544,9 @@ mod tests {
|
||||
);
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
let mut expected_body = serde_json::json!({
|
||||
"column": "a",
|
||||
"index_type": index_type,
|
||||
});
|
||||
if let Some(distance_type) = distance_type {
|
||||
expected_body["metric_type"] = distance_type.to_lowercase().into();
|
||||
}
|
||||
if let Index::FTS(fts) = ¶ms {
|
||||
let params = serde_json::to_value(fts).unwrap();
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
expected_body[key] = value.clone();
|
||||
}
|
||||
}
|
||||
let mut expected_body = expected_body.clone();
|
||||
expected_body["column"] = "a".into();
|
||||
expected_body[INDEX_TYPE_KEY] = index_type.into();
|
||||
|
||||
assert_eq!(body, expected_body);
|
||||
|
||||
|
||||
@@ -2354,12 +2354,15 @@ impl BaseTable for NativeTable {
|
||||
query.base.limit.unwrap_or(DEFAULT_TOP_K),
|
||||
)?;
|
||||
}
|
||||
scanner.minimum_nprobes(query.minimum_nprobes);
|
||||
if let Some(maximum_nprobes) = query.maximum_nprobes {
|
||||
scanner.maximum_nprobes(maximum_nprobes);
|
||||
}
|
||||
}
|
||||
scanner.limit(
|
||||
query.base.limit.map(|limit| limit as i64),
|
||||
query.base.offset.map(|offset| offset as i64),
|
||||
)?;
|
||||
scanner.nprobs(query.nprobes);
|
||||
if let Some(ef) = query.ef {
|
||||
scanner.ef(ef);
|
||||
}
|
||||
|
||||
@@ -392,9 +392,18 @@ pub mod tests {
|
||||
} else {
|
||||
expected_line.trim()
|
||||
};
|
||||
assert_eq!(&actual_trimmed[..expected_trimmed.len()], expected_trimmed);
|
||||
assert_eq!(
|
||||
&actual_trimmed[..expected_trimmed.len()],
|
||||
expected_trimmed,
|
||||
"\nactual:\n{physical_plan}\nexpected:\n{expected}"
|
||||
);
|
||||
}
|
||||
assert_eq!(lines_checked, expected.lines().count());
|
||||
assert_eq!(
|
||||
lines_checked,
|
||||
expected.lines().count(),
|
||||
"\nlines_checked:\n{lines_checked}\nexpected:\n{}",
|
||||
expected.lines().count()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -477,9 +486,9 @@ pub mod tests {
|
||||
TestFixture::check_plan(
|
||||
plan,
|
||||
"MetadataEraserExec
|
||||
RepartitionExec:...
|
||||
CoalesceBatchesExec:...
|
||||
FilterExec: i@0 >= 5
|
||||
RepartitionExec:...
|
||||
ProjectionExec:...
|
||||
LanceScan:...",
|
||||
)
|
||||
|
||||
@@ -129,7 +129,9 @@ impl DatasetRef {
|
||||
dataset: ref mut ds,
|
||||
..
|
||||
} => {
|
||||
*ds = dataset;
|
||||
if dataset.manifest().version > ds.manifest().version {
|
||||
*ds = dataset;
|
||||
}
|
||||
}
|
||||
_ => unreachable!("Dataset should be in latest mode at this point"),
|
||||
}
|
||||
|
||||
@@ -281,6 +281,46 @@ async fn test_encryption() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_table_storage_options_override() -> Result<()> {
|
||||
// Test that table-level storage options override connection-level options
|
||||
let bucket = S3Bucket::new("test-override").await;
|
||||
let key1 = KMSKey::new().await;
|
||||
let key2 = KMSKey::new().await;
|
||||
|
||||
let uri = format!("s3://{}", bucket.0);
|
||||
|
||||
// Create connection with key1 encryption
|
||||
let db = lancedb::connect(&uri)
|
||||
.storage_options(CONFIG.iter().cloned())
|
||||
.storage_option("aws_server_side_encryption", "aws:kms")
|
||||
.storage_option("aws_sse_kms_key_id", &key1.0)
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
// Create table overriding with key2 encryption
|
||||
let data = test_data();
|
||||
let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
|
||||
let _table = db
|
||||
.create_table("test_override", data)
|
||||
.storage_option("aws_sse_kms_key_id", &key2.0)
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
// Verify objects are encrypted with key2, not key1
|
||||
validate_objects_encrypted(&bucket.0, "test_override", &key2.0).await;
|
||||
|
||||
// Also test that a table created without override uses connection settings
|
||||
let data = test_data();
|
||||
let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
|
||||
let _table2 = db.create_table("test_inherit", data).execute().await?;
|
||||
|
||||
// Verify this table uses key1 from connection
|
||||
validate_objects_encrypted(&bucket.0, "test_inherit", &key1.0).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct DynamoDBCommitTable(String);
|
||||
|
||||
impl DynamoDBCommitTable {
|
||||
|
||||
Reference in New Issue
Block a user