mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
39 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
27d9e5c596 | ||
|
|
ec8271931f | ||
|
|
6c6966600c | ||
|
|
2e170c3c7b | ||
|
|
fd92e651d1 | ||
|
|
c298482ee1 | ||
|
|
d59f64b5a3 | ||
|
|
30ed8c4c43 | ||
|
|
4a2cdbf299 | ||
|
|
657843d9e9 | ||
|
|
1cd76b8498 | ||
|
|
a38f784081 | ||
|
|
647dee4e94 | ||
|
|
0844c2dd64 | ||
|
|
fd2692295c | ||
|
|
d4ea50fba1 | ||
|
|
0d42297cf8 | ||
|
|
a6d4125cbf | ||
|
|
5c32a99e61 | ||
|
|
cefaa75b24 | ||
|
|
bd62c2384f | ||
|
|
f0bc08c0d7 | ||
|
|
e52ac79c69 | ||
|
|
f091f57594 | ||
|
|
a997fd4108 | ||
|
|
1486514ccc | ||
|
|
a505bc3965 | ||
|
|
c1738250a3 | ||
|
|
1ee63984f5 | ||
|
|
2eb2c8862a | ||
|
|
4ea8e178d3 | ||
|
|
e4485a630e | ||
|
|
fb95f9b3bd | ||
|
|
625bab3f21 | ||
|
|
e59f9382a0 | ||
|
|
fdee7ba477 | ||
|
|
c44fa3abc4 | ||
|
|
fc43aac0ed | ||
|
|
e67cd0baf9 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.18.3-beta.0"
|
||||
current_version = "0.19.0-beta.5"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
44
.github/workflows/npm-publish.yml
vendored
44
.github/workflows/npm-publish.yml
vendored
@@ -18,6 +18,7 @@ on:
|
||||
# This should trigger a dry run (we skip the final publish step)
|
||||
paths:
|
||||
- .github/workflows/npm-publish.yml
|
||||
- Cargo.toml # Change in dependency frequently breaks builds
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
@@ -130,29 +131,24 @@ jobs:
|
||||
set -e &&
|
||||
apt-get update &&
|
||||
apt-get install -y protobuf-compiler pkg-config
|
||||
|
||||
# TODO: re-enable x64 musl builds. I could not figure out why, but it
|
||||
# consistently made GHA runners non-responsive at the end of build. Example:
|
||||
# https://github.com/lancedb/lancedb/actions/runs/13980431071/job/39144319470?pr=2250
|
||||
|
||||
# - target: x86_64-unknown-linux-musl
|
||||
# # This one seems to need some extra memory
|
||||
# host: ubuntu-2404-8x-x64
|
||||
# # https://github.com/napi-rs/napi-rs/blob/main/alpine.Dockerfile
|
||||
# docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine
|
||||
# features: ","
|
||||
# pre_build: |-
|
||||
# set -e &&
|
||||
# apk add protobuf-dev curl &&
|
||||
# ln -s /usr/lib/gcc/x86_64-alpine-linux-musl/14.2.0/crtbeginS.o /usr/lib/crtbeginS.o &&
|
||||
# ln -s /usr/lib/libgcc_s.so /usr/lib/libgcc.so
|
||||
|
||||
- target: x86_64-unknown-linux-musl
|
||||
# This one seems to need some extra memory
|
||||
host: ubuntu-2404-8x-x64
|
||||
# https://github.com/napi-rs/napi-rs/blob/main/alpine.Dockerfile
|
||||
docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine
|
||||
features: fp16kernels
|
||||
pre_build: |-
|
||||
set -e &&
|
||||
apk add protobuf-dev curl &&
|
||||
ln -s /usr/lib/gcc/x86_64-alpine-linux-musl/14.2.0/crtbeginS.o /usr/lib/crtbeginS.o &&
|
||||
ln -s /usr/lib/libgcc_s.so /usr/lib/libgcc.so &&
|
||||
CC=gcc &&
|
||||
CXX=g++
|
||||
- target: aarch64-unknown-linux-gnu
|
||||
host: ubuntu-2404-8x-x64
|
||||
# https://github.com/napi-rs/napi-rs/blob/main/debian-aarch64.Dockerfile
|
||||
docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-debian-aarch64
|
||||
# TODO: enable fp16kernels after https://github.com/lancedb/lance/pull/3559
|
||||
features: ","
|
||||
features: "fp16kernels"
|
||||
pre_build: |-
|
||||
set -e &&
|
||||
apt-get update &&
|
||||
@@ -170,8 +166,8 @@ jobs:
|
||||
set -e &&
|
||||
apk add protobuf-dev &&
|
||||
rustup target add aarch64-unknown-linux-musl &&
|
||||
export CC="/aarch64-linux-musl-cross/bin/aarch64-linux-musl-gcc" &&
|
||||
export CXX="/aarch64-linux-musl-cross/bin/aarch64-linux-musl-g++"
|
||||
export CC_aarch64_unknown_linux_musl=aarch64-linux-musl-gcc &&
|
||||
export CXX_aarch64_unknown_linux_musl=aarch64-linux-musl-g++
|
||||
name: build - ${{ matrix.settings.target }}
|
||||
runs-on: ${{ matrix.settings.host }}
|
||||
defaults:
|
||||
@@ -535,6 +531,12 @@ jobs:
|
||||
for filename in *.tgz; do
|
||||
npm publish $PUBLISH_ARGS $filename
|
||||
done
|
||||
- name: Deprecate
|
||||
env:
|
||||
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
|
||||
# We need to deprecate the old package to avoid confusion.
|
||||
# Each time we publish a new version, it gets undeprecated.
|
||||
run: npm deprecate vectordb "Use @lancedb/lancedb instead."
|
||||
- name: Notify Slack Action
|
||||
uses: ravsamhq/notify-slack-action@2.3.0
|
||||
if: ${{ always() }}
|
||||
|
||||
1
.github/workflows/pypi-publish.yml
vendored
1
.github/workflows/pypi-publish.yml
vendored
@@ -8,6 +8,7 @@ on:
|
||||
# This should trigger a dry run (we skip the final publish step)
|
||||
paths:
|
||||
- .github/workflows/pypi-publish.yml
|
||||
- Cargo.toml # Change in dependency frequently breaks builds
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
|
||||
276
Cargo.lock
generated
276
Cargo.lock
generated
@@ -1816,27 +1816,30 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a"
|
||||
checksum = "914e6f9525599579abbd90b0f7a55afcaaaa40350b9e9ed52563f126dfe45fd3"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"arrow-ipc",
|
||||
"arrow-schema",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"datafusion-catalog",
|
||||
"datafusion-catalog-listing",
|
||||
"datafusion-common",
|
||||
"datafusion-common-runtime",
|
||||
"datafusion-datasource",
|
||||
"datafusion-execution",
|
||||
"datafusion-expr",
|
||||
"datafusion-expr-common",
|
||||
"datafusion-functions",
|
||||
"datafusion-functions-aggregate",
|
||||
"datafusion-functions-nested",
|
||||
"datafusion-functions-table",
|
||||
"datafusion-functions-window",
|
||||
"datafusion-macros",
|
||||
"datafusion-optimizer",
|
||||
"datafusion-physical-expr",
|
||||
"datafusion-physical-expr-common",
|
||||
@@ -1844,14 +1847,13 @@ dependencies = [
|
||||
"datafusion-physical-plan",
|
||||
"datafusion-sql",
|
||||
"futures",
|
||||
"glob",
|
||||
"itertools 0.14.0",
|
||||
"log",
|
||||
"object_store",
|
||||
"parking_lot",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"sqlparser 0.53.0",
|
||||
"sqlparser 0.54.0",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"url",
|
||||
@@ -1860,9 +1862,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-catalog"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1"
|
||||
checksum = "998a6549e6ee4ee3980e05590b2960446a56b343ea30199ef38acd0e0b9036e2"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -1876,21 +1878,39 @@ dependencies = [
|
||||
"itertools 0.14.0",
|
||||
"log",
|
||||
"parking_lot",
|
||||
"sqlparser 0.53.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-catalog-listing"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5ac10096a5b3c0d8a227176c0e543606860842e943594ccddb45cf42a526e43"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
"datafusion-catalog",
|
||||
"datafusion-common",
|
||||
"datafusion-datasource",
|
||||
"datafusion-execution",
|
||||
"datafusion-expr",
|
||||
"datafusion-physical-expr",
|
||||
"datafusion-physical-expr-common",
|
||||
"datafusion-physical-plan",
|
||||
"futures",
|
||||
"log",
|
||||
"object_store",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-common"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602"
|
||||
checksum = "1f53d7ec508e1b3f68bd301cee3f649834fad51eff9240d898a4b2614cfd0a7a"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
"arrow-ipc",
|
||||
"arrow-schema",
|
||||
"base64 0.22.1",
|
||||
"half",
|
||||
"hashbrown 0.14.5",
|
||||
@@ -1899,32 +1919,60 @@ dependencies = [
|
||||
"log",
|
||||
"object_store",
|
||||
"paste",
|
||||
"sqlparser 0.53.0",
|
||||
"sqlparser 0.54.0",
|
||||
"tokio",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-common-runtime"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48"
|
||||
checksum = "e0fcf41523b22e14cc349b01526e8b9f59206653037f2949a4adbfde5f8cb668"
|
||||
dependencies = [
|
||||
"log",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-doc"
|
||||
version = "45.0.0"
|
||||
name = "datafusion-datasource"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66"
|
||||
checksum = "cf7f37ad8b6e88b46c7eeab3236147d32ea64b823544f498455a8d9042839c92"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"datafusion-catalog",
|
||||
"datafusion-common",
|
||||
"datafusion-common-runtime",
|
||||
"datafusion-execution",
|
||||
"datafusion-expr",
|
||||
"datafusion-physical-expr",
|
||||
"datafusion-physical-expr-common",
|
||||
"datafusion-physical-plan",
|
||||
"futures",
|
||||
"glob",
|
||||
"itertools 0.14.0",
|
||||
"log",
|
||||
"object_store",
|
||||
"rand 0.8.5",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-doc"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7db7a0239fd060f359dc56c6e7db726abaa92babaed2fb2e91c3a8b2fff8b256"
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-execution"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27"
|
||||
checksum = "0938f9e5b6bc5782be4111cdfb70c02b7b5451bf34fd57e4de062a7f7c4e31f1"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"dashmap",
|
||||
@@ -1941,9 +1989,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-expr"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1"
|
||||
checksum = "b36c28b00b00019a8695ad7f1a53ee1673487b90322ecbd604e2cf32894eb14f"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"chrono",
|
||||
@@ -1956,26 +2004,27 @@ dependencies = [
|
||||
"indexmap 2.8.0",
|
||||
"paste",
|
||||
"serde_json",
|
||||
"sqlparser 0.53.0",
|
||||
"sqlparser 0.54.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-expr-common"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885"
|
||||
checksum = "18f0a851a436c5a2139189eb4617a54e6a9ccb9edc96c4b3c83b3bb7c58b950e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"datafusion-common",
|
||||
"indexmap 2.8.0",
|
||||
"itertools 0.14.0",
|
||||
"paste",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-functions"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b"
|
||||
checksum = "e3196e37d7b65469fb79fee4f05e5bb58a456831035f9a38aa5919aeb3298d40"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-buffer",
|
||||
@@ -1989,7 +2038,6 @@ dependencies = [
|
||||
"datafusion-expr",
|
||||
"datafusion-expr-common",
|
||||
"datafusion-macros",
|
||||
"hashbrown 0.14.5",
|
||||
"hex",
|
||||
"itertools 0.14.0",
|
||||
"log",
|
||||
@@ -2003,14 +2051,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-functions-aggregate"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6"
|
||||
checksum = "adfc2d074d5ee4d9354fdcc9283d5b2b9037849237ddecb8942a29144b77ca05"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow",
|
||||
"arrow-buffer",
|
||||
"arrow-schema",
|
||||
"datafusion-common",
|
||||
"datafusion-doc",
|
||||
"datafusion-execution",
|
||||
@@ -2026,9 +2072,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-functions-aggregate-common"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c"
|
||||
checksum = "1cbceba0f98d921309a9121b702bcd49289d383684cccabf9a92cda1602f3bbb"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow",
|
||||
@@ -2039,15 +2085,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-functions-nested"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e"
|
||||
checksum = "170e27ce4baa27113ddf5f77f1a7ec484b0dbeda0c7abbd4bad3fc609c8ab71a"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
"arrow-ord",
|
||||
"arrow-schema",
|
||||
"datafusion-common",
|
||||
"datafusion-doc",
|
||||
"datafusion-execution",
|
||||
@@ -2063,9 +2106,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-functions-table"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36"
|
||||
checksum = "7d3a06a7f0817ded87b026a437e7e51de7f59d48173b0a4e803aa896a7bd6bb5"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -2079,9 +2122,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-functions-window"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5"
|
||||
checksum = "d6c608b66496a1e05e3d196131eb9bebea579eed1f59e88d962baf3dda853bc6"
|
||||
dependencies = [
|
||||
"datafusion-common",
|
||||
"datafusion-doc",
|
||||
@@ -2096,9 +2139,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-functions-window-common"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4"
|
||||
checksum = "da2f9d83348957b4ad0cd87b5cb9445f2651863a36592fe5484d43b49a5f8d82"
|
||||
dependencies = [
|
||||
"datafusion-common",
|
||||
"datafusion-physical-expr-common",
|
||||
@@ -2106,9 +2149,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-macros"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af"
|
||||
checksum = "4800e1ff7ecf8f310887e9b54c9c444b8e215ccbc7b21c2f244cfae373b1ece7"
|
||||
dependencies = [
|
||||
"datafusion-expr",
|
||||
"quote",
|
||||
@@ -2117,9 +2160,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-optimizer"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54"
|
||||
checksum = "971c51c54cd309001376fae752fb15a6b41750b6d1552345c46afbfb6458801b"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"chrono",
|
||||
@@ -2135,15 +2178,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-physical-expr"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c"
|
||||
checksum = "e1447c2c6bc8674a16be4786b4abf528c302803fafa186aa6275692570e64d85"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
"arrow-schema",
|
||||
"datafusion-common",
|
||||
"datafusion-expr",
|
||||
"datafusion-expr-common",
|
||||
@@ -2160,13 +2200,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-physical-expr-common"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435"
|
||||
checksum = "69f8c25dcd069073a75b3d2840a79d0f81e64bdd2c05f2d3d18939afb36a7dcb"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow",
|
||||
"arrow-buffer",
|
||||
"datafusion-common",
|
||||
"datafusion-expr-common",
|
||||
"hashbrown 0.14.5",
|
||||
@@ -2175,12 +2214,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-physical-optimizer"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb"
|
||||
checksum = "68da5266b5b9847c11d1b3404ee96b1d423814e1973e1ad3789131e5ec912763"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-schema",
|
||||
"datafusion-common",
|
||||
"datafusion-execution",
|
||||
"datafusion-expr",
|
||||
@@ -2188,22 +2226,18 @@ dependencies = [
|
||||
"datafusion-physical-expr",
|
||||
"datafusion-physical-expr-common",
|
||||
"datafusion-physical-plan",
|
||||
"futures",
|
||||
"itertools 0.14.0",
|
||||
"log",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-physical-plan"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f"
|
||||
checksum = "88cc160df00e413e370b3b259c8ea7bfbebc134d32de16325950e9e923846b7f"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
"arrow-ord",
|
||||
"arrow-schema",
|
||||
"async-trait",
|
||||
@@ -2228,20 +2262,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-sql"
|
||||
version = "45.0.0"
|
||||
version = "46.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43"
|
||||
checksum = "325a212b67b677c0eb91447bf9a11b630f9fc4f62d8e5d145bf859f5a6b29e64"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
"bigdecimal",
|
||||
"datafusion-common",
|
||||
"datafusion-expr",
|
||||
"indexmap 2.8.0",
|
||||
"log",
|
||||
"regex",
|
||||
"sqlparser 0.53.0",
|
||||
"sqlparser 0.54.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2687,12 +2719,21 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "fsst"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fst"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a"
|
||||
dependencies = [
|
||||
"utf8-ranges",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "funty"
|
||||
version = "2.0.0"
|
||||
@@ -3666,8 +3707,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -3726,8 +3767,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-arrow"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -3744,8 +3785,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-core"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -3781,8 +3822,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datafusion"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -3798,6 +3839,7 @@ dependencies = [
|
||||
"futures",
|
||||
"lance-arrow",
|
||||
"lance-core",
|
||||
"lance-datagen",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"prost",
|
||||
@@ -3806,10 +3848,26 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lance-datagen"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"arrow-cast",
|
||||
"arrow-schema",
|
||||
"chrono",
|
||||
"futures",
|
||||
"hex",
|
||||
"rand 0.8.5",
|
||||
"rand_xoshiro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lance-encoding"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"arrow",
|
||||
@@ -3832,6 +3890,7 @@ dependencies = [
|
||||
"lance-core",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"lz4",
|
||||
"num-traits",
|
||||
"paste",
|
||||
"prost",
|
||||
@@ -3847,8 +3906,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-file"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -3882,8 +3941,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-index"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -3902,6 +3961,7 @@ dependencies = [
|
||||
"datafusion-sql",
|
||||
"deepsize",
|
||||
"dirs",
|
||||
"fst",
|
||||
"futures",
|
||||
"half",
|
||||
"itertools 0.13.0",
|
||||
@@ -3935,8 +3995,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-io"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -3974,8 +4034,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-linalg"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ord",
|
||||
@@ -3998,8 +4058,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-table"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4038,8 +4098,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-testing"
|
||||
version = "0.25.1"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.1-beta.3#33634d3b2e8f6a54e63a97721c7fcd31206e999a"
|
||||
version = "0.25.3"
|
||||
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
@@ -4050,7 +4110,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb"
|
||||
version = "0.18.3-beta.0"
|
||||
version = "0.19.0-beta.5"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4137,7 +4197,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-node"
|
||||
version = "0.18.3-beta.0"
|
||||
version = "0.19.0-beta.5"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ipc",
|
||||
@@ -4162,7 +4222,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-nodejs"
|
||||
version = "0.18.3-beta.0"
|
||||
version = "0.19.0-beta.5"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ipc",
|
||||
@@ -4180,7 +4240,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-python"
|
||||
version = "0.21.3-beta.0"
|
||||
version = "0.22.0-beta.5"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"env_logger",
|
||||
@@ -5895,6 +5955,15 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_xoshiro"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
|
||||
dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "random_word"
|
||||
version = "0.4.3"
|
||||
@@ -6781,11 +6850,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "sqlparser"
|
||||
version = "0.53.0"
|
||||
version = "0.54.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8"
|
||||
checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899"
|
||||
dependencies = [
|
||||
"log",
|
||||
"recursive",
|
||||
"sqlparser_derive",
|
||||
]
|
||||
|
||||
@@ -7636,7 +7706,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
|
||||
dependencies = [
|
||||
"getrandom 0.3.2",
|
||||
"js-sys",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
30
Cargo.toml
30
Cargo.toml
@@ -21,16 +21,16 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.25.1", "features" = [
|
||||
lance = { "version" = "=0.25.3", "features" = [
|
||||
"dynamodb",
|
||||
], tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { version = "=0.25.1", tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { version = "=0.25.1", tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { version = "=0.25.1", tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { version = "=0.25.1", tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { version = "=0.25.1", tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { version = "=0.25.1", tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { version = "=0.25.1", tag = "v0.25.1-beta.3", git = "https://github.com/lancedb/lance.git" }
|
||||
], tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-io = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-index = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-linalg = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-table = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-testing = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-datafusion = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-encoding = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "54.1", optional = false }
|
||||
arrow-array = "54.1"
|
||||
@@ -41,12 +41,12 @@ arrow-schema = "54.1"
|
||||
arrow-arith = "54.1"
|
||||
arrow-cast = "54.1"
|
||||
async-trait = "0"
|
||||
datafusion = { version = "45.0", default-features = false }
|
||||
datafusion-catalog = "45.0"
|
||||
datafusion-common = { version = "45.0", default-features = false }
|
||||
datafusion-execution = "45.0"
|
||||
datafusion-expr = "45.0"
|
||||
datafusion-physical-plan = "45.0"
|
||||
datafusion = { version = "46.0", default-features = false }
|
||||
datafusion-catalog = "46.0"
|
||||
datafusion-common = { version = "46.0", default-features = false }
|
||||
datafusion-execution = "46.0"
|
||||
datafusion-expr = "46.0"
|
||||
datafusion-physical-plan = "46.0"
|
||||
env_logger = "0.11"
|
||||
half = { "version" = "=2.4.1", default-features = false, features = [
|
||||
"num-traits",
|
||||
|
||||
75
docs/src/js/classes/BoostQuery.md
Normal file
75
docs/src/js/classes/BoostQuery.md
Normal file
@@ -0,0 +1,75 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / BoostQuery
|
||||
|
||||
# Class: BoostQuery
|
||||
|
||||
Represents a full-text query interface.
|
||||
This interface defines the structure and behavior for full-text queries,
|
||||
including methods to retrieve the query type and convert the query to a dictionary format.
|
||||
|
||||
## Implements
|
||||
|
||||
- [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
## Constructors
|
||||
|
||||
### new BoostQuery()
|
||||
|
||||
```ts
|
||||
new BoostQuery(
|
||||
positive,
|
||||
negative,
|
||||
negativeBoost): BoostQuery
|
||||
```
|
||||
|
||||
Creates an instance of BoostQuery.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **positive**: [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
The positive query that boosts the relevance score.
|
||||
|
||||
* **negative**: [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
The negative query that reduces the relevance score.
|
||||
|
||||
* **negativeBoost**: `number`
|
||||
The factor by which the negative query reduces the score.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`BoostQuery`](BoostQuery.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### queryType()
|
||||
|
||||
```ts
|
||||
queryType(): FullTextQueryType
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
|
||||
|
||||
***
|
||||
|
||||
### toDict()
|
||||
|
||||
```ts
|
||||
toDict(): Record<string, unknown>
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
`Record`<`string`, `unknown`>
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`toDict`](../interfaces/FullTextQuery.md#todict)
|
||||
83
docs/src/js/classes/MatchQuery.md
Normal file
83
docs/src/js/classes/MatchQuery.md
Normal file
@@ -0,0 +1,83 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / MatchQuery
|
||||
|
||||
# Class: MatchQuery
|
||||
|
||||
Represents a full-text query interface.
|
||||
This interface defines the structure and behavior for full-text queries,
|
||||
including methods to retrieve the query type and convert the query to a dictionary format.
|
||||
|
||||
## Implements
|
||||
|
||||
- [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
## Constructors
|
||||
|
||||
### new MatchQuery()
|
||||
|
||||
```ts
|
||||
new MatchQuery(
|
||||
query,
|
||||
column,
|
||||
boost,
|
||||
fuzziness,
|
||||
maxExpansions): MatchQuery
|
||||
```
|
||||
|
||||
Creates an instance of MatchQuery.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
The text query to search for.
|
||||
|
||||
* **column**: `string`
|
||||
The name of the column to search within.
|
||||
|
||||
* **boost**: `number` = `1.0`
|
||||
(Optional) The boost factor to influence the relevance score of this query. Default is `1.0`.
|
||||
|
||||
* **fuzziness**: `number` = `0`
|
||||
(Optional) The allowed edit distance for fuzzy matching. Default is `0`.
|
||||
|
||||
* **maxExpansions**: `number` = `50`
|
||||
(Optional) The maximum number of terms to consider for fuzzy matching. Default is `50`.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`MatchQuery`](MatchQuery.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### queryType()
|
||||
|
||||
```ts
|
||||
queryType(): FullTextQueryType
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
|
||||
|
||||
***
|
||||
|
||||
### toDict()
|
||||
|
||||
```ts
|
||||
toDict(): Record<string, unknown>
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
`Record`<`string`, `unknown`>
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`toDict`](../interfaces/FullTextQuery.md#todict)
|
||||
77
docs/src/js/classes/MultiMatchQuery.md
Normal file
77
docs/src/js/classes/MultiMatchQuery.md
Normal file
@@ -0,0 +1,77 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / MultiMatchQuery
|
||||
|
||||
# Class: MultiMatchQuery
|
||||
|
||||
Represents a full-text query interface.
|
||||
This interface defines the structure and behavior for full-text queries,
|
||||
including methods to retrieve the query type and convert the query to a dictionary format.
|
||||
|
||||
## Implements
|
||||
|
||||
- [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
## Constructors
|
||||
|
||||
### new MultiMatchQuery()
|
||||
|
||||
```ts
|
||||
new MultiMatchQuery(
|
||||
query,
|
||||
columns,
|
||||
boosts): MultiMatchQuery
|
||||
```
|
||||
|
||||
Creates an instance of MultiMatchQuery.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
The text query to search for across multiple columns.
|
||||
|
||||
* **columns**: `string`[]
|
||||
An array of column names to search within.
|
||||
|
||||
* **boosts**: `number`[] = `...`
|
||||
(Optional) An array of boost factors corresponding to each column. Default is an array of 1.0 for each column.
|
||||
The `boosts` array should have the same length as `columns`. If not provided, all columns will have a default boost of 1.0.
|
||||
If the length of `boosts` is less than `columns`, it will be padded with 1.0s.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`MultiMatchQuery`](MultiMatchQuery.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### queryType()
|
||||
|
||||
```ts
|
||||
queryType(): FullTextQueryType
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
|
||||
|
||||
***
|
||||
|
||||
### toDict()
|
||||
|
||||
```ts
|
||||
toDict(): Record<string, unknown>
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
`Record`<`string`, `unknown`>
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`toDict`](../interfaces/FullTextQuery.md#todict)
|
||||
69
docs/src/js/classes/PhraseQuery.md
Normal file
69
docs/src/js/classes/PhraseQuery.md
Normal file
@@ -0,0 +1,69 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / PhraseQuery
|
||||
|
||||
# Class: PhraseQuery
|
||||
|
||||
Represents a full-text query interface.
|
||||
This interface defines the structure and behavior for full-text queries,
|
||||
including methods to retrieve the query type and convert the query to a dictionary format.
|
||||
|
||||
## Implements
|
||||
|
||||
- [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
## Constructors
|
||||
|
||||
### new PhraseQuery()
|
||||
|
||||
```ts
|
||||
new PhraseQuery(query, column): PhraseQuery
|
||||
```
|
||||
|
||||
Creates an instance of `PhraseQuery`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
The phrase to search for in the specified column.
|
||||
|
||||
* **column**: `string`
|
||||
The name of the column to search within.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PhraseQuery`](PhraseQuery.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### queryType()
|
||||
|
||||
```ts
|
||||
queryType(): FullTextQueryType
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
|
||||
|
||||
***
|
||||
|
||||
### toDict()
|
||||
|
||||
```ts
|
||||
toDict(): Record<string, unknown>
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
`Record`<`string`, `unknown`>
|
||||
|
||||
#### Implementation of
|
||||
|
||||
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`toDict`](../interfaces/FullTextQuery.md#todict)
|
||||
@@ -206,7 +206,7 @@ fullTextSearch(query, options?): this
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
|
||||
|
||||
@@ -309,7 +309,7 @@ nearestToText(query, columns?): Query
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
* **columns?**: `string`[]
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ fullTextSearch(query, options?): this
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
|
||||
|
||||
|
||||
@@ -347,7 +347,7 @@ fullTextSearch(query, options?): this
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
|
||||
|
||||
|
||||
46
docs/src/js/enumerations/FullTextQueryType.md
Normal file
46
docs/src/js/enumerations/FullTextQueryType.md
Normal file
@@ -0,0 +1,46 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / FullTextQueryType
|
||||
|
||||
# Enumeration: FullTextQueryType
|
||||
|
||||
Enum representing the types of full-text queries supported.
|
||||
|
||||
- `Match`: Performs a full-text search for terms in the query string.
|
||||
- `MatchPhrase`: Searches for an exact phrase match in the text.
|
||||
- `Boost`: Boosts the relevance score of specific terms in the query.
|
||||
- `MultiMatch`: Searches across multiple fields for the query terms.
|
||||
|
||||
## Enumeration Members
|
||||
|
||||
### Boost
|
||||
|
||||
```ts
|
||||
Boost: "boost";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### Match
|
||||
|
||||
```ts
|
||||
Match: "match";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### MatchPhrase
|
||||
|
||||
```ts
|
||||
MatchPhrase: "match_phrase";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### MultiMatch
|
||||
|
||||
```ts
|
||||
MultiMatch: "multi_match";
|
||||
```
|
||||
@@ -9,12 +9,20 @@
|
||||
- [embedding](namespaces/embedding/README.md)
|
||||
- [rerankers](namespaces/rerankers/README.md)
|
||||
|
||||
## Enumerations
|
||||
|
||||
- [FullTextQueryType](enumerations/FullTextQueryType.md)
|
||||
|
||||
## Classes
|
||||
|
||||
- [BoostQuery](classes/BoostQuery.md)
|
||||
- [Connection](classes/Connection.md)
|
||||
- [Index](classes/Index.md)
|
||||
- [MakeArrowTableOptions](classes/MakeArrowTableOptions.md)
|
||||
- [MatchQuery](classes/MatchQuery.md)
|
||||
- [MergeInsertBuilder](classes/MergeInsertBuilder.md)
|
||||
- [MultiMatchQuery](classes/MultiMatchQuery.md)
|
||||
- [PhraseQuery](classes/PhraseQuery.md)
|
||||
- [Query](classes/Query.md)
|
||||
- [QueryBase](classes/QueryBase.md)
|
||||
- [RecordBatchIterator](classes/RecordBatchIterator.md)
|
||||
@@ -33,6 +41,7 @@
|
||||
- [CreateTableOptions](interfaces/CreateTableOptions.md)
|
||||
- [ExecutableQuery](interfaces/ExecutableQuery.md)
|
||||
- [FtsOptions](interfaces/FtsOptions.md)
|
||||
- [FullTextQuery](interfaces/FullTextQuery.md)
|
||||
- [FullTextSearchOptions](interfaces/FullTextSearchOptions.md)
|
||||
- [HnswPqOptions](interfaces/HnswPqOptions.md)
|
||||
- [HnswSqOptions](interfaces/HnswSqOptions.md)
|
||||
|
||||
35
docs/src/js/interfaces/FullTextQuery.md
Normal file
35
docs/src/js/interfaces/FullTextQuery.md
Normal file
@@ -0,0 +1,35 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / FullTextQuery
|
||||
|
||||
# Interface: FullTextQuery
|
||||
|
||||
Represents a full-text query interface.
|
||||
This interface defines the structure and behavior for full-text queries,
|
||||
including methods to retrieve the query type and convert the query to a dictionary format.
|
||||
|
||||
## Methods
|
||||
|
||||
### queryType()
|
||||
|
||||
```ts
|
||||
queryType(): FullTextQueryType
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
|
||||
|
||||
***
|
||||
|
||||
### toDict()
|
||||
|
||||
```ts
|
||||
toDict(): Record<string, unknown>
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
`Record`<`string`, `unknown`>
|
||||
@@ -20,3 +20,13 @@ The maximum number of rows to return in a single batch
|
||||
|
||||
Batches may have fewer rows if the underlying data is stored
|
||||
in smaller chunks.
|
||||
|
||||
***
|
||||
|
||||
### timeoutMs?
|
||||
|
||||
```ts
|
||||
optional timeoutMs: number;
|
||||
```
|
||||
|
||||
Timeout for query execution in milliseconds
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.18.3-beta.0</version>
|
||||
<version>0.19.0-beta.5</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.18.3-beta.0</version>
|
||||
<version>0.19.0-beta.5</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>LanceDB Parent</name>
|
||||
|
||||
51
node/package-lock.json
generated
51
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,11 +52,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.18.3-beta.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.5"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -327,9 +327,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.18.3-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.18.3-beta.0.tgz",
|
||||
"integrity": "sha512-dhJ5VlXV2N/L67mIpTSePhb8krX0FyQgpuz3I+4T4vYuU5JEF3cmedQ5TF5+3cGJhZim4PHRYLkfgCyTlxcqUg==",
|
||||
"version": "0.19.0-beta.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.0-beta.5.tgz",
|
||||
"integrity": "sha512-NuJVGaV4b6XgH3dlkCEquvtGM1cY5sIJE5M/LgJ3HYYvAbco/seBQM5AHTV/7CULoPEY9eQeJZOj9fWP5oQLYQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -340,9 +340,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.18.3-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.18.3-beta.0.tgz",
|
||||
"integrity": "sha512-SHqPkuyfe87d5skf9GERzdeu6AKvVIbXMUwl5N+dVrE7HH6qiuP2HvOmiyHS2lJFgo0Ph8jSBVzPDxxtjF36Dg==",
|
||||
"version": "0.19.0-beta.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.0-beta.5.tgz",
|
||||
"integrity": "sha512-hbadwvQcUgKJfluUHhN+mx+XeFRwTuh9mD0L3Tf3t3BkDTxyHpEG5WNgOpWrh6e1RU6zW54CoCyQuSEaVqGgGw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -353,9 +353,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.18.3-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.18.3-beta.0.tgz",
|
||||
"integrity": "sha512-ohnWsV1n9cxL5ik/GGL4FdQ04Ff9REELcNb1zgmJYyEfwyc6TH9m5HdySO/1ACPZJiLbML4gSvZ10J0Zyb+2SA==",
|
||||
"version": "0.19.0-beta.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.0-beta.5.tgz",
|
||||
"integrity": "sha512-fu/EOYLr3mx76/SP4dEgbq0vSYHfuTf68lVl5/tL6eIb1Purz42l22+jNKLJ/S3Plase2SkXdxyY90K2Y/CvSg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -366,9 +366,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.18.3-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.18.3-beta.0.tgz",
|
||||
"integrity": "sha512-nhbW2CKaBSUesiYCPBd9fAsDYIJLadlGsrb2gfjODlFy+2Lpnbz6T9SuV7dNqj6KBw+KHhaRhLqta7tyMZm/EA==",
|
||||
"version": "0.19.0-beta.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.0-beta.5.tgz",
|
||||
"integrity": "sha512-pzb8fl5M8155sc/mEFnKmuh9rCfQohHBlb+j+5qNMe84AyygQ8Me1H3b1h9fOkUPu2Y168zYfuGkjNv4Bjm9eA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -379,9 +379,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.18.3-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.18.3-beta.0.tgz",
|
||||
"integrity": "sha512-VE4TvMdZ7DIrTC8VYylGxEcH4h2UEejSwGX4PxRzrN9QsCQ4m4pOh3L/UguSO3g+Y1QEaGE20iWQoX6wgSEUhA==",
|
||||
"version": "0.19.0-beta.5",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.0-beta.5.tgz",
|
||||
"integrity": "sha512-5z6BSfTuZYJdDL2wwRrEQlnfluahzaUH2U7vj3i4ik4zaAwvaYcrjmdYCTLRYhFscUqzxd2pVFHbfRYe+maYzA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -1184,9 +1184,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/axios": {
|
||||
"version": "1.7.7",
|
||||
"resolved": "https://registry.npmjs.org/axios/-/axios-1.7.7.tgz",
|
||||
"integrity": "sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q==",
|
||||
"version": "1.8.4",
|
||||
"resolved": "https://registry.npmjs.org/axios/-/axios-1.8.4.tgz",
|
||||
"integrity": "sha512-eBSYY4Y68NNlHbHBMdeDmKNtDgXWhQsJcGqzO3iLUM0GraQFSS9cVgPX5I9b3lbdFKyYoAEGAZF1DwhTaljNAw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"follow-redirects": "^1.15.6",
|
||||
"form-data": "^4.0.0",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"private": false,
|
||||
"main": "dist/index.js",
|
||||
@@ -89,10 +89,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-x64": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.18.3-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.18.3-beta.0"
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.5",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.5"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.18.3-beta.0"
|
||||
version = "0.19.0-beta.5"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -867,6 +867,44 @@ describe("When creating an index", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("When querying a table", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
beforeEach(() => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
});
|
||||
afterEach(() => tmpDir.removeCallback());
|
||||
|
||||
it("should throw an error when timeout is reached", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = makeArrowTable([
|
||||
{ text: "a", vector: [0.1, 0.2] },
|
||||
{ text: "b", vector: [0.3, 0.4] },
|
||||
]);
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", { config: Index.fts() });
|
||||
|
||||
await expect(
|
||||
table.query().where("text != 'a'").toArray({ timeoutMs: 0 }),
|
||||
).rejects.toThrow("Query timeout");
|
||||
|
||||
await expect(
|
||||
table.query().nearestTo([0.0, 0.0]).toArrow({ timeoutMs: 0 }),
|
||||
).rejects.toThrow("Query timeout");
|
||||
|
||||
await expect(
|
||||
table.search("a", "fts").toArray({ timeoutMs: 0 }),
|
||||
).rejects.toThrow("Query timeout");
|
||||
|
||||
await expect(
|
||||
table
|
||||
.query()
|
||||
.nearestToText("a")
|
||||
.nearestTo([0.0, 0.0])
|
||||
.toArrow({ timeoutMs: 0 }),
|
||||
).rejects.toThrow("Query timeout");
|
||||
});
|
||||
});
|
||||
|
||||
describe("Read consistency interval", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
beforeEach(() => {
|
||||
@@ -1266,6 +1304,27 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(results[0].text).toBe(data[0].text);
|
||||
});
|
||||
|
||||
test("full text index on list", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: ["lance database", "the", "search"], vector: [0.1, 0.2, 0.3] },
|
||||
{ text: ["lance database"], vector: [0.4, 0.5, 0.6] },
|
||||
{ text: ["lance", "search"], vector: [0.7, 0.8, 0.9] },
|
||||
{ text: ["database", "search"], vector: [1.0, 1.1, 1.2] },
|
||||
{ text: ["unrelated", "doc"], vector: [1.3, 1.4, 1.5] },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts(),
|
||||
});
|
||||
|
||||
const results = await table.search("lance").toArray();
|
||||
expect(results.length).toBe(3);
|
||||
|
||||
const results2 = await table.search('"lance database"').toArray();
|
||||
expect(results2.length).toBe(2);
|
||||
});
|
||||
|
||||
test("full text search without positions", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
|
||||
@@ -47,6 +47,12 @@ export {
|
||||
QueryExecutionOptions,
|
||||
FullTextSearchOptions,
|
||||
RecordBatchIterator,
|
||||
FullTextQuery,
|
||||
MatchQuery,
|
||||
PhraseQuery,
|
||||
BoostQuery,
|
||||
MultiMatchQuery,
|
||||
FullTextQueryType,
|
||||
} from "./query";
|
||||
|
||||
export {
|
||||
|
||||
@@ -17,6 +17,7 @@ import {
|
||||
VectorQuery as NativeVectorQuery,
|
||||
} from "./native";
|
||||
import { Reranker } from "./rerankers";
|
||||
|
||||
export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
|
||||
private promisedInner?: Promise<NativeBatchIterator>;
|
||||
private inner?: NativeBatchIterator;
|
||||
@@ -62,7 +63,7 @@ class RecordBatchIterable<
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>, any, undefined> {
|
||||
return new RecordBatchIterator(
|
||||
this.inner.execute(this.options?.maxBatchLength),
|
||||
this.inner.execute(this.options?.maxBatchLength, this.options?.timeoutMs),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -78,6 +79,11 @@ export interface QueryExecutionOptions {
|
||||
* in smaller chunks.
|
||||
*/
|
||||
maxBatchLength?: number;
|
||||
|
||||
/**
|
||||
* Timeout for query execution in milliseconds
|
||||
*/
|
||||
timeoutMs?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -152,7 +158,7 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
}
|
||||
|
||||
fullTextSearch(
|
||||
query: string,
|
||||
query: string | FullTextQuery,
|
||||
options?: Partial<FullTextSearchOptions>,
|
||||
): this {
|
||||
let columns: string[] | null = null;
|
||||
@@ -164,9 +170,18 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
}
|
||||
}
|
||||
|
||||
this.doCall((inner: NativeQueryType) =>
|
||||
inner.fullTextSearch(query, columns),
|
||||
);
|
||||
this.doCall((inner: NativeQueryType) => {
|
||||
if (typeof query === "string") {
|
||||
inner.fullTextSearch({
|
||||
query: query,
|
||||
columns: columns,
|
||||
});
|
||||
} else {
|
||||
// If query is a FullTextQuery object, convert it to a dict
|
||||
const queryObj = query.toDict();
|
||||
inner.fullTextSearch(queryObj);
|
||||
}
|
||||
});
|
||||
return this;
|
||||
}
|
||||
|
||||
@@ -273,9 +288,11 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
options?: Partial<QueryExecutionOptions>,
|
||||
): Promise<NativeBatchIterator> {
|
||||
if (this.inner instanceof Promise) {
|
||||
return this.inner.then((inner) => inner.execute(options?.maxBatchLength));
|
||||
return this.inner.then((inner) =>
|
||||
inner.execute(options?.maxBatchLength, options?.timeoutMs),
|
||||
);
|
||||
} else {
|
||||
return this.inner.execute(options?.maxBatchLength);
|
||||
return this.inner.execute(options?.maxBatchLength, options?.timeoutMs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -718,8 +735,167 @@ export class Query extends QueryBase<NativeQuery> {
|
||||
}
|
||||
}
|
||||
|
||||
nearestToText(query: string, columns?: string[]): Query {
|
||||
this.doCall((inner) => inner.fullTextSearch(query, columns));
|
||||
nearestToText(query: string | FullTextQuery, columns?: string[]): Query {
|
||||
this.doCall((inner) => {
|
||||
if (typeof query === "string") {
|
||||
inner.fullTextSearch({
|
||||
query: query,
|
||||
columns: columns,
|
||||
});
|
||||
} else {
|
||||
const queryObj = query.toDict();
|
||||
inner.fullTextSearch(queryObj);
|
||||
}
|
||||
});
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enum representing the types of full-text queries supported.
|
||||
*
|
||||
* - `Match`: Performs a full-text search for terms in the query string.
|
||||
* - `MatchPhrase`: Searches for an exact phrase match in the text.
|
||||
* - `Boost`: Boosts the relevance score of specific terms in the query.
|
||||
* - `MultiMatch`: Searches across multiple fields for the query terms.
|
||||
*/
|
||||
export enum FullTextQueryType {
|
||||
Match = "match",
|
||||
MatchPhrase = "match_phrase",
|
||||
Boost = "boost",
|
||||
MultiMatch = "multi_match",
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a full-text query interface.
|
||||
* This interface defines the structure and behavior for full-text queries,
|
||||
* including methods to retrieve the query type and convert the query to a dictionary format.
|
||||
*/
|
||||
export interface FullTextQuery {
|
||||
queryType(): FullTextQueryType;
|
||||
toDict(): Record<string, unknown>;
|
||||
}
|
||||
|
||||
export class MatchQuery implements FullTextQuery {
|
||||
/**
|
||||
* Creates an instance of MatchQuery.
|
||||
*
|
||||
* @param query - The text query to search for.
|
||||
* @param column - The name of the column to search within.
|
||||
* @param boost - (Optional) The boost factor to influence the relevance score of this query. Default is `1.0`.
|
||||
* @param fuzziness - (Optional) The allowed edit distance for fuzzy matching. Default is `0`.
|
||||
* @param maxExpansions - (Optional) The maximum number of terms to consider for fuzzy matching. Default is `50`.
|
||||
*/
|
||||
constructor(
|
||||
private query: string,
|
||||
private column: string,
|
||||
private boost: number = 1.0,
|
||||
private fuzziness: number = 0,
|
||||
private maxExpansions: number = 50,
|
||||
) {}
|
||||
|
||||
queryType(): FullTextQueryType {
|
||||
return FullTextQueryType.Match;
|
||||
}
|
||||
|
||||
toDict(): Record<string, unknown> {
|
||||
return {
|
||||
[this.queryType()]: {
|
||||
[this.column]: {
|
||||
query: this.query,
|
||||
boost: this.boost,
|
||||
fuzziness: this.fuzziness,
|
||||
// biome-ignore lint/style/useNamingConvention: use underscore for consistency with the other APIs
|
||||
max_expansions: this.maxExpansions,
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export class PhraseQuery implements FullTextQuery {
|
||||
/**
|
||||
* Creates an instance of `PhraseQuery`.
|
||||
*
|
||||
* @param query - The phrase to search for in the specified column.
|
||||
* @param column - The name of the column to search within.
|
||||
*/
|
||||
constructor(
|
||||
private query: string,
|
||||
private column: string,
|
||||
) {}
|
||||
|
||||
queryType(): FullTextQueryType {
|
||||
return FullTextQueryType.MatchPhrase;
|
||||
}
|
||||
|
||||
toDict(): Record<string, unknown> {
|
||||
return {
|
||||
[this.queryType()]: {
|
||||
[this.column]: this.query,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export class BoostQuery implements FullTextQuery {
|
||||
/**
|
||||
* Creates an instance of BoostQuery.
|
||||
*
|
||||
* @param positive - The positive query that boosts the relevance score.
|
||||
* @param negative - The negative query that reduces the relevance score.
|
||||
* @param negativeBoost - The factor by which the negative query reduces the score.
|
||||
*/
|
||||
constructor(
|
||||
private positive: FullTextQuery,
|
||||
private negative: FullTextQuery,
|
||||
private negativeBoost: number,
|
||||
) {}
|
||||
|
||||
queryType(): FullTextQueryType {
|
||||
return FullTextQueryType.Boost;
|
||||
}
|
||||
|
||||
toDict(): Record<string, unknown> {
|
||||
return {
|
||||
[this.queryType()]: {
|
||||
positive: this.positive.toDict(),
|
||||
negative: this.negative.toDict(),
|
||||
// biome-ignore lint/style/useNamingConvention: use underscore for consistency with the other APIs
|
||||
negative_boost: this.negativeBoost,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export class MultiMatchQuery implements FullTextQuery {
|
||||
/**
|
||||
* Creates an instance of MultiMatchQuery.
|
||||
*
|
||||
* @param query - The text query to search for across multiple columns.
|
||||
* @param columns - An array of column names to search within.
|
||||
* @param boosts - (Optional) An array of boost factors corresponding to each column. Default is an array of 1.0 for each column.
|
||||
*
|
||||
* The `boosts` array should have the same length as `columns`. If not provided, all columns will have a default boost of 1.0.
|
||||
* If the length of `boosts` is less than `columns`, it will be padded with 1.0s.
|
||||
*/
|
||||
constructor(
|
||||
private query: string,
|
||||
private columns: string[],
|
||||
private boosts: number[] = columns.map(() => 1.0),
|
||||
) {}
|
||||
|
||||
queryType(): FullTextQueryType {
|
||||
return FullTextQueryType.MultiMatch;
|
||||
}
|
||||
|
||||
toDict(): Record<string, unknown> {
|
||||
return {
|
||||
[this.queryType()]: {
|
||||
query: this.query,
|
||||
columns: this.columns,
|
||||
boost: this.boosts,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
252
nodejs/package-lock.json
generated
252
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -2304,89 +2304,20 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame": {
|
||||
"version": "7.23.5",
|
||||
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.23.5.tgz",
|
||||
"integrity": "sha512-CgH3s1a96LipHCmSUmYFPwY7MNx8C3avkq7i4Wl3cfa662ldtUe4VM1TPXX70pfmrlWTb6jLqTYrZyT2ZTJBgA==",
|
||||
"version": "7.26.2",
|
||||
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.26.2.tgz",
|
||||
"integrity": "sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/highlight": "^7.23.4",
|
||||
"chalk": "^2.4.2"
|
||||
"@babel/helper-validator-identifier": "^7.25.9",
|
||||
"js-tokens": "^4.0.0",
|
||||
"picocolors": "^1.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame/node_modules/ansi-styles": {
|
||||
"version": "3.2.1",
|
||||
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz",
|
||||
"integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"color-convert": "^1.9.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame/node_modules/chalk": {
|
||||
"version": "2.4.2",
|
||||
"resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz",
|
||||
"integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"ansi-styles": "^3.2.1",
|
||||
"escape-string-regexp": "^1.0.5",
|
||||
"supports-color": "^5.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame/node_modules/color-convert": {
|
||||
"version": "1.9.3",
|
||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
|
||||
"integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"color-name": "1.1.3"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame/node_modules/color-name": {
|
||||
"version": "1.1.3",
|
||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
|
||||
"integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@babel/code-frame/node_modules/escape-string-regexp": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz",
|
||||
"integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">=0.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame/node_modules/has-flag": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz",
|
||||
"integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame/node_modules/supports-color": {
|
||||
"version": "5.5.0",
|
||||
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz",
|
||||
"integrity": "sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"has-flag": "^3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/compat-data": {
|
||||
"version": "7.23.5",
|
||||
"resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.23.5.tgz",
|
||||
@@ -2589,19 +2520,21 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/helper-string-parser": {
|
||||
"version": "7.23.4",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.23.4.tgz",
|
||||
"integrity": "sha512-803gmbQdqwdf4olxrX4AJyFBV/RTr3rSmOj0rKwesmzlfhYNDEs+/iOcznzpNWlJlIlTJC2QfPFcHB6DlzdVLQ==",
|
||||
"version": "7.25.9",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz",
|
||||
"integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/helper-validator-identifier": {
|
||||
"version": "7.22.20",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz",
|
||||
"integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==",
|
||||
"version": "7.25.9",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz",
|
||||
"integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
@@ -2616,109 +2549,28 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/helpers": {
|
||||
"version": "7.23.8",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.23.8.tgz",
|
||||
"integrity": "sha512-KDqYz4PiOWvDFrdHLPhKtCThtIcKVy6avWD2oG4GEvyQ+XDZwHD4YQd+H2vNMnq2rkdxsDkU82T+Vk8U/WXHRQ==",
|
||||
"version": "7.27.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.0.tgz",
|
||||
"integrity": "sha512-U5eyP/CTFPuNE3qk+WZMxFkp/4zUzdceQlfzf7DdGdhp+Fezd7HD+i8Y24ZuTMKX3wQBld449jijbGq6OdGNQg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/template": "^7.22.15",
|
||||
"@babel/traverse": "^7.23.7",
|
||||
"@babel/types": "^7.23.6"
|
||||
"@babel/template": "^7.27.0",
|
||||
"@babel/types": "^7.27.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/highlight": {
|
||||
"version": "7.23.4",
|
||||
"resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.23.4.tgz",
|
||||
"integrity": "sha512-acGdbYSfp2WheJoJm/EBBBLh/ID8KDc64ISZ9DYtBmC8/Q204PZJLHyzeB5qMzJ5trcOkybd78M4x2KWsUq++A==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"@babel/helper-validator-identifier": "^7.22.20",
|
||||
"chalk": "^2.4.2",
|
||||
"js-tokens": "^4.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/highlight/node_modules/ansi-styles": {
|
||||
"version": "3.2.1",
|
||||
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz",
|
||||
"integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"color-convert": "^1.9.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/highlight/node_modules/chalk": {
|
||||
"version": "2.4.2",
|
||||
"resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz",
|
||||
"integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"ansi-styles": "^3.2.1",
|
||||
"escape-string-regexp": "^1.0.5",
|
||||
"supports-color": "^5.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/highlight/node_modules/color-convert": {
|
||||
"version": "1.9.3",
|
||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
|
||||
"integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"color-name": "1.1.3"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/highlight/node_modules/color-name": {
|
||||
"version": "1.1.3",
|
||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
|
||||
"integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@babel/highlight/node_modules/escape-string-regexp": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz",
|
||||
"integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">=0.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/highlight/node_modules/has-flag": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz",
|
||||
"integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/highlight/node_modules/supports-color": {
|
||||
"version": "5.5.0",
|
||||
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz",
|
||||
"integrity": "sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"has-flag": "^3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/parser": {
|
||||
"version": "7.23.6",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.23.6.tgz",
|
||||
"integrity": "sha512-Z2uID7YJ7oNvAI20O9X0bblw7Qqs8Q2hFy0R9tAfnfLkp5MW0UH9eUvnDSnFwKZ0AvgS1ucqR4KzvVHgnke1VQ==",
|
||||
"version": "7.27.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.0.tgz",
|
||||
"integrity": "sha512-iaepho73/2Pz7w2eMS0Q5f83+0RKI7i4xmiYeBmDzfRVbQtTOG7Ts0S4HzJVsTMGI9keU8rNfuZr8DKfSt7Yyg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/types": "^7.27.0"
|
||||
},
|
||||
"bin": {
|
||||
"parser": "bin/babel-parser.js"
|
||||
},
|
||||
@@ -2904,14 +2756,15 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/template": {
|
||||
"version": "7.22.15",
|
||||
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.22.15.tgz",
|
||||
"integrity": "sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w==",
|
||||
"version": "7.27.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.27.0.tgz",
|
||||
"integrity": "sha512-2ncevenBqXI6qRMukPlXwHKHchC7RyMuu4xv5JBXRfOGVcTy1mXCD12qrp7Jsoxll1EV3+9sE4GugBVRjT2jFA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/code-frame": "^7.22.13",
|
||||
"@babel/parser": "^7.22.15",
|
||||
"@babel/types": "^7.22.15"
|
||||
"@babel/code-frame": "^7.26.2",
|
||||
"@babel/parser": "^7.27.0",
|
||||
"@babel/types": "^7.27.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
@@ -2948,14 +2801,14 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/types": {
|
||||
"version": "7.23.6",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.23.6.tgz",
|
||||
"integrity": "sha512-+uarb83brBzPKN38NX1MkB6vb6+mwvR6amUulqAE7ccQw1pEl+bCia9TbdG1lsnFP7lZySvUn37CHyXQdfTwzg==",
|
||||
"version": "7.27.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.0.tgz",
|
||||
"integrity": "sha512-H45s8fVLYjbhFH62dIJ3WtmJ6RSPt/3DRO0ZcT2SUiYiQyz3BLVb9ADEnLl91m74aQPS3AzzeajZHYOalWe3bg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/helper-string-parser": "^7.23.4",
|
||||
"@babel/helper-validator-identifier": "^7.22.20",
|
||||
"to-fast-properties": "^2.0.0"
|
||||
"@babel/helper-string-parser": "^7.25.9",
|
||||
"@babel/helper-validator-identifier": "^7.25.9"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
@@ -5550,10 +5403,11 @@
|
||||
"devOptional": true
|
||||
},
|
||||
"node_modules/axios": {
|
||||
"version": "1.7.7",
|
||||
"resolved": "https://registry.npmjs.org/axios/-/axios-1.7.7.tgz",
|
||||
"integrity": "sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q==",
|
||||
"version": "1.8.4",
|
||||
"resolved": "https://registry.npmjs.org/axios/-/axios-1.8.4.tgz",
|
||||
"integrity": "sha512-eBSYY4Y68NNlHbHBMdeDmKNtDgXWhQsJcGqzO3iLUM0GraQFSS9cVgPX5I9b3lbdFKyYoAEGAZF1DwhTaljNAw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"follow-redirects": "^1.15.6",
|
||||
"form-data": "^4.0.0",
|
||||
@@ -7869,7 +7723,8 @@
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
|
||||
"integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
|
||||
"dev": true
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/js-yaml": {
|
||||
"version": "3.14.1",
|
||||
@@ -9360,15 +9215,6 @@
|
||||
"integrity": "sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/to-fast-properties": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz",
|
||||
"integrity": "sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/to-regex-range": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.18.3-beta.0",
|
||||
"version": "0.19.0-beta.5",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
@@ -29,6 +29,7 @@
|
||||
"aarch64-apple-darwin",
|
||||
"x86_64-unknown-linux-gnu",
|
||||
"aarch64-unknown-linux-gnu",
|
||||
"x86_64-unknown-linux-musl",
|
||||
"aarch64-unknown-linux-musl",
|
||||
"x86_64-pc-windows-msvc",
|
||||
"aarch64-pc-windows-msvc"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use lancedb::index::scalar::FullTextSearchQuery;
|
||||
use lancedb::index::scalar::{FtsQuery, FullTextSearchQuery, MatchQuery, PhraseQuery};
|
||||
use lancedb::query::ExecutableQuery;
|
||||
use lancedb::query::Query as LanceDbQuery;
|
||||
use lancedb::query::QueryBase;
|
||||
@@ -18,7 +18,7 @@ use crate::error::NapiErrorExt;
|
||||
use crate::iterator::RecordBatchIterator;
|
||||
use crate::rerankers::Reranker;
|
||||
use crate::rerankers::RerankerCallbacks;
|
||||
use crate::util::parse_distance_type;
|
||||
use crate::util::{parse_distance_type, parse_fts_query};
|
||||
|
||||
#[napi]
|
||||
pub struct Query {
|
||||
@@ -38,9 +38,53 @@ impl Query {
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn full_text_search(&mut self, query: String, columns: Option<Vec<String>>) {
|
||||
let query = FullTextSearchQuery::new(query).columns(columns);
|
||||
pub fn full_text_search(&mut self, query: napi::JsUnknown) -> napi::Result<()> {
|
||||
let query = unsafe { query.cast::<napi::JsObject>() };
|
||||
let query = if let Some(query_text) = query.get::<_, String>("query").transpose() {
|
||||
let mut query_text = query_text?;
|
||||
let columns = query.get::<_, Option<Vec<String>>>("columns")?.flatten();
|
||||
|
||||
let is_phrase =
|
||||
query_text.len() >= 2 && query_text.starts_with('"') && query_text.ends_with('"');
|
||||
let is_multi_match = columns.as_ref().map(|cols| cols.len() > 1).unwrap_or(false);
|
||||
|
||||
if is_phrase {
|
||||
// Remove the surrounding quotes for phrase queries
|
||||
query_text = query_text[1..query_text.len() - 1].to_string();
|
||||
}
|
||||
|
||||
let query: FtsQuery = match (is_phrase, is_multi_match) {
|
||||
(false, _) => MatchQuery::new(query_text).into(),
|
||||
(true, false) => PhraseQuery::new(query_text).into(),
|
||||
(true, true) => {
|
||||
return Err(napi::Error::from_reason(
|
||||
"Phrase queries cannot be used with multiple columns.",
|
||||
));
|
||||
}
|
||||
};
|
||||
let mut query = FullTextSearchQuery::new_query(query);
|
||||
if let Some(cols) = columns {
|
||||
if !cols.is_empty() {
|
||||
query = query.with_columns(&cols).map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to set full text search columns: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
query
|
||||
} else if let Some(query) = query.get::<_, napi::JsObject>("query")? {
|
||||
let query = parse_fts_query(&query)?;
|
||||
FullTextSearchQuery::new_query(query)
|
||||
} else {
|
||||
return Err(napi::Error::from_reason(
|
||||
"Invalid full text search query object".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
self.inner = self.inner.clone().full_text_search(query);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[napi]
|
||||
@@ -87,11 +131,15 @@ impl Query {
|
||||
pub async fn execute(
|
||||
&self,
|
||||
max_batch_length: Option<u32>,
|
||||
timeout_ms: Option<u32>,
|
||||
) -> napi::Result<RecordBatchIterator> {
|
||||
let mut execution_opts = QueryExecutionOptions::default();
|
||||
if let Some(max_batch_length) = max_batch_length {
|
||||
execution_opts.max_batch_length = max_batch_length;
|
||||
}
|
||||
if let Some(timeout_ms) = timeout_ms {
|
||||
execution_opts.timeout = Some(std::time::Duration::from_millis(timeout_ms as u64))
|
||||
}
|
||||
let inner_stream = self
|
||||
.inner
|
||||
.execute_with_options(execution_opts)
|
||||
@@ -195,9 +243,53 @@ impl VectorQuery {
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn full_text_search(&mut self, query: String, columns: Option<Vec<String>>) {
|
||||
let query = FullTextSearchQuery::new(query).columns(columns);
|
||||
pub fn full_text_search(&mut self, query: napi::JsUnknown) -> napi::Result<()> {
|
||||
let query = unsafe { query.cast::<napi::JsObject>() };
|
||||
let query = if let Some(query_text) = query.get::<_, String>("query").transpose() {
|
||||
let mut query_text = query_text?;
|
||||
let columns = query.get::<_, Option<Vec<String>>>("columns")?.flatten();
|
||||
|
||||
let is_phrase =
|
||||
query_text.len() >= 2 && query_text.starts_with('"') && query_text.ends_with('"');
|
||||
let is_multi_match = columns.as_ref().map(|cols| cols.len() > 1).unwrap_or(false);
|
||||
|
||||
if is_phrase {
|
||||
// Remove the surrounding quotes for phrase queries
|
||||
query_text = query_text[1..query_text.len() - 1].to_string();
|
||||
}
|
||||
|
||||
let query: FtsQuery = match (is_phrase, is_multi_match) {
|
||||
(false, _) => MatchQuery::new(query_text).into(),
|
||||
(true, false) => PhraseQuery::new(query_text).into(),
|
||||
(true, true) => {
|
||||
return Err(napi::Error::from_reason(
|
||||
"Phrase queries cannot be used with multiple columns.",
|
||||
));
|
||||
}
|
||||
};
|
||||
let mut query = FullTextSearchQuery::new_query(query);
|
||||
if let Some(cols) = columns {
|
||||
if !cols.is_empty() {
|
||||
query = query.with_columns(&cols).map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to set full text search columns: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
query
|
||||
} else if let Some(query) = query.get::<_, napi::JsObject>("query")? {
|
||||
let query = parse_fts_query(&query)?;
|
||||
FullTextSearchQuery::new_query(query)
|
||||
} else {
|
||||
return Err(napi::Error::from_reason(
|
||||
"Invalid full text search query object".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
self.inner = self.inner.clone().full_text_search(query);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[napi]
|
||||
@@ -242,11 +334,15 @@ impl VectorQuery {
|
||||
pub async fn execute(
|
||||
&self,
|
||||
max_batch_length: Option<u32>,
|
||||
timeout_ms: Option<u32>,
|
||||
) -> napi::Result<RecordBatchIterator> {
|
||||
let mut execution_opts = QueryExecutionOptions::default();
|
||||
if let Some(max_batch_length) = max_batch_length {
|
||||
execution_opts.max_batch_length = max_batch_length;
|
||||
}
|
||||
if let Some(timeout_ms) = timeout_ms {
|
||||
execution_opts.timeout = Some(std::time::Duration::from_millis(timeout_ms as u64))
|
||||
}
|
||||
let inner_stream = self
|
||||
.inner
|
||||
.execute_with_options(execution_opts)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use lancedb::index::scalar::{BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, PhraseQuery};
|
||||
use lancedb::DistanceType;
|
||||
|
||||
pub fn parse_distance_type(distance_type: impl AsRef<str>) -> napi::Result<DistanceType> {
|
||||
@@ -15,3 +16,144 @@ pub fn parse_distance_type(distance_type: impl AsRef<str>) -> napi::Result<Dista
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_fts_query(query: &napi::JsObject) -> napi::Result<FtsQuery> {
|
||||
let query_type = query
|
||||
.get_property_names()?
|
||||
.get_element::<napi::JsString>(0)?;
|
||||
let query_type = query_type.into_utf8()?.into_owned()?;
|
||||
let query_value =
|
||||
query
|
||||
.get::<_, napi::JsObject>(&query_type)?
|
||||
.ok_or(napi::Error::from_reason(format!(
|
||||
"query value {} not found",
|
||||
query_type
|
||||
)))?;
|
||||
|
||||
match query_type.as_str() {
|
||||
"match" => {
|
||||
let column = query_value
|
||||
.get_property_names()?
|
||||
.get_element::<napi::JsString>(0)?
|
||||
.into_utf8()?
|
||||
.into_owned()?;
|
||||
let params =
|
||||
query_value
|
||||
.get::<_, napi::JsObject>(&column)?
|
||||
.ok_or(napi::Error::from_reason(format!(
|
||||
"column {} not found",
|
||||
column
|
||||
)))?;
|
||||
|
||||
let query = params
|
||||
.get::<_, napi::JsString>("query")?
|
||||
.ok_or(napi::Error::from_reason("query not found"))?
|
||||
.into_utf8()?
|
||||
.into_owned()?;
|
||||
let boost = params
|
||||
.get::<_, napi::JsNumber>("boost")?
|
||||
.ok_or(napi::Error::from_reason("boost not found"))?
|
||||
.get_double()? as f32;
|
||||
let fuzziness = params
|
||||
.get::<_, napi::JsNumber>("fuzziness")?
|
||||
.map(|f| f.get_uint32())
|
||||
.transpose()?;
|
||||
let max_expansions = params
|
||||
.get::<_, napi::JsNumber>("max_expansions")?
|
||||
.ok_or(napi::Error::from_reason("max_expansions not found"))?
|
||||
.get_uint32()? as usize;
|
||||
|
||||
let query = MatchQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_boost(boost)
|
||||
.with_fuzziness(fuzziness)
|
||||
.with_max_expansions(max_expansions);
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"match_phrase" => {
|
||||
let column = query_value
|
||||
.get_property_names()?
|
||||
.get_element::<napi::JsString>(0)?
|
||||
.into_utf8()?
|
||||
.into_owned()?;
|
||||
let query = query_value
|
||||
.get::<_, napi::JsString>(&column)?
|
||||
.ok_or(napi::Error::from_reason(format!(
|
||||
"column {} not found",
|
||||
column
|
||||
)))?
|
||||
.into_utf8()?
|
||||
.into_owned()?;
|
||||
|
||||
let query = PhraseQuery::new(query).with_column(Some(column));
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"boost" => {
|
||||
let positive = query_value
|
||||
.get::<_, napi::JsObject>("positive")?
|
||||
.ok_or(napi::Error::from_reason("positive not found"))?;
|
||||
|
||||
let negative = query_value
|
||||
.get::<_, napi::JsObject>("negative")?
|
||||
.ok_or(napi::Error::from_reason("negative not found"))?;
|
||||
let negative_boost = query_value
|
||||
.get::<_, napi::JsNumber>("negative_boost")?
|
||||
.ok_or(napi::Error::from_reason("negative_boost not found"))?
|
||||
.get_double()? as f32;
|
||||
|
||||
let positive = parse_fts_query(&positive)?;
|
||||
let negative = parse_fts_query(&negative)?;
|
||||
let query = BoostQuery::new(positive, negative, Some(negative_boost));
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"multi_match" => {
|
||||
let query = query_value
|
||||
.get::<_, napi::JsString>("query")?
|
||||
.ok_or(napi::Error::from_reason("query not found"))?
|
||||
.into_utf8()?
|
||||
.into_owned()?;
|
||||
let columns_array = query_value
|
||||
.get::<_, napi::JsTypedArray>("columns")?
|
||||
.ok_or(napi::Error::from_reason("columns not found"))?;
|
||||
let columns_num = columns_array.get_array_length()?;
|
||||
let mut columns = Vec::with_capacity(columns_num as usize);
|
||||
for i in 0..columns_num {
|
||||
let column = columns_array
|
||||
.get_element::<napi::JsString>(i)?
|
||||
.into_utf8()?
|
||||
.into_owned()?;
|
||||
columns.push(column);
|
||||
}
|
||||
let boost_array = query_value
|
||||
.get::<_, napi::JsTypedArray>("boost")?
|
||||
.ok_or(napi::Error::from_reason("boost not found"))?;
|
||||
if boost_array.get_array_length()? != columns_num {
|
||||
return Err(napi::Error::from_reason(format!(
|
||||
"boost array length ({}) does not match columns length ({})",
|
||||
boost_array.get_array_length()?,
|
||||
columns_num
|
||||
)));
|
||||
}
|
||||
let mut boost = Vec::with_capacity(columns_num as usize);
|
||||
for i in 0..columns_num {
|
||||
let b = boost_array.get_element::<napi::JsNumber>(i)?.get_double()? as f32;
|
||||
boost.push(b);
|
||||
}
|
||||
|
||||
let query =
|
||||
MultiMatchQuery::try_new_with_boosts(query, columns, boost).map_err(|e| {
|
||||
napi::Error::from_reason(format!("Error creating MultiMatchQuery: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
_ => Err(napi::Error::from_reason(format!(
|
||||
"Unsupported query type: {}",
|
||||
query_type
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.22.0-beta.0"
|
||||
current_version = "0.22.0-beta.6"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.22.0-beta.0"
|
||||
version = "0.22.0-beta.6"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -4,11 +4,12 @@ name = "lancedb"
|
||||
dynamic = ["version"]
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"tqdm>=4.27.0",
|
||||
"numpy",
|
||||
"overrides>=0.7",
|
||||
"packaging",
|
||||
"pyarrow>=14",
|
||||
"pydantic>=1.10",
|
||||
"packaging",
|
||||
"overrides>=0.7",
|
||||
"tqdm>=4.27.0",
|
||||
]
|
||||
description = "lancedb"
|
||||
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
||||
@@ -55,6 +56,7 @@ tests = [
|
||||
"tantivy",
|
||||
"pyarrow-stubs",
|
||||
"pylance>=0.23.2",
|
||||
"requests",
|
||||
]
|
||||
dev = [
|
||||
"ruff",
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from datetime import timedelta
|
||||
from typing import Dict, List, Optional, Tuple, Any, Union, Literal
|
||||
|
||||
import pyarrow as pa
|
||||
@@ -94,7 +95,9 @@ class Query:
|
||||
def postfilter(self): ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
|
||||
def nearest_to_text(self, query: dict) -> FTSQuery: ...
|
||||
async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
|
||||
async def execute(
|
||||
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
|
||||
) -> RecordBatchStream: ...
|
||||
async def explain_plan(self, verbose: Optional[bool]) -> str: ...
|
||||
async def analyze_plan(self) -> str: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
@@ -110,7 +113,9 @@ class FTSQuery:
|
||||
def get_query(self) -> str: ...
|
||||
def add_query_vector(self, query_vec: pa.Array) -> None: ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
|
||||
async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
|
||||
async def execute(
|
||||
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
|
||||
) -> RecordBatchStream: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
|
||||
class VectorQuery:
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
|
||||
import base64
|
||||
import os
|
||||
from typing import ClassVar, TYPE_CHECKING, List, Union
|
||||
from typing import ClassVar, TYPE_CHECKING, List, Union, Any
|
||||
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from io import BytesIO
|
||||
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
@@ -11,12 +14,100 @@ import pyarrow as pa
|
||||
from ..util import attempt_import_or_raise
|
||||
from .base import EmbeddingFunction
|
||||
from .registry import register
|
||||
from .utils import api_key_not_found_help, IMAGES
|
||||
from .utils import api_key_not_found_help, IMAGES, TEXT
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import PIL
|
||||
|
||||
|
||||
def is_valid_url(text):
|
||||
try:
|
||||
parsed = urlparse(text)
|
||||
return bool(parsed.scheme) and bool(parsed.netloc)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def transform_input(input_data: Union[str, bytes, Path]):
|
||||
PIL = attempt_import_or_raise("PIL", "pillow")
|
||||
if isinstance(input_data, str):
|
||||
if is_valid_url(input_data):
|
||||
content = {"type": "image_url", "image_url": input_data}
|
||||
else:
|
||||
content = {"type": "text", "text": input_data}
|
||||
elif isinstance(input_data, PIL.Image.Image):
|
||||
buffered = BytesIO()
|
||||
input_data.save(buffered, format="JPEG")
|
||||
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
content = {
|
||||
"type": "image_base64",
|
||||
"image_base64": "data:image/jpeg;base64," + img_str,
|
||||
}
|
||||
elif isinstance(input_data, bytes):
|
||||
img = PIL.Image.open(BytesIO(input_data))
|
||||
buffered = BytesIO()
|
||||
img.save(buffered, format="JPEG")
|
||||
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
content = {
|
||||
"type": "image_base64",
|
||||
"image_base64": "data:image/jpeg;base64," + img_str,
|
||||
}
|
||||
elif isinstance(input_data, Path):
|
||||
img = PIL.Image.open(input_data)
|
||||
buffered = BytesIO()
|
||||
img.save(buffered, format="JPEG")
|
||||
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
content = {
|
||||
"type": "image_base64",
|
||||
"image_base64": "data:image/jpeg;base64," + img_str,
|
||||
}
|
||||
else:
|
||||
raise ValueError("Each input should be either str, bytes, Path or Image.")
|
||||
|
||||
return {"content": [content]}
|
||||
|
||||
|
||||
def sanitize_multimodal_input(inputs: Union[TEXT, IMAGES]) -> List[Any]:
|
||||
"""
|
||||
Sanitize the input to the embedding function.
|
||||
"""
|
||||
PIL = attempt_import_or_raise("PIL", "pillow")
|
||||
if isinstance(inputs, (str, bytes, Path, PIL.Image.Image)):
|
||||
inputs = [inputs]
|
||||
elif isinstance(inputs, pa.Array):
|
||||
inputs = inputs.to_pylist()
|
||||
elif isinstance(inputs, pa.ChunkedArray):
|
||||
inputs = inputs.combine_chunks().to_pylist()
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Input type {type(inputs)} not allowed with multimodal model."
|
||||
)
|
||||
|
||||
if not all(isinstance(x, (str, bytes, Path, PIL.Image.Image)) for x in inputs):
|
||||
raise ValueError("Each input should be either str, bytes, Path or Image.")
|
||||
|
||||
return [transform_input(i) for i in inputs]
|
||||
|
||||
|
||||
def sanitize_text_input(inputs: TEXT) -> List[str]:
|
||||
"""
|
||||
Sanitize the input to the embedding function.
|
||||
"""
|
||||
if isinstance(inputs, str):
|
||||
inputs = [inputs]
|
||||
elif isinstance(inputs, pa.Array):
|
||||
inputs = inputs.to_pylist()
|
||||
elif isinstance(inputs, pa.ChunkedArray):
|
||||
inputs = inputs.combine_chunks().to_pylist()
|
||||
else:
|
||||
raise ValueError(f"Input type {type(inputs)} not allowed with text model.")
|
||||
|
||||
if not all(isinstance(x, str) for x in inputs):
|
||||
raise ValueError("Each input should be str.")
|
||||
|
||||
return inputs
|
||||
|
||||
|
||||
@register("voyageai")
|
||||
class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
"""
|
||||
@@ -74,6 +165,11 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
]
|
||||
multimodal_embedding_models: list = ["voyage-multimodal-3"]
|
||||
|
||||
def _is_multimodal_model(self, model_name: str):
|
||||
return (
|
||||
model_name in self.multimodal_embedding_models or "multimodal" in model_name
|
||||
)
|
||||
|
||||
def ndims(self):
|
||||
if self.name == "voyage-3-lite":
|
||||
return 512
|
||||
@@ -85,55 +181,12 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
"voyage-finance-2",
|
||||
"voyage-multilingual-2",
|
||||
"voyage-law-2",
|
||||
"voyage-multimodal-3",
|
||||
]:
|
||||
return 1024
|
||||
else:
|
||||
raise ValueError(f"Model {self.name} not supported")
|
||||
|
||||
def sanitize_input(self, images: IMAGES) -> Union[List[bytes], np.ndarray]:
|
||||
"""
|
||||
Sanitize the input to the embedding function.
|
||||
"""
|
||||
if isinstance(images, (str, bytes)):
|
||||
images = [images]
|
||||
elif isinstance(images, pa.Array):
|
||||
images = images.to_pylist()
|
||||
elif isinstance(images, pa.ChunkedArray):
|
||||
images = images.combine_chunks().to_pylist()
|
||||
return images
|
||||
|
||||
def generate_text_embeddings(self, text: str, **kwargs) -> np.ndarray:
|
||||
"""
|
||||
Get the embeddings for the given texts
|
||||
|
||||
Parameters
|
||||
----------
|
||||
texts: list[str] or np.ndarray (of str)
|
||||
The texts to embed
|
||||
input_type: Optional[str]
|
||||
|
||||
truncation: Optional[bool]
|
||||
"""
|
||||
client = VoyageAIEmbeddingFunction._get_client()
|
||||
if self.name in self.text_embedding_models:
|
||||
rs = client.embed(texts=[text], model=self.name, **kwargs)
|
||||
elif self.name in self.multimodal_embedding_models:
|
||||
rs = client.multimodal_embed(inputs=[[text]], model=self.name, **kwargs)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Model {self.name} not supported to generate text embeddings"
|
||||
)
|
||||
|
||||
return rs.embeddings[0]
|
||||
|
||||
def generate_image_embedding(
|
||||
self, image: "PIL.Image.Image", **kwargs
|
||||
) -> np.ndarray:
|
||||
rs = VoyageAIEmbeddingFunction._get_client().multimodal_embed(
|
||||
inputs=[[image]], model=self.name, **kwargs
|
||||
)
|
||||
return rs.embeddings[0]
|
||||
|
||||
def compute_query_embeddings(
|
||||
self, query: Union[str, "PIL.Image.Image"], *args, **kwargs
|
||||
) -> List[np.ndarray]:
|
||||
@@ -144,23 +197,52 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
----------
|
||||
query : Union[str, PIL.Image.Image]
|
||||
The query to embed. A query can be either text or an image.
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[np.array]: the list of embeddings
|
||||
"""
|
||||
if isinstance(query, str):
|
||||
return [self.generate_text_embeddings(query, input_type="query")]
|
||||
client = VoyageAIEmbeddingFunction._get_client()
|
||||
if self._is_multimodal_model(self.name):
|
||||
result = client.multimodal_embed(
|
||||
inputs=[[query]], model=self.name, input_type="query", **kwargs
|
||||
)
|
||||
else:
|
||||
PIL = attempt_import_or_raise("PIL", "pillow")
|
||||
if isinstance(query, PIL.Image.Image):
|
||||
return [self.generate_image_embedding(query, input_type="query")]
|
||||
else:
|
||||
raise TypeError("Only text PIL images supported as query")
|
||||
result = client.embed(
|
||||
texts=[query], model=self.name, input_type="query", **kwargs
|
||||
)
|
||||
|
||||
return [result.embeddings[0]]
|
||||
|
||||
def compute_source_embeddings(
|
||||
self, images: IMAGES, *args, **kwargs
|
||||
self, inputs: Union[TEXT, IMAGES], *args, **kwargs
|
||||
) -> List[np.array]:
|
||||
images = self.sanitize_input(images)
|
||||
return [
|
||||
self.generate_image_embedding(img, input_type="document") for img in images
|
||||
]
|
||||
"""
|
||||
Compute the embeddings for the inputs
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inputs : Union[TEXT, IMAGES]
|
||||
The inputs to embed. The input can be either str, bytes, Path (to an image),
|
||||
PIL.Image or list of these.
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[np.array]: the list of embeddings
|
||||
"""
|
||||
client = VoyageAIEmbeddingFunction._get_client()
|
||||
if self._is_multimodal_model(self.name):
|
||||
inputs = sanitize_multimodal_input(inputs)
|
||||
result = client.multimodal_embed(
|
||||
inputs=inputs, model=self.name, input_type="document", **kwargs
|
||||
)
|
||||
else:
|
||||
inputs = sanitize_text_input(inputs)
|
||||
result = client.embed(
|
||||
texts=inputs, model=self.name, input_type="document", **kwargs
|
||||
)
|
||||
|
||||
return result.embeddings
|
||||
|
||||
@staticmethod
|
||||
def _get_client():
|
||||
|
||||
@@ -4,7 +4,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import abc
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from enum import Enum
|
||||
from datetime import timedelta
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Dict,
|
||||
@@ -83,6 +86,213 @@ def ensure_vector_query(
|
||||
return val
|
||||
|
||||
|
||||
class FullTextQueryType(Enum):
|
||||
MATCH = "match"
|
||||
MATCH_PHRASE = "match_phrase"
|
||||
BOOST = "boost"
|
||||
MULTI_MATCH = "multi_match"
|
||||
|
||||
|
||||
class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
||||
@abc.abstractmethod
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
"""
|
||||
Get the query type of the query.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
The type of the query.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def to_dict(self) -> dict:
|
||||
"""
|
||||
Convert the query to a dictionary.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
The query as a dictionary.
|
||||
"""
|
||||
|
||||
|
||||
class MatchQuery(FullTextQuery):
|
||||
query: str
|
||||
column: str
|
||||
boost: float = 1.0
|
||||
fuzziness: int = 0
|
||||
max_expansions: int = 50
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query: str,
|
||||
column: str,
|
||||
*,
|
||||
boost: float = 1.0,
|
||||
fuzziness: int = 0,
|
||||
max_expansions: int = 50,
|
||||
):
|
||||
"""
|
||||
Match query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
boost : float, default 1.0
|
||||
The boost factor for the query.
|
||||
The score of each matching document is multiplied by this value.
|
||||
fuzziness : int, optional
|
||||
The maximum edit distance for each term in the match query.
|
||||
Defaults to 0 (exact match).
|
||||
If None, fuzziness is applied automatically by the rules:
|
||||
- 0 for terms with length <= 2
|
||||
- 1 for terms with length <= 5
|
||||
- 2 for terms with length > 5
|
||||
max_expansions : int, optional
|
||||
The maximum number of terms to consider for fuzzy matching.
|
||||
Defaults to 50.
|
||||
"""
|
||||
super().__init__(
|
||||
query=query,
|
||||
column=column,
|
||||
boost=boost,
|
||||
fuzziness=fuzziness,
|
||||
max_expansions=max_expansions,
|
||||
)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MATCH
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"match": {
|
||||
self.column: {
|
||||
"query": self.query,
|
||||
"boost": self.boost,
|
||||
"fuzziness": self.fuzziness,
|
||||
"max_expansions": self.max_expansions,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class PhraseQuery(FullTextQuery):
|
||||
query: str
|
||||
column: str
|
||||
|
||||
def __init__(self, query: str, column: str):
|
||||
"""
|
||||
Phrase query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
"""
|
||||
super().__init__(query=query, column=column)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MATCH_PHRASE
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"match_phrase": {
|
||||
self.column: self.query,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class BoostQuery(FullTextQuery):
|
||||
positive: FullTextQuery
|
||||
negative: FullTextQuery
|
||||
negative_boost: float = 0.5
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
positive: FullTextQuery,
|
||||
negative: FullTextQuery,
|
||||
*,
|
||||
negative_boost: float = 0.5,
|
||||
):
|
||||
"""
|
||||
Boost query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
positive : dict
|
||||
The positive query object.
|
||||
negative : dict
|
||||
The negative query object.
|
||||
negative_boost : float
|
||||
The boost factor for the negative query.
|
||||
"""
|
||||
super().__init__(
|
||||
positive=positive, negative=negative, negative_boost=negative_boost
|
||||
)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.BOOST
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"boost": {
|
||||
"positive": self.positive.to_dict(),
|
||||
"negative": self.negative.to_dict(),
|
||||
"negative_boost": self.negative_boost,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MultiMatchQuery(FullTextQuery):
|
||||
query: str
|
||||
columns: list[str]
|
||||
boosts: list[float]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query: str,
|
||||
columns: list[str],
|
||||
*,
|
||||
boosts: Optional[list[float]] = None,
|
||||
):
|
||||
"""
|
||||
Multi-match query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str | list[Query]
|
||||
If a string, the query string to match against.
|
||||
|
||||
columns : list[str]
|
||||
The list of columns to match against.
|
||||
|
||||
boosts : list[float], optional
|
||||
The list of boost factors for each column. If not provided,
|
||||
all columns will have the same boost factor.
|
||||
"""
|
||||
if boosts is None:
|
||||
boosts = [1.0] * len(columns)
|
||||
super().__init__(query=query, columns=columns, boosts=boosts)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MULTI_MATCH
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"multi_match": {
|
||||
"query": self.query,
|
||||
"columns": self.columns,
|
||||
"boost": self.boosts,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class FullTextSearchQuery(pydantic.BaseModel):
|
||||
"""A LanceDB Full Text Search Query
|
||||
|
||||
@@ -92,18 +302,13 @@ class FullTextSearchQuery(pydantic.BaseModel):
|
||||
The columns to search
|
||||
|
||||
If None, then the table should select the column automatically.
|
||||
query: str
|
||||
The query to search for
|
||||
limit: Optional[int] = None
|
||||
The limit on the number of results to return
|
||||
wand_factor: Optional[float] = None
|
||||
The wand factor to use for the search
|
||||
query: str | FullTextQuery
|
||||
If a string, it is treated as a MatchQuery.
|
||||
If a FullTextQuery object, it is used directly.
|
||||
"""
|
||||
|
||||
columns: Optional[List[str]] = None
|
||||
query: str
|
||||
limit: Optional[int] = None
|
||||
wand_factor: Optional[float] = None
|
||||
query: Union[str, FullTextQuery]
|
||||
|
||||
|
||||
class Query(pydantic.BaseModel):
|
||||
@@ -357,7 +562,7 @@ class LanceQueryBuilder(ABC):
|
||||
table, query, vector_column_name, fts_columns=fts_columns
|
||||
)
|
||||
|
||||
if isinstance(query, str):
|
||||
if isinstance(query, (str, FullTextQuery)):
|
||||
# fts
|
||||
return LanceFtsQueryBuilder(
|
||||
table,
|
||||
@@ -382,8 +587,10 @@ class LanceQueryBuilder(ABC):
|
||||
# If query_type is fts, then query must be a string.
|
||||
# otherwise raise TypeError
|
||||
if query_type == "fts":
|
||||
if not isinstance(query, str):
|
||||
raise TypeError(f"'fts' queries must be a string: {type(query)}")
|
||||
if not isinstance(query, (str, FullTextQuery)):
|
||||
raise TypeError(
|
||||
f"'fts' query must be a string or FullTextQuery: {type(query)}"
|
||||
)
|
||||
return query, query_type
|
||||
elif query_type == "vector":
|
||||
query = cls._query_to_vector(table, query, vector_column_name)
|
||||
@@ -444,7 +651,12 @@ class LanceQueryBuilder(ABC):
|
||||
"""
|
||||
return self.to_pandas()
|
||||
|
||||
def to_pandas(self, flatten: Optional[Union[int, bool]] = None) -> "pd.DataFrame":
|
||||
def to_pandas(
|
||||
self,
|
||||
flatten: Optional[Union[int, bool]] = None,
|
||||
*,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> "pd.DataFrame":
|
||||
"""
|
||||
Execute the query and return the results as a pandas DataFrame.
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
@@ -458,12 +670,15 @@ class LanceQueryBuilder(ABC):
|
||||
If flatten is an integer, flatten the nested columns up to the
|
||||
specified depth.
|
||||
If unspecified, do not flatten the nested columns.
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
"""
|
||||
tbl = flatten_columns(self.to_arrow(), flatten)
|
||||
tbl = flatten_columns(self.to_arrow(timeout=timeout), flatten)
|
||||
return tbl.to_pandas()
|
||||
|
||||
@abstractmethod
|
||||
def to_arrow(self) -> pa.Table:
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
"""
|
||||
Execute the query and return the results as an
|
||||
[Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).
|
||||
@@ -471,34 +686,65 @@ class LanceQueryBuilder(ABC):
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vectors.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
|
||||
def to_batches(
|
||||
self,
|
||||
/,
|
||||
batch_size: Optional[int] = None,
|
||||
*,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
"""
|
||||
Execute the query and return the results as a pyarrow
|
||||
[RecordBatchReader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
batch_size: int
|
||||
The maximum number of selected records in a RecordBatch object.
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def to_list(self) -> List[dict]:
|
||||
def to_list(self, *, timeout: Optional[timedelta] = None) -> List[dict]:
|
||||
"""
|
||||
Execute the query and return the results as a list of dictionaries.
|
||||
|
||||
Each list entry is a dictionary with the selected column names as keys,
|
||||
or all table columns if `select` is not called. The vector and the "_distance"
|
||||
fields are returned whether or not they're explicitly selected.
|
||||
"""
|
||||
return self.to_arrow().to_pylist()
|
||||
|
||||
def to_pydantic(self, model: Type[LanceModel]) -> List[LanceModel]:
|
||||
Parameters
|
||||
----------
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
"""
|
||||
return self.to_arrow(timeout=timeout).to_pylist()
|
||||
|
||||
def to_pydantic(
|
||||
self, model: Type[LanceModel], *, timeout: Optional[timedelta] = None
|
||||
) -> List[LanceModel]:
|
||||
"""Return the table as a list of pydantic models.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model: Type[LanceModel]
|
||||
The pydantic model to use.
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -506,19 +752,25 @@ class LanceQueryBuilder(ABC):
|
||||
"""
|
||||
return [
|
||||
model(**{k: v for k, v in row.items() if k in model.field_names()})
|
||||
for row in self.to_arrow().to_pylist()
|
||||
for row in self.to_arrow(timeout=timeout).to_pylist()
|
||||
]
|
||||
|
||||
def to_polars(self) -> "pl.DataFrame":
|
||||
def to_polars(self, *, timeout: Optional[timedelta] = None) -> "pl.DataFrame":
|
||||
"""
|
||||
Execute the query and return the results as a Polars DataFrame.
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vector.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
"""
|
||||
import polars as pl
|
||||
|
||||
return pl.from_arrow(self.to_arrow())
|
||||
return pl.from_arrow(self.to_arrow(timeout=timeout))
|
||||
|
||||
def limit(self, limit: Union[int, None]) -> Self:
|
||||
"""Set the maximum number of results to return.
|
||||
@@ -712,13 +964,14 @@ class LanceQueryBuilder(ABC):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def text(self, text: str) -> Self:
|
||||
def text(self, text: str | FullTextQuery) -> Self:
|
||||
"""Set the text to search for.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text: str
|
||||
The text to search for.
|
||||
text: str | FullTextQuery
|
||||
If a string, it is treated as a MatchQuery.
|
||||
If a FullTextQuery object, it is used directly.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -932,7 +1185,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
self._refine_factor = refine_factor
|
||||
return self
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
"""
|
||||
Execute the query and return the results as an
|
||||
[Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).
|
||||
@@ -940,8 +1193,14 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vectors.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
"""
|
||||
return self.to_batches().read_all()
|
||||
return self.to_batches(timeout=timeout).read_all()
|
||||
|
||||
def to_query_object(self) -> Query:
|
||||
"""
|
||||
@@ -971,7 +1230,13 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
bypass_vector_index=self._bypass_vector_index,
|
||||
)
|
||||
|
||||
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
|
||||
def to_batches(
|
||||
self,
|
||||
/,
|
||||
batch_size: Optional[int] = None,
|
||||
*,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
"""
|
||||
Execute the query and return the result as a RecordBatchReader object.
|
||||
|
||||
@@ -979,6 +1244,9 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
----------
|
||||
batch_size: int
|
||||
The maximum number of selected records in a RecordBatch object.
|
||||
timeout: timedelta, default None
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -988,7 +1256,9 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
if isinstance(vector[0], np.ndarray):
|
||||
vector = [v.tolist() for v in vector]
|
||||
query = self.to_query_object()
|
||||
result_set = self._table._execute_query(query, batch_size)
|
||||
result_set = self._table._execute_query(
|
||||
query, batch_size=batch_size, timeout=timeout
|
||||
)
|
||||
if self._reranker is not None:
|
||||
rs_table = result_set.read_all()
|
||||
result_set = self._reranker.rerank_vector(self._str_query, rs_table)
|
||||
@@ -1084,7 +1354,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
def __init__(
|
||||
self,
|
||||
table: "Table",
|
||||
query: str,
|
||||
query: str | FullTextQuery,
|
||||
ordering_field_name: Optional[str] = None,
|
||||
fts_columns: Optional[Union[str, List[str]]] = None,
|
||||
):
|
||||
@@ -1127,7 +1397,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
offset=self._offset,
|
||||
)
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
path, fs, exist = self._table._get_fts_index_path()
|
||||
if exist:
|
||||
return self.tantivy_to_arrow()
|
||||
@@ -1139,14 +1409,16 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
"Use tantivy-based index instead for now."
|
||||
)
|
||||
query = self.to_query_object()
|
||||
results = self._table._execute_query(query)
|
||||
results = self._table._execute_query(query, timeout=timeout)
|
||||
results = results.read_all()
|
||||
if self._reranker is not None:
|
||||
results = self._reranker.rerank_fts(self._query, results)
|
||||
check_reranker_result(results)
|
||||
return results
|
||||
|
||||
def to_batches(self, /, batch_size: Optional[int] = None):
|
||||
def to_batches(
|
||||
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
|
||||
):
|
||||
raise NotImplementedError("to_batches on an FTS query")
|
||||
|
||||
def tantivy_to_arrow(self) -> pa.Table:
|
||||
@@ -1251,8 +1523,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
|
||||
|
||||
class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||
def to_arrow(self) -> pa.Table:
|
||||
return self.to_batches().read_all()
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
return self.to_batches(timeout=timeout).read_all()
|
||||
|
||||
def to_query_object(self) -> Query:
|
||||
return Query(
|
||||
@@ -1263,9 +1535,11 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||
offset=self._offset,
|
||||
)
|
||||
|
||||
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
|
||||
def to_batches(
|
||||
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
|
||||
) -> pa.RecordBatchReader:
|
||||
query = self.to_query_object()
|
||||
return self._table._execute_query(query, batch_size)
|
||||
return self._table._execute_query(query, batch_size=batch_size, timeout=timeout)
|
||||
|
||||
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
|
||||
"""Rerank the results using the specified reranker.
|
||||
@@ -1298,7 +1572,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
def __init__(
|
||||
self,
|
||||
table: "Table",
|
||||
query: Optional[str] = None,
|
||||
query: Optional[Union[str, FullTextQuery]] = None,
|
||||
vector_column: Optional[str] = None,
|
||||
fts_columns: Optional[Union[str, List[str]]] = None,
|
||||
):
|
||||
@@ -1328,8 +1602,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
text_query = text or query
|
||||
if text_query is None:
|
||||
raise ValueError("Text query must be provided for hybrid search.")
|
||||
if not isinstance(text_query, str):
|
||||
raise ValueError("Text query must be a string")
|
||||
if not isinstance(text_query, (str, FullTextQuery)):
|
||||
raise ValueError("Text query must be a string or FullTextQuery")
|
||||
|
||||
return vector_query, text_query
|
||||
|
||||
@@ -1353,7 +1627,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
def to_query_object(self) -> Query:
|
||||
raise NotImplementedError("to_query_object not yet supported on a hybrid query")
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
vector_query, fts_query = self._validate_query(
|
||||
self._query, self._vector, self._text
|
||||
)
|
||||
@@ -1396,9 +1670,11 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
self._reranker = RRFReranker()
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
fts_future = executor.submit(self._fts_query.with_row_id(True).to_arrow)
|
||||
fts_future = executor.submit(
|
||||
self._fts_query.with_row_id(True).to_arrow, timeout=timeout
|
||||
)
|
||||
vector_future = executor.submit(
|
||||
self._vector_query.with_row_id(True).to_arrow
|
||||
self._vector_query.with_row_id(True).to_arrow, timeout=timeout
|
||||
)
|
||||
fts_results = fts_future.result()
|
||||
vector_results = vector_future.result()
|
||||
@@ -1485,7 +1761,9 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
|
||||
return results
|
||||
|
||||
def to_batches(self):
|
||||
def to_batches(
|
||||
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
|
||||
):
|
||||
raise NotImplementedError("to_batches not yet supported on a hybrid query")
|
||||
|
||||
@staticmethod
|
||||
@@ -1691,7 +1969,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
self._vector = vector
|
||||
return self
|
||||
|
||||
def text(self, text: str) -> LanceHybridQueryBuilder:
|
||||
def text(self, text: str | FullTextQuery) -> LanceHybridQueryBuilder:
|
||||
self._text = text
|
||||
return self
|
||||
|
||||
@@ -1849,7 +2127,10 @@ class AsyncQueryBase(object):
|
||||
return self
|
||||
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
self,
|
||||
*,
|
||||
max_batch_length: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> AsyncRecordBatchReader:
|
||||
"""
|
||||
Execute the query and return the results as an Apache Arrow RecordBatchReader.
|
||||
@@ -1862,34 +2143,56 @@ class AsyncQueryBase(object):
|
||||
If not specified, a default batch length is used.
|
||||
It is possible for batches to be smaller than the provided length if the
|
||||
underlying data is stored in smaller chunks.
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If not specified, no timeout is applied. If the query does not
|
||||
complete within the specified time, an error will be raised.
|
||||
"""
|
||||
return AsyncRecordBatchReader(await self._inner.execute(max_batch_length))
|
||||
return AsyncRecordBatchReader(
|
||||
await self._inner.execute(max_batch_length, timeout)
|
||||
)
|
||||
|
||||
async def to_arrow(self) -> pa.Table:
|
||||
async def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
"""
|
||||
Execute the query and collect the results into an Apache Arrow Table.
|
||||
|
||||
This method will collect all results into memory before returning. If
|
||||
you expect a large number of results, you may want to use
|
||||
[to_batches][lancedb.query.AsyncQueryBase.to_batches]
|
||||
|
||||
Parameters
|
||||
----------
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If not specified, no timeout is applied. If the query does not
|
||||
complete within the specified time, an error will be raised.
|
||||
"""
|
||||
batch_iter = await self.to_batches()
|
||||
batch_iter = await self.to_batches(timeout=timeout)
|
||||
return pa.Table.from_batches(
|
||||
await batch_iter.read_all(), schema=batch_iter.schema
|
||||
)
|
||||
|
||||
async def to_list(self) -> List[dict]:
|
||||
async def to_list(self, timeout: Optional[timedelta] = None) -> List[dict]:
|
||||
"""
|
||||
Execute the query and return the results as a list of dictionaries.
|
||||
|
||||
Each list entry is a dictionary with the selected column names as keys,
|
||||
or all table columns if `select` is not called. The vector and the "_distance"
|
||||
fields are returned whether or not they're explicitly selected.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If not specified, no timeout is applied. If the query does not
|
||||
complete within the specified time, an error will be raised.
|
||||
"""
|
||||
return (await self.to_arrow()).to_pylist()
|
||||
return (await self.to_arrow(timeout=timeout)).to_pylist()
|
||||
|
||||
async def to_pandas(
|
||||
self, flatten: Optional[Union[int, bool]] = None
|
||||
self,
|
||||
flatten: Optional[Union[int, bool]] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> "pd.DataFrame":
|
||||
"""
|
||||
Execute the query and collect the results into a pandas DataFrame.
|
||||
@@ -1918,10 +2221,19 @@ class AsyncQueryBase(object):
|
||||
If flatten is an integer, flatten the nested columns up to the
|
||||
specified depth.
|
||||
If unspecified, do not flatten the nested columns.
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If not specified, no timeout is applied. If the query does not
|
||||
complete within the specified time, an error will be raised.
|
||||
"""
|
||||
return (flatten_columns(await self.to_arrow(), flatten)).to_pandas()
|
||||
return (
|
||||
flatten_columns(await self.to_arrow(timeout=timeout), flatten)
|
||||
).to_pandas()
|
||||
|
||||
async def to_polars(self) -> "pl.DataFrame":
|
||||
async def to_polars(
|
||||
self,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> "pl.DataFrame":
|
||||
"""
|
||||
Execute the query and collect the results into a Polars DataFrame.
|
||||
|
||||
@@ -1930,6 +2242,13 @@ class AsyncQueryBase(object):
|
||||
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
|
||||
polars separately.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If not specified, no timeout is applied. If the query does not
|
||||
complete within the specified time, an error will be raised.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
@@ -1945,7 +2264,7 @@ class AsyncQueryBase(object):
|
||||
"""
|
||||
import polars as pl
|
||||
|
||||
return pl.from_arrow(await self.to_arrow())
|
||||
return pl.from_arrow(await self.to_arrow(timeout=timeout))
|
||||
|
||||
async def explain_plan(self, verbose: Optional[bool] = False):
|
||||
"""Return the execution plan for this query.
|
||||
@@ -2088,7 +2407,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
)
|
||||
|
||||
def nearest_to_text(
|
||||
self, query: str, columns: Union[str, List[str], None] = None
|
||||
self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
|
||||
) -> AsyncFTSQuery:
|
||||
"""
|
||||
Find the documents that are most relevant to the given text query.
|
||||
@@ -2114,9 +2433,13 @@ class AsyncQuery(AsyncQueryBase):
|
||||
columns = [columns]
|
||||
if columns is None:
|
||||
columns = []
|
||||
return AsyncFTSQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
|
||||
if isinstance(query, str):
|
||||
return AsyncFTSQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
# FullTextQuery object
|
||||
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||
|
||||
|
||||
class AsyncFTSQuery(AsyncQueryBase):
|
||||
@@ -2212,9 +2535,12 @@ class AsyncFTSQuery(AsyncQueryBase):
|
||||
)
|
||||
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
self,
|
||||
*,
|
||||
max_batch_length: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> AsyncRecordBatchReader:
|
||||
reader = await super().to_batches()
|
||||
reader = await super().to_batches(timeout=timeout)
|
||||
results = pa.Table.from_batches(await reader.read_all(), reader.schema)
|
||||
if self._reranker:
|
||||
results = self._reranker.rerank_fts(self.get_query(), results)
|
||||
@@ -2399,7 +2725,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
return self
|
||||
|
||||
def nearest_to_text(
|
||||
self, query: str, columns: Union[str, List[str], None] = None
|
||||
self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
|
||||
) -> AsyncHybridQuery:
|
||||
"""
|
||||
Find the documents that are most relevant to the given text query,
|
||||
@@ -2429,14 +2755,21 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
columns = [columns]
|
||||
if columns is None:
|
||||
columns = []
|
||||
return AsyncHybridQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
|
||||
if isinstance(query, str):
|
||||
return AsyncHybridQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
# FullTextQuery object
|
||||
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
self,
|
||||
*,
|
||||
max_batch_length: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> AsyncRecordBatchReader:
|
||||
reader = await super().to_batches()
|
||||
reader = await super().to_batches(timeout=timeout)
|
||||
results = pa.Table.from_batches(await reader.read_all(), reader.schema)
|
||||
if self._reranker:
|
||||
results = self._reranker.rerank_vector(self._query_string, results)
|
||||
@@ -2492,7 +2825,10 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
return self
|
||||
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
self,
|
||||
*,
|
||||
max_batch_length: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> AsyncRecordBatchReader:
|
||||
fts_query = AsyncFTSQuery(self._inner.to_fts_query())
|
||||
vec_query = AsyncVectorQuery(self._inner.to_vector_query())
|
||||
@@ -2504,8 +2840,8 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
vec_query.with_row_id()
|
||||
|
||||
fts_results, vector_results = await asyncio.gather(
|
||||
fts_query.to_arrow(),
|
||||
vec_query.to_arrow(),
|
||||
fts_query.to_arrow(timeout=timeout),
|
||||
vec_query.to_arrow(timeout=timeout),
|
||||
)
|
||||
|
||||
result = LanceHybridQueryBuilder._combine_hybrid_results(
|
||||
|
||||
@@ -355,9 +355,15 @@ class RemoteTable(Table):
|
||||
)
|
||||
|
||||
def _execute_query(
|
||||
self, query: Query, batch_size: Optional[int] = None
|
||||
self,
|
||||
query: Query,
|
||||
*,
|
||||
batch_size: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
async_iter = LOOP.run(self._table._execute_query(query, batch_size=batch_size))
|
||||
async_iter = LOOP.run(
|
||||
self._table._execute_query(query, batch_size=batch_size, timeout=timeout)
|
||||
)
|
||||
|
||||
def iter_sync():
|
||||
try:
|
||||
|
||||
@@ -52,6 +52,7 @@ from .query import (
|
||||
AsyncHybridQuery,
|
||||
AsyncQuery,
|
||||
AsyncVectorQuery,
|
||||
FullTextQuery,
|
||||
LanceEmptyQueryBuilder,
|
||||
LanceFtsQueryBuilder,
|
||||
LanceHybridQueryBuilder,
|
||||
@@ -919,7 +920,9 @@ class Table(ABC):
|
||||
@abstractmethod
|
||||
def search(
|
||||
self,
|
||||
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
|
||||
query: Optional[
|
||||
Union[VEC, str, "PIL.Image.Image", Tuple, FullTextQuery]
|
||||
] = None,
|
||||
vector_column_name: Optional[str] = None,
|
||||
query_type: QueryType = "auto",
|
||||
ordering_field_name: Optional[str] = None,
|
||||
@@ -1004,7 +1007,11 @@ class Table(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def _execute_query(
|
||||
self, query: Query, batch_size: Optional[int] = None
|
||||
self,
|
||||
query: Query,
|
||||
*,
|
||||
batch_size: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader: ...
|
||||
|
||||
@abstractmethod
|
||||
@@ -2039,7 +2046,9 @@ class LanceTable(Table):
|
||||
@overload
|
||||
def search(
|
||||
self,
|
||||
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
|
||||
query: Optional[
|
||||
Union[VEC, str, "PIL.Image.Image", Tuple, FullTextQuery]
|
||||
] = None,
|
||||
vector_column_name: Optional[str] = None,
|
||||
query_type: Literal["hybrid"] = "hybrid",
|
||||
ordering_field_name: Optional[str] = None,
|
||||
@@ -2058,7 +2067,9 @@ class LanceTable(Table):
|
||||
|
||||
def search(
|
||||
self,
|
||||
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
|
||||
query: Optional[
|
||||
Union[VEC, str, "PIL.Image.Image", Tuple, FullTextQuery]
|
||||
] = None,
|
||||
vector_column_name: Optional[str] = None,
|
||||
query_type: QueryType = "auto",
|
||||
ordering_field_name: Optional[str] = None,
|
||||
@@ -2305,9 +2316,15 @@ class LanceTable(Table):
|
||||
LOOP.run(self._table.update(values, where=where, updates_sql=values_sql))
|
||||
|
||||
def _execute_query(
|
||||
self, query: Query, batch_size: Optional[int] = None
|
||||
self,
|
||||
query: Query,
|
||||
*,
|
||||
batch_size: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
async_iter = LOOP.run(self._table._execute_query(query, batch_size))
|
||||
async_iter = LOOP.run(
|
||||
self._table._execute_query(query, batch_size=batch_size, timeout=timeout)
|
||||
)
|
||||
|
||||
def iter_sync():
|
||||
try:
|
||||
@@ -3134,7 +3151,9 @@ class AsyncTable:
|
||||
@overload
|
||||
async def search(
|
||||
self,
|
||||
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
|
||||
query: Optional[
|
||||
Union[VEC, str, "PIL.Image.Image", Tuple, FullTextQuery]
|
||||
] = None,
|
||||
vector_column_name: Optional[str] = None,
|
||||
query_type: Literal["vector"] = ...,
|
||||
ordering_field_name: Optional[str] = None,
|
||||
@@ -3143,7 +3162,9 @@ class AsyncTable:
|
||||
|
||||
async def search(
|
||||
self,
|
||||
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
|
||||
query: Optional[
|
||||
Union[VEC, str, "PIL.Image.Image", Tuple, FullTextQuery]
|
||||
] = None,
|
||||
vector_column_name: Optional[str] = None,
|
||||
query_type: QueryType = "auto",
|
||||
ordering_field_name: Optional[str] = None,
|
||||
@@ -3253,6 +3274,8 @@ class AsyncTable:
|
||||
if is_embedding(query):
|
||||
vector_query = query
|
||||
query_type = "vector"
|
||||
elif isinstance(query, FullTextQuery):
|
||||
query_type = "fts"
|
||||
elif isinstance(query, str):
|
||||
try:
|
||||
(
|
||||
@@ -3373,13 +3396,15 @@ class AsyncTable:
|
||||
async_query = async_query.nearest_to_text(
|
||||
query.full_text_query.query, query.full_text_query.columns
|
||||
)
|
||||
if query.full_text_query.limit is not None:
|
||||
async_query = async_query.limit(query.full_text_query.limit)
|
||||
|
||||
return async_query
|
||||
|
||||
async def _execute_query(
|
||||
self, query: Query, batch_size: Optional[int] = None
|
||||
self,
|
||||
query: Query,
|
||||
*,
|
||||
batch_size: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
# The sync table calls into this method, so we need to map the
|
||||
# query to the async version of the query and run that here. This is only
|
||||
@@ -3387,7 +3412,9 @@ class AsyncTable:
|
||||
|
||||
async_query = self._sync_query_to_async(query)
|
||||
|
||||
return await async_query.to_batches(max_batch_length=batch_size)
|
||||
return await async_query.to_batches(
|
||||
max_batch_length=batch_size, timeout=timeout
|
||||
)
|
||||
|
||||
async def _explain_plan(self, query: Query, verbose: Optional[bool]) -> str:
|
||||
# This method is used by the sync table
|
||||
|
||||
@@ -12,6 +12,7 @@ import pyarrow as pa
|
||||
import pytest
|
||||
from lancedb.embeddings import get_registry
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
import requests
|
||||
|
||||
# These are integration tests for embedding functions.
|
||||
# They are slow because they require downloading models
|
||||
@@ -516,3 +517,61 @@ def test_voyageai_embedding_function():
|
||||
|
||||
tbl.add(df)
|
||||
assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||
)
|
||||
def test_voyageai_multimodal_embedding_function():
|
||||
voyageai = (
|
||||
get_registry().get("voyageai").create(name="voyage-multimodal-3", max_retries=0)
|
||||
)
|
||||
|
||||
class Images(LanceModel):
|
||||
label: str
|
||||
image_uri: str = voyageai.SourceField() # image uri as the source
|
||||
image_bytes: bytes = voyageai.SourceField() # image bytes as the source
|
||||
vector: Vector(voyageai.ndims()) = voyageai.VectorField() # vector column
|
||||
vec_from_bytes: Vector(voyageai.ndims()) = (
|
||||
voyageai.VectorField()
|
||||
) # Another vector column
|
||||
|
||||
db = lancedb.connect("~/lancedb")
|
||||
table = db.create_table("test", schema=Images, mode="overwrite")
|
||||
labels = ["cat", "cat", "dog", "dog", "horse", "horse"]
|
||||
uris = [
|
||||
"http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
|
||||
"http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
|
||||
"http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
|
||||
"http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
|
||||
"http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
|
||||
"http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
|
||||
]
|
||||
# get each uri as bytes
|
||||
image_bytes = [requests.get(uri).content for uri in uris]
|
||||
table.add(
|
||||
pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes})
|
||||
)
|
||||
assert len(table.to_pandas()["vector"][0]) == voyageai.ndims()
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||
)
|
||||
def test_voyageai_multimodal_embedding_text_function():
|
||||
voyageai = (
|
||||
get_registry().get("voyageai").create(name="voyage-multimodal-3", max_retries=0)
|
||||
)
|
||||
|
||||
class TextModel(LanceModel):
|
||||
text: str = voyageai.SourceField()
|
||||
vector: Vector(voyageai.ndims()) = voyageai.VectorField()
|
||||
|
||||
df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
|
||||
db = lancedb.connect("~/lancedb")
|
||||
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||
|
||||
tbl.add(df)
|
||||
assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
|
||||
|
||||
@@ -20,7 +20,9 @@ from unittest import mock
|
||||
import lancedb as ldb
|
||||
from lancedb.db import DBConnection
|
||||
from lancedb.index import FTS
|
||||
from lancedb.query import BoostQuery, MatchQuery, MultiMatchQuery, PhraseQuery
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from utils import exception_output
|
||||
@@ -178,11 +180,47 @@ def test_search_fts(table, use_tantivy):
|
||||
results = table.search("puppy").select(["id", "text"]).to_list()
|
||||
assert len(results) == 10
|
||||
|
||||
if not use_tantivy:
|
||||
# Test with a query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test boost query
|
||||
results = (
|
||||
table.search(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
)
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test multi match query
|
||||
table.create_fts_index("text2", use_tantivy=use_tantivy)
|
||||
results = (
|
||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fts_select_async(async_table):
|
||||
tbl = await async_table
|
||||
await tbl.create_index("text", config=FTS())
|
||||
await tbl.create_index("text2", config=FTS())
|
||||
results = (
|
||||
await tbl.query()
|
||||
.nearest_to_text("puppy")
|
||||
@@ -193,6 +231,54 @@ async def test_fts_select_async(async_table):
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test with FullTextQuery
|
||||
results = (
|
||||
await tbl.query()
|
||||
.nearest_to_text(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test with BoostQuery
|
||||
results = (
|
||||
await tbl.query()
|
||||
.nearest_to_text(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
)
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test with MultiMatchQuery
|
||||
results = (
|
||||
await tbl.query()
|
||||
.nearest_to_text(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test with search() API
|
||||
results = (
|
||||
await (await tbl.search(MatchQuery("puppy", "text")))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
|
||||
def test_search_fts_phrase_query(table):
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=False)
|
||||
@@ -207,6 +293,13 @@ def test_search_fts_phrase_query(table):
|
||||
assert len(results) > len(phrase_results)
|
||||
assert len(phrase_results) > 0
|
||||
|
||||
# Test with a query
|
||||
phrase_results = (
|
||||
table.search(PhraseQuery("puppy runs", "text")).limit(100).to_list()
|
||||
)
|
||||
assert len(results) > len(phrase_results)
|
||||
assert len(phrase_results) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_fts_phrase_query_async(async_table):
|
||||
@@ -227,6 +320,16 @@ async def test_search_fts_phrase_query_async(async_table):
|
||||
assert len(results) > len(phrase_results)
|
||||
assert len(phrase_results) > 0
|
||||
|
||||
# Test with a query
|
||||
phrase_results = (
|
||||
await async_table.query()
|
||||
.nearest_to_text(PhraseQuery("puppy runs", "text"))
|
||||
.limit(100)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) > len(phrase_results)
|
||||
assert len(phrase_results) > 0
|
||||
|
||||
|
||||
def test_search_fts_specify_column(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
@@ -524,3 +627,32 @@ def test_language(mem_db: DBConnection):
|
||||
# Stop words -> no results
|
||||
results = table.search("la", query_type="fts").limit(5).to_list()
|
||||
assert len(results) == 0
|
||||
|
||||
|
||||
def test_fts_on_list(mem_db: DBConnection):
|
||||
data = pa.table(
|
||||
{
|
||||
"text": [
|
||||
["lance database", "the", "search"],
|
||||
["lance database"],
|
||||
["lance", "search"],
|
||||
["database", "search"],
|
||||
["unrelated", "doc"],
|
||||
],
|
||||
"vector": [
|
||||
[1.0, 2.0, 3.0],
|
||||
[4.0, 5.0, 6.0],
|
||||
[7.0, 8.0, 9.0],
|
||||
[10.0, 11.0, 12.0],
|
||||
[13.0, 14.0, 15.0],
|
||||
],
|
||||
}
|
||||
)
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
|
||||
res = table.search("lance").limit(5).to_list()
|
||||
assert len(res) == 3
|
||||
|
||||
res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
|
||||
assert len(res) == 2
|
||||
|
||||
@@ -31,6 +31,7 @@ async def some_table(db_async):
|
||||
{
|
||||
"id": list(range(NROWS)),
|
||||
"vector": sample_fixed_size_list_array(NROWS, DIM),
|
||||
"fsb": pa.array([bytes([i]) for i in range(NROWS)], pa.binary(1)),
|
||||
"tags": [
|
||||
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
||||
],
|
||||
@@ -85,6 +86,16 @@ async def test_create_scalar_index(some_table: AsyncTable):
|
||||
assert len(indices) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
||||
await some_table.create_index("fsb", config=BTree())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(BTree, columns=["fsb"], name="fsb_idx")]'
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "BTree"
|
||||
assert indices[0].columns == ["fsb"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
await some_table.create_index("id", config=Bitmap())
|
||||
|
||||
@@ -511,7 +511,8 @@ def test_query_builder_with_different_vector_column():
|
||||
columns=["b"],
|
||||
vector_column="foo_vector",
|
||||
),
|
||||
None,
|
||||
batch_size=None,
|
||||
timeout=None,
|
||||
)
|
||||
|
||||
|
||||
@@ -1076,3 +1077,67 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
|
||||
with_row_id=False,
|
||||
)
|
||||
|
||||
|
||||
def test_query_timeout(tmp_path):
|
||||
# Use local directory instead of memory:// to add a bit of latency to
|
||||
# operations so a timeout of zero will trigger exceptions.
|
||||
db = lancedb.connect(tmp_path)
|
||||
data = pa.table(
|
||||
{
|
||||
"text": ["a", "b"],
|
||||
"vector": pa.FixedSizeListArray.from_arrays(
|
||||
pc.random(4).cast(pa.float32()), 2
|
||||
),
|
||||
}
|
||||
)
|
||||
table = db.create_table("test", data)
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
table.search().where("text = 'a'").to_list(timeout=timedelta(0))
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
table.search([0.0, 0.0]).to_arrow(timeout=timedelta(0))
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
table.search("a", query_type="fts").to_pandas(timeout=timedelta(0))
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
table.search(query_type="hybrid").vector([0.0, 0.0]).text("a").to_arrow(
|
||||
timeout=timedelta(0)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_query_timeout_async(tmp_path):
|
||||
db = await lancedb.connect_async(tmp_path)
|
||||
data = pa.table(
|
||||
{
|
||||
"text": ["a", "b"],
|
||||
"vector": pa.FixedSizeListArray.from_arrays(
|
||||
pc.random(4).cast(pa.float32()), 2
|
||||
),
|
||||
}
|
||||
)
|
||||
table = await db.create_table("test", data)
|
||||
await table.create_index("text", config=FTS())
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
await table.query().where("text != 'a'").to_list(timeout=timedelta(0))
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
await table.vector_search([0.0, 0.0]).to_arrow(timeout=timedelta(0))
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
await (await table.search("a", query_type="fts")).to_pandas(
|
||||
timeout=timedelta(0)
|
||||
)
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
await (
|
||||
table.query()
|
||||
.nearest_to_text("a")
|
||||
.nearest_to([0.0, 0.0])
|
||||
.to_list(timeout=timedelta(0))
|
||||
)
|
||||
|
||||
@@ -444,6 +444,16 @@ def test_query_sync_fts():
|
||||
"prefilter": True,
|
||||
"with_row_id": True,
|
||||
"version": None,
|
||||
} or body == {
|
||||
"full_text_query": {
|
||||
"query": "puppy",
|
||||
"columns": ["description", "name"],
|
||||
},
|
||||
"k": 42,
|
||||
"vector": [],
|
||||
"prefilter": True,
|
||||
"with_row_id": True,
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3]})
|
||||
|
||||
@@ -2,25 +2,26 @@
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use arrow::array::make_array;
|
||||
use arrow::array::Array;
|
||||
use arrow::array::ArrayData;
|
||||
use arrow::pyarrow::FromPyArrow;
|
||||
use arrow::pyarrow::IntoPyArrow;
|
||||
use lancedb::index::scalar::FullTextSearchQuery;
|
||||
use lancedb::index::scalar::{FtsQuery, FullTextSearchQuery, MatchQuery, PhraseQuery};
|
||||
use lancedb::query::QueryExecutionOptions;
|
||||
use lancedb::query::QueryFilter;
|
||||
use lancedb::query::{
|
||||
ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
|
||||
};
|
||||
use lancedb::table::AnyQuery;
|
||||
use pyo3::exceptions::PyNotImplementedError;
|
||||
use pyo3::exceptions::PyRuntimeError;
|
||||
use pyo3::exceptions::{PyNotImplementedError, PyValueError};
|
||||
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
|
||||
use pyo3::pymethods;
|
||||
use pyo3::types::PyDict;
|
||||
use pyo3::types::PyList;
|
||||
use pyo3::types::{PyDict, PyString};
|
||||
use pyo3::Bound;
|
||||
use pyo3::IntoPyObject;
|
||||
use pyo3::PyAny;
|
||||
@@ -31,7 +32,7 @@ use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::arrow::RecordBatchStream;
|
||||
use crate::error::PythonErrorExt;
|
||||
use crate::util::parse_distance_type;
|
||||
use crate::util::{parse_distance_type, parse_fts_query};
|
||||
|
||||
// Python representation of full text search parameters
|
||||
#[derive(Clone)]
|
||||
@@ -46,8 +47,8 @@ pub struct PyFullTextSearchQuery {
|
||||
impl From<FullTextSearchQuery> for PyFullTextSearchQuery {
|
||||
fn from(query: FullTextSearchQuery) -> Self {
|
||||
PyFullTextSearchQuery {
|
||||
columns: query.columns,
|
||||
query: query.query,
|
||||
columns: query.columns().into_iter().collect(),
|
||||
query: query.query.query().to_owned(),
|
||||
limit: query.limit,
|
||||
wand_factor: query.wand_factor,
|
||||
}
|
||||
@@ -236,29 +237,69 @@ impl Query {
|
||||
}
|
||||
|
||||
pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<FTSQuery> {
|
||||
let query_text = query
|
||||
let fts_query = query
|
||||
.get_item("query")?
|
||||
.ok_or(PyErr::new::<PyRuntimeError, _>(
|
||||
"Query text is required for nearest_to_text",
|
||||
))?
|
||||
.extract::<String>()?;
|
||||
let columns = query
|
||||
.get_item("columns")?
|
||||
.map(|columns| columns.extract::<Vec<String>>())
|
||||
.transpose()?;
|
||||
))?;
|
||||
|
||||
let fts_query = FullTextSearchQuery::new(query_text).columns(columns);
|
||||
let query = if let Ok(query_text) = fts_query.downcast::<PyString>() {
|
||||
let mut query_text = query_text.to_string();
|
||||
let columns = query
|
||||
.get_item("columns")?
|
||||
.map(|columns| columns.extract::<Vec<String>>())
|
||||
.transpose()?;
|
||||
|
||||
let is_phrase =
|
||||
query_text.len() >= 2 && query_text.starts_with('"') && query_text.ends_with('"');
|
||||
let is_multi_match = columns.as_ref().map(|cols| cols.len() > 1).unwrap_or(false);
|
||||
|
||||
if is_phrase {
|
||||
// Remove the surrounding quotes for phrase queries
|
||||
query_text = query_text[1..query_text.len() - 1].to_string();
|
||||
}
|
||||
|
||||
let query: FtsQuery = match (is_phrase, is_multi_match) {
|
||||
(false, _) => MatchQuery::new(query_text).into(),
|
||||
(true, false) => PhraseQuery::new(query_text).into(),
|
||||
(true, true) => {
|
||||
return Err(PyValueError::new_err(
|
||||
"Phrase queries cannot be used with multiple columns.",
|
||||
));
|
||||
}
|
||||
};
|
||||
let mut query = FullTextSearchQuery::new_query(query);
|
||||
if let Some(cols) = columns {
|
||||
if !cols.is_empty() {
|
||||
query = query.with_columns(&cols).map_err(|e| {
|
||||
PyValueError::new_err(format!(
|
||||
"Failed to set full text search columns: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
query
|
||||
} else if let Ok(query) = fts_query.downcast::<PyDict>() {
|
||||
let query = parse_fts_query(query)?;
|
||||
FullTextSearchQuery::new_query(query)
|
||||
} else {
|
||||
return Err(PyValueError::new_err(
|
||||
"query must be a string or a Query object",
|
||||
));
|
||||
};
|
||||
|
||||
Ok(FTSQuery {
|
||||
fts_query,
|
||||
inner: self.inner.clone(),
|
||||
fts_query: query,
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None))]
|
||||
#[pyo3(signature = (max_batch_length=None, timeout=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
max_batch_length: Option<u32>,
|
||||
timeout: Option<Duration>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
@@ -266,6 +307,9 @@ impl Query {
|
||||
if let Some(max_batch_length) = max_batch_length {
|
||||
opts.max_batch_length = max_batch_length;
|
||||
}
|
||||
if let Some(timeout) = timeout {
|
||||
opts.timeout = Some(timeout);
|
||||
}
|
||||
let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
|
||||
Ok(RecordBatchStream::new(inner_stream))
|
||||
})
|
||||
@@ -337,10 +381,11 @@ impl FTSQuery {
|
||||
self.inner = self.inner.clone().postfilter();
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None))]
|
||||
#[pyo3(signature = (max_batch_length=None, timeout=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
max_batch_length: Option<u32>,
|
||||
timeout: Option<Duration>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_
|
||||
.inner
|
||||
@@ -352,6 +397,9 @@ impl FTSQuery {
|
||||
if let Some(max_batch_length) = max_batch_length {
|
||||
opts.max_batch_length = max_batch_length;
|
||||
}
|
||||
if let Some(timeout) = timeout {
|
||||
opts.timeout = Some(timeout);
|
||||
}
|
||||
let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
|
||||
Ok(RecordBatchStream::new(inner_stream))
|
||||
})
|
||||
@@ -386,7 +434,7 @@ impl FTSQuery {
|
||||
}
|
||||
|
||||
pub fn get_query(&self) -> String {
|
||||
self.fts_query.query.clone()
|
||||
self.fts_query.query.query().to_owned()
|
||||
}
|
||||
|
||||
pub fn to_query_request(&self) -> PyQueryRequest {
|
||||
@@ -474,10 +522,11 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().bypass_vector_index()
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None))]
|
||||
#[pyo3(signature = (max_batch_length=None, timeout=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
max_batch_length: Option<u32>,
|
||||
timeout: Option<Duration>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
@@ -485,6 +534,9 @@ impl VectorQuery {
|
||||
if let Some(max_batch_length) = max_batch_length {
|
||||
opts.max_batch_length = max_batch_length;
|
||||
}
|
||||
if let Some(timeout) = timeout {
|
||||
opts.timeout = Some(timeout);
|
||||
}
|
||||
let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
|
||||
Ok(RecordBatchStream::new(inner_stream))
|
||||
})
|
||||
|
||||
@@ -3,11 +3,15 @@
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
use lancedb::index::scalar::{BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, PhraseQuery};
|
||||
use lancedb::DistanceType;
|
||||
use pyo3::prelude::{PyAnyMethods, PyDictMethods, PyListMethods};
|
||||
use pyo3::types::PyDict;
|
||||
use pyo3::{
|
||||
exceptions::{PyRuntimeError, PyValueError},
|
||||
pyfunction, PyResult,
|
||||
};
|
||||
use pyo3::{Bound, PyAny};
|
||||
|
||||
/// A wrapper around a rust builder
|
||||
///
|
||||
@@ -59,3 +63,116 @@ pub fn validate_table_name(table_name: &str) -> PyResult<()> {
|
||||
lancedb::utils::validate_table_name(table_name)
|
||||
.map_err(|e| PyValueError::new_err(e.to_string()))
|
||||
}
|
||||
|
||||
pub fn parse_fts_query(query: &Bound<'_, PyDict>) -> PyResult<FtsQuery> {
|
||||
let query_type = query.keys().get_item(0)?.extract::<String>()?;
|
||||
let query_value = query
|
||||
.get_item(&query_type)?
|
||||
.ok_or(PyValueError::new_err(format!(
|
||||
"Query type {} not found",
|
||||
query_type
|
||||
)))?;
|
||||
let query_value = query_value.downcast::<PyDict>()?;
|
||||
|
||||
match query_type.as_str() {
|
||||
"match" => {
|
||||
let column = query_value.keys().get_item(0)?.extract::<String>()?;
|
||||
let params = query_value
|
||||
.get_item(&column)?
|
||||
.ok_or(PyValueError::new_err(format!(
|
||||
"column {} not found",
|
||||
column
|
||||
)))?;
|
||||
let params = params.downcast::<PyDict>()?;
|
||||
|
||||
let query = params
|
||||
.get_item("query")?
|
||||
.ok_or(PyValueError::new_err("query not found"))?
|
||||
.extract::<String>()?;
|
||||
let boost = params
|
||||
.get_item("boost")?
|
||||
.ok_or(PyValueError::new_err("boost not found"))?
|
||||
.extract::<f32>()?;
|
||||
let fuzziness = params
|
||||
.get_item("fuzziness")?
|
||||
.ok_or(PyValueError::new_err("fuzziness not found"))?
|
||||
.extract::<Option<u32>>()?;
|
||||
let max_expansions = params
|
||||
.get_item("max_expansions")?
|
||||
.ok_or(PyValueError::new_err("max_expansions not found"))?
|
||||
.extract::<usize>()?;
|
||||
|
||||
let query = MatchQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_boost(boost)
|
||||
.with_fuzziness(fuzziness)
|
||||
.with_max_expansions(max_expansions);
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"match_phrase" => {
|
||||
let column = query_value.keys().get_item(0)?.extract::<String>()?;
|
||||
let query = query_value
|
||||
.get_item(&column)?
|
||||
.ok_or(PyValueError::new_err(format!(
|
||||
"column {} not found",
|
||||
column
|
||||
)))?
|
||||
.extract::<String>()?;
|
||||
|
||||
let query = PhraseQuery::new(query).with_column(Some(column));
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"boost" => {
|
||||
let positive: Bound<'_, PyAny> = query_value
|
||||
.get_item("positive")?
|
||||
.ok_or(PyValueError::new_err("positive not found"))?;
|
||||
let positive = positive.downcast::<PyDict>()?;
|
||||
|
||||
let negative = query_value
|
||||
.get_item("negative")?
|
||||
.ok_or(PyValueError::new_err("negative not found"))?;
|
||||
let negative = negative.downcast::<PyDict>()?;
|
||||
|
||||
let negative_boost = query_value
|
||||
.get_item("negative_boost")?
|
||||
.ok_or(PyValueError::new_err("negative_boost not found"))?
|
||||
.extract::<f32>()?;
|
||||
|
||||
let positive_query = parse_fts_query(positive)?;
|
||||
let negative_query = parse_fts_query(negative)?;
|
||||
let query = BoostQuery::new(positive_query, negative_query, Some(negative_boost));
|
||||
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
"multi_match" => {
|
||||
let query = query_value
|
||||
.get_item("query")?
|
||||
.ok_or(PyValueError::new_err("query not found"))?
|
||||
.extract::<String>()?;
|
||||
|
||||
let columns = query_value
|
||||
.get_item("columns")?
|
||||
.ok_or(PyValueError::new_err("columns not found"))?
|
||||
.extract::<Vec<String>>()?;
|
||||
|
||||
let boost = query_value
|
||||
.get_item("boost")?
|
||||
.ok_or(PyValueError::new_err("boost not found"))?
|
||||
.extract::<Vec<f32>>()?;
|
||||
|
||||
let query =
|
||||
MultiMatchQuery::try_new_with_boosts(query, columns, boost).map_err(|e| {
|
||||
PyValueError::new_err(format!("Error creating MultiMatchQuery: {}", e))
|
||||
})?;
|
||||
Ok(query.into())
|
||||
}
|
||||
|
||||
_ => Err(PyValueError::new_err(format!(
|
||||
"Unsupported query type: {}",
|
||||
query_type
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-node"
|
||||
version = "0.18.3-beta.0"
|
||||
version = "0.19.0-beta.5"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.18.3-beta.0"
|
||||
version = "0.19.0-beta.5"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
@@ -142,12 +142,6 @@ impl CreateTableBuilder<true> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the given write options when writing the initial data
|
||||
pub fn write_options(mut self, write_options: WriteOptions) -> Self {
|
||||
self.request.write_options = write_options;
|
||||
self
|
||||
}
|
||||
|
||||
/// Execute the create table operation
|
||||
pub async fn execute(self) -> Result<Table> {
|
||||
let embedding_registry = self.embedding_registry.clone();
|
||||
@@ -229,6 +223,12 @@ impl<const HAS_DATA: bool> CreateTableBuilder<HAS_DATA> {
|
||||
self
|
||||
}
|
||||
|
||||
/// Apply the given write options when writing the initial data
|
||||
pub fn write_options(mut self, write_options: WriteOptions) -> Self {
|
||||
self.request.write_options = write_options;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set an option for the storage layer.
|
||||
///
|
||||
/// Options already set on the connection will be inherited by the table,
|
||||
|
||||
@@ -80,5 +80,6 @@ impl FtsIndexBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub use lance_index::scalar::inverted::query::*;
|
||||
pub use lance_index::scalar::inverted::TokenizerConfig;
|
||||
pub use lance_index::scalar::FullTextSearchQuery;
|
||||
|
||||
@@ -14,6 +14,9 @@ use object_store::{
|
||||
|
||||
use async_trait::async_trait;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod io_tracking;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct MirroringObjectStore {
|
||||
primary: Arc<dyn ObjectStore>,
|
||||
|
||||
237
rust/lancedb/src/io/object_store/io_tracking.rs
Normal file
237
rust/lancedb/src/io/object_store/io_tracking.rs
Normal file
@@ -0,0 +1,237 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::{
|
||||
fmt::{Display, Formatter},
|
||||
sync::{Arc, Mutex},
|
||||
};
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::stream::BoxStream;
|
||||
use lance::io::WrappingObjectStore;
|
||||
use object_store::{
|
||||
path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
|
||||
PutMultipartOpts, PutOptions, PutPayload, PutResult, Result as OSResult, UploadPart,
|
||||
};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct IoStats {
|
||||
pub read_iops: u64,
|
||||
pub read_bytes: u64,
|
||||
pub write_iops: u64,
|
||||
pub write_bytes: u64,
|
||||
}
|
||||
|
||||
impl Display for IoStats {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:#?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IoTrackingStore {
|
||||
target: Arc<dyn ObjectStore>,
|
||||
stats: Arc<Mutex<IoStats>>,
|
||||
}
|
||||
|
||||
impl Display for IoTrackingStore {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:#?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct IoStatsHolder(Arc<Mutex<IoStats>>);
|
||||
|
||||
impl IoStatsHolder {
|
||||
pub fn incremental_stats(&self) -> IoStats {
|
||||
std::mem::take(&mut self.0.lock().expect("failed to lock IoStats"))
|
||||
}
|
||||
}
|
||||
|
||||
impl WrappingObjectStore for IoStatsHolder {
|
||||
fn wrap(&self, target: Arc<dyn ObjectStore>) -> Arc<dyn ObjectStore> {
|
||||
Arc::new(IoTrackingStore {
|
||||
target,
|
||||
stats: self.0.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl IoTrackingStore {
|
||||
pub fn new_wrapper() -> (Arc<dyn WrappingObjectStore>, Arc<Mutex<IoStats>>) {
|
||||
let stats = Arc::new(Mutex::new(IoStats::default()));
|
||||
(Arc::new(IoStatsHolder(stats.clone())), stats)
|
||||
}
|
||||
|
||||
fn record_read(&self, num_bytes: u64) {
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.read_iops += 1;
|
||||
stats.read_bytes += num_bytes;
|
||||
}
|
||||
|
||||
fn record_write(&self, num_bytes: u64) {
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.write_iops += 1;
|
||||
stats.write_bytes += num_bytes;
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
#[deny(clippy::missing_trait_methods)]
|
||||
impl ObjectStore for IoTrackingStore {
|
||||
async fn put(&self, location: &Path, bytes: PutPayload) -> OSResult<PutResult> {
|
||||
self.record_write(bytes.content_length() as u64);
|
||||
self.target.put(location, bytes).await
|
||||
}
|
||||
|
||||
async fn put_opts(
|
||||
&self,
|
||||
location: &Path,
|
||||
bytes: PutPayload,
|
||||
opts: PutOptions,
|
||||
) -> OSResult<PutResult> {
|
||||
self.record_write(bytes.content_length() as u64);
|
||||
self.target.put_opts(location, bytes, opts).await
|
||||
}
|
||||
|
||||
async fn put_multipart(&self, location: &Path) -> OSResult<Box<dyn MultipartUpload>> {
|
||||
let target = self.target.put_multipart(location).await?;
|
||||
Ok(Box::new(IoTrackingMultipartUpload {
|
||||
target,
|
||||
stats: self.stats.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
async fn put_multipart_opts(
|
||||
&self,
|
||||
location: &Path,
|
||||
opts: PutMultipartOpts,
|
||||
) -> OSResult<Box<dyn MultipartUpload>> {
|
||||
let target = self.target.put_multipart_opts(location, opts).await?;
|
||||
Ok(Box::new(IoTrackingMultipartUpload {
|
||||
target,
|
||||
stats: self.stats.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
async fn get(&self, location: &Path) -> OSResult<GetResult> {
|
||||
let result = self.target.get(location).await;
|
||||
if let Ok(result) = &result {
|
||||
let num_bytes = result.range.end - result.range.start;
|
||||
self.record_read(num_bytes as u64);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn get_opts(&self, location: &Path, options: GetOptions) -> OSResult<GetResult> {
|
||||
let result = self.target.get_opts(location, options).await;
|
||||
if let Ok(result) = &result {
|
||||
let num_bytes = result.range.end - result.range.start;
|
||||
self.record_read(num_bytes as u64);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn get_range(&self, location: &Path, range: std::ops::Range<usize>) -> OSResult<Bytes> {
|
||||
let result = self.target.get_range(location, range).await;
|
||||
if let Ok(result) = &result {
|
||||
self.record_read(result.len() as u64);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn get_ranges(
|
||||
&self,
|
||||
location: &Path,
|
||||
ranges: &[std::ops::Range<usize>],
|
||||
) -> OSResult<Vec<Bytes>> {
|
||||
let result = self.target.get_ranges(location, ranges).await;
|
||||
if let Ok(result) = &result {
|
||||
self.record_read(result.iter().map(|b| b.len() as u64).sum());
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn head(&self, location: &Path) -> OSResult<ObjectMeta> {
|
||||
self.record_read(0);
|
||||
self.target.head(location).await
|
||||
}
|
||||
|
||||
async fn delete(&self, location: &Path) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.delete(location).await
|
||||
}
|
||||
|
||||
fn delete_stream<'a>(
|
||||
&'a self,
|
||||
locations: BoxStream<'a, OSResult<Path>>,
|
||||
) -> BoxStream<'a, OSResult<Path>> {
|
||||
self.target.delete_stream(locations)
|
||||
}
|
||||
|
||||
fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, OSResult<ObjectMeta>> {
|
||||
self.record_read(0);
|
||||
self.target.list(prefix)
|
||||
}
|
||||
|
||||
fn list_with_offset(
|
||||
&self,
|
||||
prefix: Option<&Path>,
|
||||
offset: &Path,
|
||||
) -> BoxStream<'_, OSResult<ObjectMeta>> {
|
||||
self.record_read(0);
|
||||
self.target.list_with_offset(prefix, offset)
|
||||
}
|
||||
|
||||
async fn list_with_delimiter(&self, prefix: Option<&Path>) -> OSResult<ListResult> {
|
||||
self.record_read(0);
|
||||
self.target.list_with_delimiter(prefix).await
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.copy(from, to).await
|
||||
}
|
||||
|
||||
async fn rename(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.rename(from, to).await
|
||||
}
|
||||
|
||||
async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.rename_if_not_exists(from, to).await
|
||||
}
|
||||
|
||||
async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.copy_if_not_exists(from, to).await
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct IoTrackingMultipartUpload {
|
||||
target: Box<dyn MultipartUpload>,
|
||||
stats: Arc<Mutex<IoStats>>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl MultipartUpload for IoTrackingMultipartUpload {
|
||||
async fn abort(&mut self) -> OSResult<()> {
|
||||
self.target.abort().await
|
||||
}
|
||||
|
||||
async fn complete(&mut self) -> OSResult<PutResult> {
|
||||
self.target.complete().await
|
||||
}
|
||||
|
||||
fn put_part(&mut self, payload: PutPayload) -> UploadPart {
|
||||
{
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.write_iops += 1;
|
||||
stats.write_bytes += payload.content_length() as u64;
|
||||
}
|
||||
self.target.put_part(payload)
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
use std::{future::Future, time::Duration};
|
||||
|
||||
use arrow::compute::concat_batches;
|
||||
use arrow_array::{make_array, Array, Float16Array, Float32Array, Float64Array};
|
||||
@@ -25,6 +25,7 @@ use crate::error::{Error, Result};
|
||||
use crate::rerankers::rrf::RRFReranker;
|
||||
use crate::rerankers::{check_reranker_result, NormalizeMethod, Reranker};
|
||||
use crate::table::BaseTable;
|
||||
use crate::utils::TimeoutStream;
|
||||
use crate::DistanceType;
|
||||
use crate::{arrow::SendableRecordBatchStream, table::AnyQuery};
|
||||
|
||||
@@ -525,12 +526,15 @@ pub struct QueryExecutionOptions {
|
||||
///
|
||||
/// By default, this is 1024
|
||||
pub max_batch_length: u32,
|
||||
/// Max duration to wait for the query to execute before timing out.
|
||||
pub timeout: Option<Duration>,
|
||||
}
|
||||
|
||||
impl Default for QueryExecutionOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_batch_length: 1024,
|
||||
timeout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1007,7 +1011,10 @@ impl VectorQuery {
|
||||
self
|
||||
}
|
||||
|
||||
pub async fn execute_hybrid(&self) -> Result<SendableRecordBatchStream> {
|
||||
pub async fn execute_hybrid(
|
||||
&self,
|
||||
options: QueryExecutionOptions,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
// clone query and specify we want to include row IDs, which can be needed for reranking
|
||||
let mut fts_query = Query::new(self.parent.clone());
|
||||
fts_query.request = self.request.base.clone();
|
||||
@@ -1016,7 +1023,10 @@ impl VectorQuery {
|
||||
let mut vector_query = self.clone().with_row_id();
|
||||
|
||||
vector_query.request.base.full_text_search = None;
|
||||
let (fts_results, vec_results) = try_join!(fts_query.execute(), vector_query.execute())?;
|
||||
let (fts_results, vec_results) = try_join!(
|
||||
fts_query.execute_with_options(options.clone()),
|
||||
vector_query.inner_execute_with_options(options)
|
||||
)?;
|
||||
|
||||
let (fts_results, vec_results) = try_join!(
|
||||
fts_results.try_collect::<Vec<_>>(),
|
||||
@@ -1056,7 +1066,7 @@ impl VectorQuery {
|
||||
})?;
|
||||
|
||||
let mut results = reranker
|
||||
.rerank_hybrid(&fts_query.query, vec_results, fts_results)
|
||||
.rerank_hybrid(&fts_query.query.query(), vec_results, fts_results)
|
||||
.await?;
|
||||
|
||||
check_reranker_result(&results)?;
|
||||
@@ -1074,6 +1084,20 @@ impl VectorQuery {
|
||||
RecordBatchStreamAdapter::new(results.schema(), stream::iter([Ok(results)])),
|
||||
))
|
||||
}
|
||||
|
||||
async fn inner_execute_with_options(
|
||||
&self,
|
||||
options: QueryExecutionOptions,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
let plan = self.create_plan(options.clone()).await?;
|
||||
let inner = execute_plan(plan, Default::default())?;
|
||||
let inner = if let Some(timeout) = options.timeout {
|
||||
TimeoutStream::new_boxed(inner, timeout)
|
||||
} else {
|
||||
inner
|
||||
};
|
||||
Ok(DatasetRecordBatchStream::new(inner).into())
|
||||
}
|
||||
}
|
||||
|
||||
impl ExecutableQuery for VectorQuery {
|
||||
@@ -1087,16 +1111,13 @@ impl ExecutableQuery for VectorQuery {
|
||||
options: QueryExecutionOptions,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
if self.request.base.full_text_search.is_some() {
|
||||
let hybrid_result = async move { self.execute_hybrid().await }.boxed().await?;
|
||||
let hybrid_result = async move { self.execute_hybrid(options).await }
|
||||
.boxed()
|
||||
.await?;
|
||||
return Ok(hybrid_result);
|
||||
}
|
||||
|
||||
Ok(SendableRecordBatchStream::from(
|
||||
DatasetRecordBatchStream::new(execute_plan(
|
||||
self.create_plan(options).await?,
|
||||
Default::default(),
|
||||
)?),
|
||||
))
|
||||
self.inner_execute_with_options(options).await
|
||||
}
|
||||
|
||||
async fn explain_plan(&self, verbose: bool) -> Result<String> {
|
||||
|
||||
@@ -13,7 +13,7 @@ use reqwest::{
|
||||
use crate::error::{Error, Result};
|
||||
use crate::remote::db::RemoteOptions;
|
||||
|
||||
const REQUEST_ID_HEADER: &str = "x-request-id";
|
||||
const REQUEST_ID_HEADER: HeaderName = HeaderName::from_static("x-request-id");
|
||||
|
||||
/// Configuration for the LanceDB Cloud HTTP client.
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -299,7 +299,7 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
) -> Result<HeaderMap> {
|
||||
let mut headers = HeaderMap::new();
|
||||
headers.insert(
|
||||
"x-api-key",
|
||||
HeaderName::from_static("x-api-key"),
|
||||
HeaderValue::from_str(api_key).map_err(|_| Error::InvalidInput {
|
||||
message: "non-ascii api key provided".to_string(),
|
||||
})?,
|
||||
@@ -307,7 +307,7 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
if region == "local" {
|
||||
let host = format!("{}.local.api.lancedb.com", db_name);
|
||||
headers.insert(
|
||||
"Host",
|
||||
http::header::HOST,
|
||||
HeaderValue::from_str(&host).map_err(|_| Error::InvalidInput {
|
||||
message: format!("non-ascii database name '{}' provided", db_name),
|
||||
})?,
|
||||
@@ -315,7 +315,7 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
}
|
||||
if has_host_override {
|
||||
headers.insert(
|
||||
"x-lancedb-database",
|
||||
HeaderName::from_static("x-lancedb-database"),
|
||||
HeaderValue::from_str(db_name).map_err(|_| Error::InvalidInput {
|
||||
message: format!("non-ascii database name '{}' provided", db_name),
|
||||
})?,
|
||||
@@ -323,7 +323,7 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
}
|
||||
if db_prefix.is_some() {
|
||||
headers.insert(
|
||||
"x-lancedb-database-prefix",
|
||||
HeaderName::from_static("x-lancedb-database-prefix"),
|
||||
HeaderValue::from_str(db_prefix.unwrap()).map_err(|_| Error::InvalidInput {
|
||||
message: format!(
|
||||
"non-ascii database prefix '{}' provided",
|
||||
@@ -335,7 +335,7 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
|
||||
if let Some(v) = options.0.get("account_name") {
|
||||
headers.insert(
|
||||
"x-azure-storage-account-name",
|
||||
HeaderName::from_static("x-azure-storage-account-name"),
|
||||
HeaderValue::from_str(v).map_err(|_| Error::InvalidInput {
|
||||
message: format!("non-ascii storage account name '{}' provided", db_name),
|
||||
})?,
|
||||
@@ -343,7 +343,7 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
}
|
||||
if let Some(v) = options.0.get("azure_storage_account_name") {
|
||||
headers.insert(
|
||||
"x-azure-storage-account-name",
|
||||
HeaderName::from_static("x-azure-storage-account-name"),
|
||||
HeaderValue::from_str(v).map_err(|_| Error::InvalidInput {
|
||||
message: format!("non-ascii storage account name '{}' provided", db_name),
|
||||
})?,
|
||||
|
||||
@@ -52,6 +52,10 @@ impl ServerVersion {
|
||||
pub fn support_multivector(&self) -> bool {
|
||||
self.0 >= semver::Version::new(0, 2, 0)
|
||||
}
|
||||
|
||||
pub fn support_structural_fts(&self) -> bool {
|
||||
self.0 >= semver::Version::new(0, 3, 0)
|
||||
}
|
||||
}
|
||||
|
||||
pub const OPT_REMOTE_PREFIX: &str = "remote_database_";
|
||||
|
||||
@@ -20,7 +20,7 @@ use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
|
||||
use datafusion_physical_plan::{ExecutionPlan, RecordBatchStream, SendableRecordBatchStream};
|
||||
use futures::TryStreamExt;
|
||||
use http::header::CONTENT_TYPE;
|
||||
use http::StatusCode;
|
||||
use http::{HeaderName, StatusCode};
|
||||
use lance::arrow::json::{JsonDataType, JsonSchema};
|
||||
use lance::dataset::scanner::DatasetRecordBatchStream;
|
||||
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
|
||||
@@ -44,6 +44,8 @@ use super::client::{HttpSend, RestfulLanceDbClient, Sender};
|
||||
use super::db::ServerVersion;
|
||||
use super::ARROW_STREAM_CONTENT_TYPE;
|
||||
|
||||
const REQUEST_TIMEOUT_HEADER: HeaderName = HeaderName::from_static("x-request-timeout-ms");
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RemoteTable<S: HttpSend = Sender> {
|
||||
#[allow(dead_code)]
|
||||
@@ -155,7 +157,11 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
|
||||
}
|
||||
|
||||
fn apply_query_params(body: &mut serde_json::Value, params: &QueryRequest) -> Result<()> {
|
||||
fn apply_query_params(
|
||||
&self,
|
||||
body: &mut serde_json::Value,
|
||||
params: &QueryRequest,
|
||||
) -> Result<()> {
|
||||
body["prefilter"] = params.prefilter.into();
|
||||
if let Some(offset) = params.offset {
|
||||
body["offset"] = serde_json::Value::Number(serde_json::Number::from(offset));
|
||||
@@ -209,10 +215,17 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
message: "Wand factor is not yet supported in LanceDB Cloud".into(),
|
||||
});
|
||||
}
|
||||
body["full_text_query"] = serde_json::json!({
|
||||
"columns": full_text_search.columns,
|
||||
"query": full_text_search.query,
|
||||
})
|
||||
|
||||
if self.server_version.support_structural_fts() {
|
||||
body["full_text_query"] = serde_json::json!({
|
||||
"query": full_text_search.query.clone(),
|
||||
});
|
||||
} else {
|
||||
body["full_text_query"] = serde_json::json!({
|
||||
"columns": full_text_search.columns().into_iter().collect::<Vec<_>>(),
|
||||
"query": full_text_search.query.query(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -223,7 +236,7 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
mut body: serde_json::Value,
|
||||
query: &VectorQueryRequest,
|
||||
) -> Result<Vec<serde_json::Value>> {
|
||||
Self::apply_query_params(&mut body, &query.base)?;
|
||||
self.apply_query_params(&mut body, &query.base)?;
|
||||
|
||||
// Apply general parameters, before we dispatch based on number of query vectors.
|
||||
body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
|
||||
@@ -321,9 +334,19 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
async fn execute_query(
|
||||
&self,
|
||||
query: &AnyQuery,
|
||||
_options: QueryExecutionOptions,
|
||||
options: &QueryExecutionOptions,
|
||||
) -> Result<Vec<Pin<Box<dyn RecordBatchStream + Send>>>> {
|
||||
let request = self.client.post(&format!("/v1/table/{}/query/", self.name));
|
||||
let mut request = self.client.post(&format!("/v1/table/{}/query/", self.name));
|
||||
|
||||
if let Some(timeout) = options.timeout {
|
||||
// Client side timeout
|
||||
request = request.timeout(timeout);
|
||||
// Also send to server, so it can abort the query if it takes too long.
|
||||
// (If it doesn't fit into u64, it's not worth sending anyways.)
|
||||
if let Ok(timeout_ms) = u64::try_from(timeout.as_millis()) {
|
||||
request = request.header(REQUEST_TIMEOUT_HEADER, timeout_ms);
|
||||
}
|
||||
}
|
||||
|
||||
let query_bodies = self.prepare_query_bodies(query).await?;
|
||||
let requests: Vec<reqwest::RequestBuilder> = query_bodies
|
||||
@@ -346,7 +369,7 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
match query {
|
||||
AnyQuery::Query(query) => {
|
||||
let mut body = base_body.clone();
|
||||
Self::apply_query_params(&mut body, query)?;
|
||||
self.apply_query_params(&mut body, query)?;
|
||||
// Empty vector can be passed if no vector search is performed.
|
||||
body["vector"] = serde_json::Value::Array(Vec::new());
|
||||
Ok(vec![body])
|
||||
@@ -532,7 +555,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
query: &AnyQuery,
|
||||
options: QueryExecutionOptions,
|
||||
) -> Result<Arc<dyn ExecutionPlan>> {
|
||||
let streams = self.execute_query(query, options).await?;
|
||||
let streams = self.execute_query(query, &options).await?;
|
||||
if streams.len() == 1 {
|
||||
let stream = streams.into_iter().next().unwrap();
|
||||
Ok(Arc::new(OneShotExec::new(stream)))
|
||||
@@ -548,9 +571,9 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
async fn query(
|
||||
&self,
|
||||
query: &AnyQuery,
|
||||
_options: QueryExecutionOptions,
|
||||
options: QueryExecutionOptions,
|
||||
) -> Result<DatasetRecordBatchStream> {
|
||||
let streams = self.execute_query(query, _options).await?;
|
||||
let streams = self.execute_query(query, &options).await?;
|
||||
|
||||
if streams.len() == 1 {
|
||||
Ok(DatasetRecordBatchStream::new(
|
||||
@@ -1037,6 +1060,7 @@ mod tests {
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
||||
use lance_index::scalar::inverted::query::MatchQuery;
|
||||
use lance_index::scalar::FullTextSearchQuery;
|
||||
use reqwest::Body;
|
||||
use rstest::rstest;
|
||||
@@ -1683,7 +1707,18 @@ mod tests {
|
||||
"prefilter": true,
|
||||
"version": null
|
||||
});
|
||||
assert_eq!(body, expected_body);
|
||||
let expected_body_2 = serde_json::json!({
|
||||
"full_text_query": {
|
||||
"columns": ["b","a"],
|
||||
"query": "hello world",
|
||||
},
|
||||
"k": 10,
|
||||
"vector": [],
|
||||
"with_row_id": true,
|
||||
"prefilter": true,
|
||||
"version": null
|
||||
});
|
||||
assert!(body == expected_body || body == expected_body_2);
|
||||
|
||||
let data = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
|
||||
@@ -1702,7 +1737,8 @@ mod tests {
|
||||
.query()
|
||||
.full_text_search(
|
||||
FullTextSearchQuery::new("hello world".into())
|
||||
.columns(Some(vec!["a".into(), "b".into()])),
|
||||
.with_columns(&["a".into(), "b".into()])
|
||||
.unwrap(),
|
||||
)
|
||||
.with_row_id()
|
||||
.limit(10)
|
||||
@@ -1711,6 +1747,66 @@ mod tests {
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_structured_fts() {
|
||||
let table =
|
||||
Table::new_with_handler_version("my_table", semver::Version::new(0, 3, 0), |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(request.url().path(), "/v1/table/my_table/query/");
|
||||
assert_eq!(
|
||||
request.headers().get("Content-Type").unwrap(),
|
||||
JSON_CONTENT_TYPE
|
||||
);
|
||||
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
let expected_body = serde_json::json!({
|
||||
"full_text_query": {
|
||||
"query": {
|
||||
"match": {
|
||||
"terms": "hello world",
|
||||
"column": "a",
|
||||
"boost": 1.0,
|
||||
"fuzziness": 0,
|
||||
"max_expansions": 50,
|
||||
},
|
||||
}
|
||||
},
|
||||
"k": 10,
|
||||
"vector": [],
|
||||
"with_row_id": true,
|
||||
"prefilter": true,
|
||||
"version": null
|
||||
});
|
||||
assert_eq!(body, expected_body);
|
||||
|
||||
let data = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
|
||||
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
|
||||
)
|
||||
.unwrap();
|
||||
let response_body = write_ipc_file(&data);
|
||||
http::Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, ARROW_FILE_CONTENT_TYPE)
|
||||
.body(response_body)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let _ = table
|
||||
.query()
|
||||
.full_text_search(FullTextSearchQuery::new_query(
|
||||
MatchQuery::new("hello world".to_owned())
|
||||
.with_column(Some("a".to_owned()))
|
||||
.into(),
|
||||
))
|
||||
.with_row_id()
|
||||
.limit(10)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[rstest]
|
||||
#[case(DEFAULT_SERVER_VERSION.clone())]
|
||||
#[case(semver::Version::new(0, 2, 0))]
|
||||
|
||||
@@ -68,7 +68,7 @@ use crate::query::{
|
||||
use crate::utils::{
|
||||
default_vector_column, supported_bitmap_data_type, supported_btree_data_type,
|
||||
supported_fts_data_type, supported_label_list_data_type, supported_vector_data_type,
|
||||
PatchReadParam, PatchWriteParam,
|
||||
PatchReadParam, PatchWriteParam, TimeoutStream,
|
||||
};
|
||||
|
||||
use self::dataset::DatasetConsistencyWrapper;
|
||||
@@ -1775,11 +1775,14 @@ impl NativeTable {
|
||||
query: &AnyQuery,
|
||||
options: QueryExecutionOptions,
|
||||
) -> Result<DatasetRecordBatchStream> {
|
||||
let plan = self.create_plan(query, options).await?;
|
||||
Ok(DatasetRecordBatchStream::new(execute_plan(
|
||||
plan,
|
||||
Default::default(),
|
||||
)?))
|
||||
let plan = self.create_plan(query, options.clone()).await?;
|
||||
let inner = execute_plan(plan, Default::default())?;
|
||||
let inner = if let Some(timeout) = options.timeout {
|
||||
TimeoutStream::new_boxed(inner, timeout)
|
||||
} else {
|
||||
inner
|
||||
};
|
||||
Ok(DatasetRecordBatchStream::new(inner))
|
||||
}
|
||||
|
||||
/// Check whether the table uses V2 manifest paths.
|
||||
|
||||
@@ -48,7 +48,6 @@ impl DatasetRef {
|
||||
refresh_task,
|
||||
..
|
||||
} => {
|
||||
dataset.checkout_latest().await?;
|
||||
// Replace the refresh task
|
||||
if let Some(refresh_task) = refresh_task {
|
||||
refresh_task.abort();
|
||||
@@ -372,3 +371,48 @@ impl DerefMut for DatasetWriteGuard<'_> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use lance::{dataset::WriteParams, io::ObjectStoreParams};
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::{connect, io::object_store::io_tracking::IoStatsHolder, table::WriteOptions};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_iops_open_strong_consistency() {
|
||||
let db = connect("memory://")
|
||||
.read_consistency_interval(Some(Duration::ZERO))
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to database");
|
||||
let io_stats = IoStatsHolder::default();
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
|
||||
|
||||
let table = db
|
||||
.create_empty_table("test", schema)
|
||||
.write_options(WriteOptions {
|
||||
lance_write_params: Some(WriteParams {
|
||||
store_params: Some(ObjectStoreParams {
|
||||
object_store_wrapper: Some(Arc::new(io_stats.clone())),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
})
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
io_stats.incremental_stats();
|
||||
|
||||
// We should only need 1 read IOP to check the schema: looking for the
|
||||
// latest version.
|
||||
table.schema().await.unwrap();
|
||||
let stats = io_stats.incremental_stats();
|
||||
assert_eq!(stats.read_iops, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,14 +3,20 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::{DataType, Schema};
|
||||
use arrow_array::RecordBatch;
|
||||
use arrow_schema::{DataType, Schema, SchemaRef};
|
||||
use datafusion_common::{DataFusionError, Result as DataFusionResult};
|
||||
use datafusion_execution::RecordBatchStream;
|
||||
use futures::{FutureExt, Stream};
|
||||
use lance::arrow::json::JsonDataType;
|
||||
use lance::dataset::{ReadParams, WriteParams};
|
||||
use lance::index::vector::utils::infer_vector_dim;
|
||||
use lance::io::{ObjectStoreParams, WrappingObjectStore};
|
||||
use lazy_static::lazy_static;
|
||||
use std::pin::Pin;
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
use datafusion_physical_plan::SendableRecordBatchStream;
|
||||
|
||||
lazy_static! {
|
||||
static ref TABLE_NAME_REGEX: regex::Regex = regex::Regex::new(r"^[a-zA-Z0-9_\-\.]+$").unwrap();
|
||||
@@ -135,6 +141,7 @@ pub fn supported_btree_data_type(dtype: &DataType) -> bool {
|
||||
| DataType::Date32
|
||||
| DataType::Date64
|
||||
| DataType::Timestamp(_, _)
|
||||
| DataType::FixedSizeBinary(_)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -151,7 +158,17 @@ pub fn supported_label_list_data_type(dtype: &DataType) -> bool {
|
||||
}
|
||||
|
||||
pub fn supported_fts_data_type(dtype: &DataType) -> bool {
|
||||
matches!(dtype, DataType::Utf8 | DataType::LargeUtf8)
|
||||
supported_fts_data_type_impl(dtype, false)
|
||||
}
|
||||
|
||||
fn supported_fts_data_type_impl(dtype: &DataType, in_list: bool) -> bool {
|
||||
match (dtype, in_list) {
|
||||
(DataType::Utf8 | DataType::LargeUtf8, _) => true,
|
||||
(DataType::List(field) | DataType::LargeList(field), false) => {
|
||||
supported_fts_data_type_impl(field.data_type(), true)
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn supported_vector_data_type(dtype: &DataType) -> bool {
|
||||
@@ -177,11 +194,97 @@ pub fn string_to_datatype(s: &str) -> Option<DataType> {
|
||||
(&json_type).try_into().ok()
|
||||
}
|
||||
|
||||
enum TimeoutState {
|
||||
NotStarted {
|
||||
timeout: std::time::Duration,
|
||||
},
|
||||
Started {
|
||||
deadline: Pin<Box<tokio::time::Sleep>>,
|
||||
timeout: std::time::Duration,
|
||||
},
|
||||
Completed,
|
||||
}
|
||||
|
||||
/// A `Stream` wrapper that implements a timeout.
|
||||
///
|
||||
/// The timeout starts when the first `poll_next` is called. As soon as the timeout
|
||||
/// duration has passed, the stream will return an `Err` indicating a timeout error
|
||||
/// for the next poll.
|
||||
pub struct TimeoutStream {
|
||||
inner: SendableRecordBatchStream,
|
||||
state: TimeoutState,
|
||||
}
|
||||
|
||||
impl TimeoutStream {
|
||||
pub fn new(inner: SendableRecordBatchStream, timeout: std::time::Duration) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
state: TimeoutState::NotStarted { timeout },
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_boxed(
|
||||
inner: SendableRecordBatchStream,
|
||||
timeout: std::time::Duration,
|
||||
) -> SendableRecordBatchStream {
|
||||
Box::pin(Self::new(inner, timeout))
|
||||
}
|
||||
|
||||
fn timeout_error(timeout: &std::time::Duration) -> DataFusionError {
|
||||
DataFusionError::Execution(format!("Query timeout after {} ms", timeout.as_millis()))
|
||||
}
|
||||
}
|
||||
|
||||
impl RecordBatchStream for TimeoutStream {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.inner.schema()
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for TimeoutStream {
|
||||
type Item = DataFusionResult<RecordBatch>;
|
||||
|
||||
fn poll_next(
|
||||
mut self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> std::task::Poll<Option<Self::Item>> {
|
||||
match &mut self.state {
|
||||
TimeoutState::NotStarted { timeout } => {
|
||||
if timeout.is_zero() {
|
||||
return std::task::Poll::Ready(Some(Err(Self::timeout_error(timeout))));
|
||||
}
|
||||
let deadline = Box::pin(tokio::time::sleep(*timeout));
|
||||
self.state = TimeoutState::Started {
|
||||
deadline,
|
||||
timeout: *timeout,
|
||||
};
|
||||
self.poll_next(cx)
|
||||
}
|
||||
TimeoutState::Started { deadline, timeout } => match deadline.poll_unpin(cx) {
|
||||
std::task::Poll::Ready(_) => {
|
||||
let err = Self::timeout_error(timeout);
|
||||
self.state = TimeoutState::Completed;
|
||||
std::task::Poll::Ready(Some(Err(err)))
|
||||
}
|
||||
std::task::Poll::Pending => {
|
||||
let inner = Pin::new(&mut self.inner);
|
||||
inner.poll_next(cx)
|
||||
}
|
||||
},
|
||||
TimeoutState::Completed => std::task::Poll::Ready(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use arrow_array::Int32Array;
|
||||
use arrow_schema::Field;
|
||||
use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
|
||||
use futures::{stream, StreamExt};
|
||||
use tokio::time::sleep;
|
||||
|
||||
use arrow_schema::{DataType, Field};
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_guess_default_column() {
|
||||
@@ -248,4 +351,85 @@ mod tests {
|
||||
let expected = DataType::Int32;
|
||||
assert_eq!(string_to_datatype(string), Some(expected));
|
||||
}
|
||||
|
||||
fn sample_batch() -> RecordBatch {
|
||||
let schema = Arc::new(Schema::new(vec![Field::new(
|
||||
"col1",
|
||||
DataType::Int32,
|
||||
false,
|
||||
)]));
|
||||
RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_timeout_stream() {
|
||||
let batch = sample_batch();
|
||||
let schema = batch.schema();
|
||||
let mock_stream = stream::iter(vec![Ok(batch.clone()), Ok(batch.clone())]);
|
||||
|
||||
let sendable_stream: SendableRecordBatchStream =
|
||||
Box::pin(RecordBatchStreamAdapter::new(schema.clone(), mock_stream));
|
||||
let timeout_duration = std::time::Duration::from_millis(10);
|
||||
let mut timeout_stream = TimeoutStream::new(sendable_stream, timeout_duration);
|
||||
|
||||
// Poll the stream to get the first batch
|
||||
let first_result = timeout_stream.next().await;
|
||||
assert!(first_result.is_some());
|
||||
assert!(first_result.unwrap().is_ok());
|
||||
|
||||
// Sleep for the timeout duration
|
||||
sleep(timeout_duration).await;
|
||||
|
||||
// Poll the stream again and ensure it returns a timeout error
|
||||
let second_result = timeout_stream.next().await.unwrap();
|
||||
assert!(second_result.is_err());
|
||||
assert!(second_result
|
||||
.unwrap_err()
|
||||
.to_string()
|
||||
.contains("Query timeout"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_timeout_stream_zero_duration() {
|
||||
let batch = sample_batch();
|
||||
let schema = batch.schema();
|
||||
let mock_stream = stream::iter(vec![Ok(batch.clone()), Ok(batch.clone())]);
|
||||
|
||||
let sendable_stream: SendableRecordBatchStream =
|
||||
Box::pin(RecordBatchStreamAdapter::new(schema.clone(), mock_stream));
|
||||
|
||||
// Setup similar to test_timeout_stream
|
||||
let timeout_duration = std::time::Duration::from_secs(0);
|
||||
let mut timeout_stream = TimeoutStream::new(sendable_stream, timeout_duration);
|
||||
|
||||
// First poll should immediately return a timeout error
|
||||
let result = timeout_stream.next().await.unwrap();
|
||||
assert!(result.is_err());
|
||||
assert!(result.unwrap_err().to_string().contains("Query timeout"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_timeout_stream_completes_normally() {
|
||||
let batch = sample_batch();
|
||||
let schema = batch.schema();
|
||||
let mock_stream = stream::iter(vec![Ok(batch.clone()), Ok(batch.clone())]);
|
||||
|
||||
let sendable_stream: SendableRecordBatchStream =
|
||||
Box::pin(RecordBatchStreamAdapter::new(schema.clone(), mock_stream));
|
||||
|
||||
// Setup a stream with 2 batches
|
||||
// Use a longer timeout that won't trigger
|
||||
let timeout_duration = std::time::Duration::from_secs(1);
|
||||
let mut timeout_stream = TimeoutStream::new(sendable_stream, timeout_duration);
|
||||
|
||||
// Both polls should return data normally
|
||||
assert!(timeout_stream.next().await.unwrap().is_ok());
|
||||
assert!(timeout_stream.next().await.unwrap().is_ok());
|
||||
// Stream should be empty now
|
||||
assert!(timeout_stream.next().await.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user