Mirror of https://github.com/lancedb/lancedb.git (synced 2025-12-23 13:29:57 +00:00)

Compare commits: python-v0. ... python-v0.

21 Commits
| SHA1 |
|---|
| 7a15337e03 |
| 96c66fd087 |
| 0579303602 |
| 75edb8756c |
| 88283110f4 |
| b3a637fdeb |
| ce24457531 |
| 087fe6343d |
| ab8cbe62dd |
| f076bb41f4 |
| 902fb83d54 |
| 779118339f |
| 03b62599d7 |
| 4c999fb651 |
| 6d23d32ab5 |
| 704cec34e1 |
| a300a238db |
| a41ff1df0a |
| 77b005d849 |
| 167fccc427 |
| 2bffbcefa5 |
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.21.1-beta.0"
+current_version = "0.21.2-beta.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
Cargo.lock (generated, 153 changes)
@@ -1209,6 +1209,31 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "bon"
+version = "3.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f61138465baf186c63e8d9b6b613b508cd832cba4ce93cf37ce5f096f91ac1a6"
+dependencies = [
+ "bon-macros",
+ "rustversion",
+]
+
+[[package]]
+name = "bon-macros"
+version = "3.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40d1dad34aa19bf02295382f08d9bc40651585bd497266831d40ee6296fb49ca"
+dependencies = [
+ "darling",
+ "ident_case",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn 2.0.103",
+]
+
 [[package]]
 name = "brotli"
 version = "3.5.0"
@@ -2490,9 +2515,9 @@ dependencies = [
 
 [[package]]
 name = "downcast-rs"
-version = "1.2.1"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
+checksum = "ea8a8b81cacc08888170eef4d13b775126db426d0b348bee9d18c2c1eaf123cf"
 
 [[package]]
 name = "dunce"
@@ -2815,8 +2840,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
 
 [[package]]
 name = "fsst"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "rand 0.8.5",
 ]
@@ -3765,9 +3790,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
 dependencies = [
  "cfg-if",
- "js-sys",
- "wasm-bindgen",
- "web-sys",
 ]
 
 [[package]]
@@ -3908,8 +3930,8 @@ dependencies = [
 
 [[package]]
 name = "lance"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow",
  "arrow-arith",
@@ -3971,8 +3993,8 @@ dependencies = [
 
 [[package]]
 name = "lance-arrow"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -3989,8 +4011,8 @@ dependencies = [
 
 [[package]]
 name = "lance-core"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4025,8 +4047,8 @@ dependencies = [
 
 [[package]]
 name = "lance-datafusion"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4054,8 +4076,8 @@ dependencies = [
 
 [[package]]
 name = "lance-datagen"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4071,8 +4093,8 @@ dependencies = [
 
 [[package]]
 name = "lance-encoding"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrayref",
  "arrow",
@@ -4111,8 +4133,8 @@ dependencies = [
 
 [[package]]
 name = "lance-file"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4146,8 +4168,8 @@ dependencies = [
 
 [[package]]
 name = "lance-index"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4201,8 +4223,8 @@ dependencies = [
 
 [[package]]
 name = "lance-io"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow",
  "arrow-arith",
@@ -4240,10 +4262,11 @@ dependencies = [
 
 [[package]]
 name = "lance-linalg"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow-array",
+ "arrow-buffer",
  "arrow-ord",
  "arrow-schema",
  "bitvec",
@@ -4263,8 +4286,8 @@ dependencies = [
 
 [[package]]
 name = "lance-table"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4302,8 +4325,8 @@ dependencies = [
 
 [[package]]
 name = "lance-testing"
-version = "0.31.1"
-source = "git+https://github.com/lancedb/lance.git?tag=v0.31.1-beta.2#dff098a5aa66866197cfcd7ae7ca004aed02928f"
+version = "0.31.2"
+source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
 dependencies = [
  "arrow-array",
  "arrow-schema",
@@ -4314,7 +4337,7 @@ dependencies = [
 
 [[package]]
 name = "lancedb"
-version = "0.21.1-beta.0"
+version = "0.21.2-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4401,7 +4424,7 @@ dependencies = [
 
 [[package]]
 name = "lancedb-node"
-version = "0.21.1-beta.0"
+version = "0.21.2-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
@@ -4426,7 +4449,7 @@ dependencies = [
 
 [[package]]
 name = "lancedb-nodejs"
-version = "0.21.1-beta.0"
+version = "0.21.2-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
@@ -4446,7 +4469,7 @@ dependencies = [
 
 [[package]]
 name = "lancedb-python"
-version = "0.24.1-beta.0"
+version = "0.24.2-beta.0"
 dependencies = [
  "arrow",
  "env_logger",
@@ -4721,11 +4744,10 @@ dependencies = [
 
 [[package]]
 name = "measure_time"
-version = "0.8.3"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc"
+checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e"
 dependencies = [
- "instant",
  "log",
 ]
 
@@ -5259,9 +5281,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
 [[package]]
 name = "ownedbytes"
-version = "0.7.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558"
+checksum = "2fbd56f7631767e61784dc43f8580f403f4475bd4aaa4da003e6295e1bab4a7e"
 dependencies = [
  "stable_deref_trait",
 ]
@@ -7055,9 +7077,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
 
 [[package]]
 name = "sketches-ddsketch"
-version = "0.2.2"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c"
+checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a"
 dependencies = [
  "serde",
 ]
@@ -7387,14 +7409,15 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
 
 [[package]]
 name = "tantivy"
-version = "0.22.0"
+version = "0.24.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856"
+checksum = "ca2374a21157427c5faff2d90930f035b6c22a5d7b0e5b0b7f522e988ef33c06"
 dependencies = [
  "aho-corasick",
  "arc-swap",
  "base64 0.22.1",
  "bitpacking",
+ "bon",
  "byteorder",
  "census",
  "crc32fast",
@@ -7404,20 +7427,20 @@ dependencies = [
  "fnv",
  "fs4",
  "htmlescape",
- "itertools 0.12.1",
+ "hyperloglogplus",
+ "itertools 0.14.0",
  "levenshtein_automata",
  "log",
  "lru",
  "lz4_flex",
  "measure_time",
  "memmap2 0.9.5",
- "num_cpus",
  "once_cell",
  "oneshot",
  "rayon",
  "regex",
  "rust-stemmers",
- "rustc-hash 1.1.0",
+ "rustc-hash 2.1.1",
  "serde",
  "serde_json",
  "sketches-ddsketch",
@@ -7430,7 +7453,7 @@ dependencies = [
  "tantivy-stacker",
  "tantivy-tokenizer-api",
  "tempfile",
- "thiserror 1.0.69",
+ "thiserror 2.0.12",
  "time",
  "uuid",
  "winapi",
@@ -7438,22 +7461,22 @@ dependencies = [
 
 [[package]]
 name = "tantivy-bitpacker"
-version = "0.6.0"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df"
+checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494"
 dependencies = [
  "bitpacking",
 ]
 
 [[package]]
 name = "tantivy-columnar"
-version = "0.3.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e"
+checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344"
 dependencies = [
  "downcast-rs",
  "fastdivide",
- "itertools 0.12.1",
+ "itertools 0.14.0",
  "serde",
  "tantivy-bitpacker",
  "tantivy-common",
@@ -7463,9 +7486,9 @@ dependencies = [
 
 [[package]]
 name = "tantivy-common"
-version = "0.7.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4"
+checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f"
 dependencies = [
  "async-trait",
  "byteorder",
@@ -7487,19 +7510,23 @@ dependencies = [
 
 [[package]]
 name = "tantivy-query-grammar"
-version = "0.22.0"
+version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82"
+checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a"
 dependencies = [
  "nom",
+ "serde",
+ "serde_json",
 ]
 
 [[package]]
 name = "tantivy-sstable"
-version = "0.3.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e"
+checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416"
 dependencies = [
+ "futures-util",
+ "itertools 0.14.0",
  "tantivy-bitpacker",
  "tantivy-common",
  "tantivy-fst",
@@ -7508,9 +7535,9 @@ dependencies = [
 
 [[package]]
 name = "tantivy-stacker"
-version = "0.3.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8"
+checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1"
 dependencies = [
  "murmurhash32",
  "rand_distr 0.4.3",
@@ -7519,9 +7546,9 @@ dependencies = [
 
 [[package]]
 name = "tantivy-tokenizer-api"
-version = "0.3.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04"
+checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d"
 dependencies = [
  "serde",
 ]
Cargo.toml (18 changes)

@@ -21,14 +21,16 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"
 
 [workspace.dependencies]
-lance = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git", features = ["dynamodb"] }
-lance-io = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-index = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-linalg = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-table = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-testing = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-datafusion = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-encoding = { "version" = "=0.31.1", tag="v0.31.1-beta.2", git="https://github.com/lancedb/lance.git" }
+lance = { "version" = "=0.31.2", "features" = [
+    "dynamodb",
+], "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-io = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-index = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-linalg = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-table = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-testing = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-datafusion = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-encoding = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
 # Note that this one does not include pyarrow
 arrow = { version = "55.1", optional = false }
 arrow-array = "55.1"
@@ -47,10 +47,10 @@ def extract_features(line: str) -> list:
     """
     import re
 
-    match = re.search(r'"features"\s*=\s*\[(.*?)\]', line)
+    match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
     if match:
         features_str = match.group(1)
-        return [f.strip('"') for f in features_str.split(",")]
+        return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
     return []
 
 
@@ -63,10 +63,24 @@ def update_cargo_toml(line_updater):
         lines = f.readlines()
 
     new_lines = []
+    lance_line = ""
+    is_parsing_lance_line = False
     for line in lines:
         if line.startswith("lance"):
             # Update the line using the provided function
-            new_lines.append(line_updater(line))
+            if line.strip().endswith("}"):
+                new_lines.append(line_updater(line))
+            else:
+                lance_line = line
+                is_parsing_lance_line = True
+        elif is_parsing_lance_line:
+            lance_line += line
+            if line.strip().endswith("}"):
+                new_lines.append(line_updater(lance_line))
+                lance_line = ""
+                is_parsing_lance_line = False
+            else:
+                print("doesn't end with }:", line)
         else:
             # Keep the line unchanged
             new_lines.append(line)
docs/package-lock.json (generated, 12 changes)

@@ -19,7 +19,7 @@
       },
       "../node": {
         "name": "vectordb",
-        "version": "0.12.0",
+        "version": "0.21.2-beta.0",
         "cpu": [
           "x64",
           "arm64"
@@ -65,11 +65,11 @@
         "uuid": "^9.0.0"
       },
       "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.12.0",
-        "@lancedb/vectordb-darwin-x64": "0.12.0",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
-        "@lancedb/vectordb-linux-x64-gnu": "0.12.0",
-        "@lancedb/vectordb-win32-x64-msvc": "0.12.0"
+        "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
+        "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
+        "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
+        "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
       },
       "peerDependencies": {
         "@apache-arrow/ts": "^14.0.2",
@@ -1,7 +1,9 @@
+# SQL Querying
+
 You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
 This guide will show how to query Lance tables them using both.
 
-We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
+We will re-use the dataset [created previously](./tables.md):
 
 ```python
 import lancedb
@@ -27,15 +29,10 @@ arrow_table = table.to_lance()
 duckdb.query("SELECT * FROM arrow_table")
 ```
 
-```
-┌─────────────┬─────────┬────────┐
-│   vector    │  item   │ price  │
-│   float[]   │ varchar │ double │
-├─────────────┼─────────┼────────┤
-│ [3.1, 4.1]  │ foo     │ 10.0   │
-│ [5.9, 26.5] │ bar     │ 20.0   │
-└─────────────┴─────────┴────────┘
-```
+| vector      | item | price |
+| ----------- | ---- | ----- |
+| [3.1, 4.1]  | foo  | 10.0  |
+| [5.9, 26.5] | bar  | 20.0  |
 
 ## Querying a LanceDB Table with Apache Datafusion
 
@@ -57,12 +54,7 @@ Register the table created with the Datafusion session context.
 --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
 ```
 
-```
-┌─────────────┬─────────┬────────┐
-│   vector    │  item   │ price  │
-│   float[]   │ varchar │ double │
-├─────────────┼─────────┼────────┤
-│ [3.1, 4.1]  │ foo     │ 10.0   │
-│ [5.9, 26.5] │ bar     │ 20.0   │
-└─────────────┴─────────┴────────┘
-```
+| vector      | item | price |
+| ----------- | ---- | ----- |
+| [3.1, 4.1]  | foo  | 10.0  |
+| [5.9, 26.5] | bar  | 20.0  |
@@ -41,6 +41,7 @@ Creates an instance of MatchQuery.
 - `fuzziness`: The fuzziness level for the query (default is 0).
 - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
 - `operator`: The logical operator to use for combining terms in the query (default is "OR").
+- `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
 
 * **options.boost?**: `number`
 

@@ -50,6 +51,8 @@ Creates an instance of MatchQuery.
 
 * **options.operator?**: [`Operator`](../enumerations/Operator.md)
 
+* **options.prefixLength?**: `number`
+
 #### Returns
 
 [`MatchQuery`](MatchQuery.md)
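A hedged sketch of the new option in use (not taken from this diff: the open table `tbl` and the column name `"text"` are illustrative, and the `new MatchQuery(query, column, options)` shape is assumed from the parameter list above):

```ts
import { MatchQuery } from "@lancedb/lancedb";

// Fuzzy full-text match that tolerates one edit per term but keeps the
// first characters fixed, which prunes the fuzzy candidate set.
const query = new MatchQuery("serch", "text", {
  fuzziness: 1,
  prefixLength: 2, // the leading "se" must match exactly
});
const rows = await tbl.search(query).limit(10).toArray();
```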
@@ -612,7 +612,7 @@ of the given query
 
 #### Parameters
 
-* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
+* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
     the query, a vector or string
 
 * **queryType?**: `string`

@@ -799,7 +799,7 @@ by `query`.
 
 #### Parameters
 
-* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
+* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
 
 #### Returns
 
@@ -386,6 +386,53 @@ called then every valid row from the table will be returned.
 
 ***
 
+### maximumNprobes()
+
+```ts
+maximumNprobes(maximumNprobes): VectorQuery
+```
+
+Set the maximum number of probes used.
+
+This controls the maximum number of partitions that will be searched. If this
+number is greater than minimumNprobes then the excess partitions will _only_ be
+searched if we have not found enough results. This can be useful when there is
+a narrow filter to allow these queries to spend more time searching and avoid
+potential false negatives.
+
+#### Parameters
+
+* **maximumNprobes**: `number`
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
+
+***
+
+### minimumNprobes()
+
+```ts
+minimumNprobes(minimumNprobes): VectorQuery
+```
+
+Set the minimum number of probes used.
+
+This controls the minimum number of partitions that will be searched. This
+parameter will impact every query against a vector index, regardless of the
+filter. See `nprobes` for more details. Higher values will increase recall
+but will also increase latency.
+
+#### Parameters
+
+* **minimumNprobes**: `number`
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
+
+***
+
 ### nprobes()
 
 ```ts

@@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against
 your actual data to find the smallest possible value that will still give
 you the desired recall.
 
+For more fine grained control over behavior when you have a very narrow filter
+you can use `minimumNprobes` and `maximumNprobes`. This method sets both
+the minimum and maximum to the same value.
+
 #### Parameters
 
 * **nprobes**: `number`
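Taken together, `minimumNprobes` and `maximumNprobes` bracket the search effort. A sketch of how they might combine with a narrow filter (the table handle, column data, and filter are illustrative, not from this diff):

```ts
// Always probe at least 20 partitions; allow up to 100 when the filter
// is so selective that the first partitions yield too few rows.
const results = await tbl
  .search([0.1, 0.2, 0.3, 0.4])
  .where("category = 'rare'")
  .minimumNprobes(20)
  .maximumNprobes(100)
  .limit(10)
  .toArray();
```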
@@ -10,6 +10,7 @@ Enum representing the occurrence of terms in full-text queries.
 
 - `Must`: The term must be present in the document.
 - `Should`: The term should contribute to the document score, but is not required.
+- `MustNot`: The term must not be present in the document.
 
 ## Enumeration Members
 

@@ -21,6 +22,14 @@ Must: "MUST";
 
 ***
 
+### MustNot
+
+```ts
+MustNot: "MUST_NOT";
+```
+
+***
+
 ### Should
 
 ```ts
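A sketch of `Occur.MustNot` in a compound full-text query. Only `MustNot: "MUST_NOT"` itself comes from this diff; the `BooleanQuery` pairing of an `Occur` value with each subquery is an assumption here, mirroring how the Python API composes full-text clauses:

```ts
import { BooleanQuery, MatchQuery, Occur } from "@lancedb/lancedb";

// Match "laptop" while excluding anything that mentions "refurbished".
const q = new BooleanQuery([
  [Occur.Must, new MatchQuery("laptop", "description")],
  [Occur.MustNot, new MatchQuery("refurbished", "description")],
]);
const rows = await tbl.search(q).limit(10).toArray();
```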
@@ -84,6 +84,7 @@
 - [FieldLike](type-aliases/FieldLike.md)
 - [IntoSql](type-aliases/IntoSql.md)
 - [IntoVector](type-aliases/IntoVector.md)
+- [MultiVector](type-aliases/MultiVector.md)
 - [RecordBatchLike](type-aliases/RecordBatchLike.md)
 - [SchemaLike](type-aliases/SchemaLike.md)
 - [TableLike](type-aliases/TableLike.md)
@@ -23,7 +23,7 @@ whether to remove punctuation
 ### baseTokenizer?
 
 ```ts
-optional baseTokenizer: "raw" | "simple" | "whitespace";
+optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
 ```
 
 The tokenizer to use when building the index.

@@ -71,6 +71,36 @@ tokens longer than this length will be ignored
 
 ***
 
+### ngramMaxLength?
+
+```ts
+optional ngramMaxLength: number;
+```
+
+ngram max length
+
+***
+
+### ngramMinLength?
+
+```ts
+optional ngramMinLength: number;
+```
+
+ngram min length
+
+***
+
+### prefixOnly?
+
+```ts
+optional prefixOnly: boolean;
+```
+
+whether to only index the prefix of the token for ngram tokenizer
+
+***
+
 ### removeStopWords?
 
 ```ts
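A sketch of building a full-text index with the new ngram options; the option names come from the fields documented above, while the `createIndex`/`Index.fts` call shape and the table handle are assumptions:

```ts
import * as lancedb from "@lancedb/lancedb";

// Index 3-grams of each token so substring-style queries can match
// mid-word; prefixOnly = false indexes every ngram, not just leading ones.
await tbl.createIndex("text", {
  config: lancedb.Index.fts({
    baseTokenizer: "ngram",
    ngramMinLength: 3,
    ngramMaxLength: 3,
    prefixOnly: false,
  }),
});
```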
@@ -24,10 +24,10 @@ The default is 7 days
 // Delete all versions older than 1 day
 const olderThan = new Date();
 olderThan.setDate(olderThan.getDate() - 1));
-tbl.cleanupOlderVersions(olderThan);
+tbl.optimize({cleanupOlderThan: olderThan});
 
 // Delete all versions except the current version
-tbl.cleanupOlderVersions(new Date());
+tbl.optimize({cleanupOlderThan: new Date()});
 ```
 
 ***
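For reference, a runnable form of the updated example; the stray closing parenthesis in the quoted snippet is dropped here, and `tbl` is assumed to be an open table:

```ts
// Prune all table versions older than one day via optimize(), which
// replaces the cleanupOlderVersions() call shown in the old docs.
const olderThan = new Date();
olderThan.setDate(olderThan.getDate() - 1);
await tbl.optimize({ cleanupOlderThan: olderThan });
```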
docs/src/js/type-aliases/MultiVector.md (new file, 11 lines)

@@ -0,0 +1,11 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / MultiVector

# Type Alias: MultiVector

```ts
type MultiVector: IntoVector[];
```
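Since `MultiVector` is just `IntoVector[]`, a multivector query is an array of query vectors. A sketch (table handle and dimensions are illustrative, not from this diff):

```ts
// Query with several vectors at once; accepted by search()/nearestTo()
// per the widened parameter types above.
const multiQuery: number[][] = [
  [0.1, 0.2],
  [0.3, 0.4],
];
const hits = await tbl.search(multiQuery).limit(5).toArray();
```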
@@ -30,7 +30,8 @@ excluded_globs = [
     "../src/rag/advanced_techniques/*.md",
     "../src/guides/scalar_index.md",
     "../src/guides/storage.md",
-    "../src/search.md"
+    "../src/search.md",
+    "../src/guides/sql_querying.md",
 ]
 
 python_prefix = "py"
java/.mvn/wrapper/maven-wrapper.properties (vendored, new file, 19 lines)

@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
wrapperVersion=3.3.2
distributionType=only-script
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.9/apache-maven-3.9.9-bin.zip
java/README.md (new file, 37 lines)

@@ -0,0 +1,37 @@
# LanceDB Java SDK

## Configuration and Initialization

### LanceDB Cloud

For LanceDB Cloud, use the simplified builder API:

```java
import com.lancedb.lance.namespace.LanceRestNamespace;

// If your DB url is db://example-db, then your database here is example-db
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
    .apiKey("your_lancedb_cloud_api_key")
    .database("your_database_name")
    .build();
```

### LanceDB Enterprise

For Enterprise deployments, use your VPC endpoint:

```java
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
    .apiKey("your_lancedb_enterprise_api_key")
    .database("your-top-dir") // Your top level folder under your cloud bucket, e.g. s3://your-bucket/your-top-dir/
    .hostOverride("http://<vpc_endpoint_dns_name>:80")
    .build();
```

## Development

Build:

```shell
./mvnw install
```
@@ -8,18 +8,24 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.21.1-beta.0</version>
+    <version>0.21.2-beta.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
   <artifactId>lancedb-core</artifactId>
-  <name>LanceDB Core</name>
+  <name>${project.artifactId}</name>
+  <description>LanceDB Core</description>
   <packaging>jar</packaging>
   <properties>
     <rust.release.build>false</rust.release.build>
   </properties>
 
   <dependencies>
+    <dependency>
+      <groupId>com.lancedb</groupId>
+      <artifactId>lance-namespace-core</artifactId>
+      <version>0.0.1</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-vector</artifactId>
java/lance-namespace/pom.xml (new file, 26 lines)

@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <parent>
    <groupId>com.lancedb</groupId>
    <artifactId>lancedb-parent</artifactId>
    <version>0.21.2-beta.0</version>
    <relativePath>../pom.xml</relativePath>
  </parent>

  <artifactId>lancedb-lance-namespace</artifactId>
  <name>${project.artifactId}</name>
  <description>LanceDB Java Integration with Lance Namespace</description>
  <packaging>jar</packaging>

  <dependencies>
    <dependency>
      <groupId>com.lancedb</groupId>
      <artifactId>lance-namespace-core</artifactId>
    </dependency>
  </dependencies>
</project>
@@ -0,0 +1,146 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.lancedb.lancedb;

import com.lancedb.lance.namespace.LanceRestNamespace;
import com.lancedb.lance.namespace.client.apache.ApiClient;

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

/** Util class to help construct a {@link LanceRestNamespace} for LanceDB. */
public class LanceDbRestNamespaces {
  private static final String DEFAULT_REGION = "us-east-1";
  private static final String CLOUD_URL_PATTERN = "https://%s.%s.api.lancedb.com";

  private String apiKey;
  private String database;
  private Optional<String> hostOverride = Optional.empty();
  private Optional<String> region = Optional.empty();
  private Map<String, String> additionalConfig = new HashMap<>();

  private LanceDbRestNamespaces() {}

  /**
   * Create a new builder instance.
   *
   * @return A new LanceRestNamespaceBuilder
   */
  public static LanceDbRestNamespaces builder() {
    return new LanceDbRestNamespaces();
  }

  /**
   * Set the API key (required).
   *
   * @param apiKey The LanceDB API key
   * @return This builder
   */
  public LanceDbRestNamespaces apiKey(String apiKey) {
    if (apiKey == null || apiKey.trim().isEmpty()) {
      throw new IllegalArgumentException("API key cannot be null or empty");
    }
    this.apiKey = apiKey;
    return this;
  }

  /**
   * Set the database name (required).
   *
   * @param database The database name
   * @return This builder
   */
  public LanceDbRestNamespaces database(String database) {
    if (database == null || database.trim().isEmpty()) {
      throw new IllegalArgumentException("Database cannot be null or empty");
    }
    this.database = database;
    return this;
  }

  /**
   * Set a custom host override (optional). When set, this overrides the default LanceDB Cloud URL
   * construction. Use this for LanceDB Enterprise deployments.
   *
   * @param hostOverride The complete base URL (e.g., "http://your-vpc-endpoint:80")
   * @return This builder
   */
  public LanceDbRestNamespaces hostOverride(String hostOverride) {
    this.hostOverride = Optional.ofNullable(hostOverride);
    return this;
  }

  /**
   * Set the region for LanceDB Cloud (optional). Defaults to "us-east-1" if not specified. This is
   * ignored when hostOverride is set.
   *
   * @param region The AWS region (e.g., "us-east-1", "eu-west-1")
   * @return This builder
   */
  public LanceDbRestNamespaces region(String region) {
    this.region = Optional.ofNullable(region);
    return this;
  }

  /**
   * Add additional configuration parameters.
   *
   * @param key The configuration key
   * @param value The configuration value
   * @return This builder
   */
  public LanceDbRestNamespaces config(String key, String value) {
    this.additionalConfig.put(key, value);
    return this;
  }

  /**
   * Build the LanceRestNamespace instance.
   *
   * @return A configured LanceRestNamespace
   * @throws IllegalStateException if required parameters are missing
   */
  public LanceRestNamespace build() {
    // Validate required fields
    if (apiKey == null) {
      throw new IllegalStateException("API key is required");
    }
    if (database == null) {
      throw new IllegalStateException("Database is required");
    }

    // Build configuration map
    Map<String, String> config = new HashMap<>(additionalConfig);
    config.put("headers.x-lancedb-database", database);
    config.put("headers.x-api-key", apiKey);

    // Determine base URL
    String baseUrl;
    if (hostOverride.isPresent()) {
      baseUrl = hostOverride.get();
      config.put("host_override", hostOverride.get());
    } else {
      String effectiveRegion = region.orElse(DEFAULT_REGION);
      baseUrl = String.format(CLOUD_URL_PATTERN, database, effectiveRegion);
      config.put("region", effectiveRegion);
    }

    // Create and configure ApiClient
    ApiClient apiClient = new ApiClient();
    apiClient.setBasePath(baseUrl);

    return new LanceRestNamespace(apiClient, config);
  }
}
java/mvnw (vendored, executable, new file, 259 lines; the diff is cut off partway through this file)

@@ -0,0 +1,259 @@
#!/bin/sh
# ----------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------

# ----------------------------------------------------------------------------
# Apache Maven Wrapper startup batch script, version 3.3.2
#
# Optional ENV vars
# -----------------
#   JAVA_HOME - location of a JDK home dir, required when download maven via java source
#   MVNW_REPOURL - repo url base for downloading maven distribution
#   MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
#   MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output
# ----------------------------------------------------------------------------

set -euf
[ "${MVNW_VERBOSE-}" != debug ] || set -x

# OS specific support.
native_path() { printf %s\\n "$1"; }
case "$(uname)" in
CYGWIN* | MINGW*)
  [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")"
  native_path() { cygpath --path --windows "$1"; }
  ;;
esac

# set JAVACMD and JAVACCMD
set_java_home() {
  # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched
  if [ -n "${JAVA_HOME-}" ]; then
    if [ -x "$JAVA_HOME/jre/sh/java" ]; then
      # IBM's JDK on AIX uses strange locations for the executables
      JAVACMD="$JAVA_HOME/jre/sh/java"
      JAVACCMD="$JAVA_HOME/jre/sh/javac"
    else
      JAVACMD="$JAVA_HOME/bin/java"
      JAVACCMD="$JAVA_HOME/bin/javac"

      if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then
        echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2
        echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2
        return 1
      fi
    fi
  else
    JAVACMD="$(
      'set' +e
      'unset' -f command 2>/dev/null
      'command' -v java
    )" || :
    JAVACCMD="$(
      'set' +e
      'unset' -f command 2>/dev/null
      'command' -v javac
    )" || :

    if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then
      echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2
      return 1
    fi
  fi
}

# hash string like Java String::hashCode
hash_string() {
  str="${1:-}" h=0
  while [ -n "$str" ]; do
    char="${str%"${str#?}"}"
    h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296))
    str="${str#?}"
  done
  printf %x\\n $h
}

verbose() { :; }
[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; }

die() {
  printf %s\\n "$1" >&2
  exit 1
}

trim() {
  # MWRAPPER-139:
  #   Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds.
  #   Needed for removing poorly interpreted newline sequences when running in more
  #   exotic environments such as mingw bash on Windows.
  printf "%s" "${1}" | tr -d '[:space:]'
}

# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties
while IFS="=" read -r key value; do
  case "${key-}" in
  distributionUrl) distributionUrl=$(trim "${value-}") ;;
  distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;;
  esac
done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties"
[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties"

case "${distributionUrl##*/}" in
maven-mvnd-*bin.*)
  MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/
  case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in
  *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;;
  :Darwin*x86_64) distributionPlatform=darwin-amd64 ;;
  :Darwin*arm64) distributionPlatform=darwin-aarch64 ;;
  :Linux*x86_64*) distributionPlatform=linux-amd64 ;;
  *)
    echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2
    distributionPlatform=linux-amd64
    ;;
  esac
  distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip"
  ;;
maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;;
*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;;
esac

# apply MVNW_REPOURL and calculate MAVEN_HOME
# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}"
distributionUrlName="${distributionUrl##*/}"
distributionUrlNameMain="${distributionUrlName%.*}"
distributionUrlNameMain="${distributionUrlNameMain%-bin}"
MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}"
MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")"

exec_maven() {
  unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || :
  exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD"
}

if [ -d "$MAVEN_HOME" ]; then
  verbose "found existing MAVEN_HOME at $MAVEN_HOME"
  exec_maven "$@"
fi

case "${distributionUrl-}" in
*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;;
*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;;
esac

# prepare tmp dir
if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then
  clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; }
  trap clean HUP INT TERM EXIT
else
  die "cannot create temp dir"
fi

mkdir -p -- "${MAVEN_HOME%/*}"

# Download and Install Apache Maven
verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
verbose "Downloading from: $distributionUrl"
verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"

# select .zip or .tar.gz
if ! command -v unzip >/dev/null; then
  distributionUrl="${distributionUrl%.zip}.tar.gz"
  distributionUrlName="${distributionUrl##*/}"
fi

# verbose opt
__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR=''
[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v

# normalize http auth
case "${MVNW_PASSWORD:+has-password}" in
'') MVNW_USERNAME='' MVNW_PASSWORD='' ;;
has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;;
esac

if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then
  verbose "Found wget ... using wget"
  wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl"
elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then
  verbose "Found curl ... using curl"
  curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl"
elif set_java_home; then
  verbose "Falling back to use Java to download"
  javaSource="$TMP_DOWNLOAD_DIR/Downloader.java"
  targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName"
  cat >"$javaSource" <<-END
public class Downloader extends java.net.Authenticator
{
    protected java.net.PasswordAuthentication getPasswordAuthentication()
    {
        return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() );
    }
    public static void main( String[] args ) throws Exception
    {
        setDefault( new Downloader() );
        java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() );
    }
}
END
  # For Cygwin/MinGW, switch paths to Windows format before running javac and java
  verbose " - Compiling Downloader.java ..."
  "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java"
  verbose " - Running Downloader.java ..."
  "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")"
fi

# If specified, validate the SHA-256 sum of the Maven distribution zip file
if [ -n "${distributionSha256Sum-}" ]; then
  distributionSha256Result=false
  if [ "$MVN_CMD" = mvnd.sh ]; then
    echo "Checksum validation is not supported for maven-mvnd." >&2
    echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
    exit 1
  elif command -v sha256sum >/dev/null; then
    if echo "$distributionSha256Sum  $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then
      distributionSha256Result=true
    fi
  elif command -v shasum >/dev/null; then
    if echo "$distributionSha256Sum  $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then
      distributionSha256Result=true
    fi
  else
    echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
    echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
    exit 1
  fi
  if [ $distributionSha256Result = false ]; then
    echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2
    echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2
    exit 1
  fi
fi

# unzip and move
if command -v unzip >/dev/null; then
  unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip"
else
  tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR"
|
||||||
|
fi
|
||||||
|
printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url"
|
||||||
|
mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME"
|
||||||
|
|
||||||
|
clean || :
|
||||||
|
exec_maven "$@"
|
||||||
14 java/pom.xml

@@ -6,11 +6,10 @@

   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.21.1-beta.0</version>
+  <version>0.21.2-beta.0</version>
   <packaging>pom</packaging>
-  <name>LanceDB Parent</name>
-  <description>LanceDB vector database Java API</description>
+  <name>${project.artifactId}</name>
+  <description>LanceDB Java SDK Parent POM</description>
   <url>http://lancedb.com/</url>

   <developers>
@@ -29,6 +28,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <arrow.version>15.0.0</arrow.version>
+    <lance-namespace.verison>0.0.1</lance-namespace.verison>
     <spotless.skip>false</spotless.skip>
     <spotless.version>2.30.0</spotless.version>
     <spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
@@ -52,6 +52,7 @@

   <modules>
     <module>core</module>
+    <module>lance-namespace</module>
   </modules>

   <scm>
@@ -62,6 +63,11 @@

   <dependencyManagement>
     <dependencies>
+      <dependency>
+        <groupId>com.lancedb</groupId>
+        <artifactId>lance-namespace-core</artifactId>
+        <version>${lance-namespace.verison}</version>
+      </dependency>
       <dependency>
         <groupId>org.apache.arrow</groupId>
         <artifactId>arrow-vector</artifactId>

44 node/package-lock.json generated

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.21.1-beta.0",
+      "version": "0.21.2-beta.0",
       "cpu": [
         "x64",
         "arm64"
@@ -52,11 +52,11 @@
         "uuid": "^9.0.0"
       },
       "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.21.1-beta.0",
-        "@lancedb/vectordb-darwin-x64": "0.21.1-beta.0",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.21.1-beta.0",
-        "@lancedb/vectordb-linux-x64-gnu": "0.21.1-beta.0",
-        "@lancedb/vectordb-win32-x64-msvc": "0.21.1-beta.0"
+        "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
+        "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
+        "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
+        "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
       },
       "peerDependencies": {
         "@apache-arrow/ts": "^14.0.2",
@@ -327,9 +327,9 @@
       }
     },
     "node_modules/@lancedb/vectordb-darwin-arm64": {
-      "version": "0.21.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.1-beta.0.tgz",
-      "integrity": "sha512-easypFtN4rFFsSNumFLK/VEhD2BVp+jl6ysICGyutjD/UEiulVdhixBkK5miJOfu/1p67Rjit5C8u3acpX+k2g==",
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2-beta.0.tgz",
+      "integrity": "sha512-RiYqpKuq9v8A4wFuHt1iPNFYjWJ1KgGFLJwQO4ajp9Hee84sDHq8mP0ATgMcc24hiaOUQ1lRRTULjGbHn4NIYw==",
       "cpu": [
         "arm64"
       ],
@@ -340,9 +340,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-darwin-x64": {
-      "version": "0.21.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.1-beta.0.tgz",
-      "integrity": "sha512-ez//lKtXu7EWgZlUYgwBM2We4/ty8rOtkDMF3RlveWJAKn+zNX0UM3vTa9W7WbCcBn9Ycs3eQGrBvb0iYFIDgw==",
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2-beta.0.tgz",
+      "integrity": "sha512-togdP0YIjMYg/hBRMMxW434i5VB789JWU5o3hWrodbX8olEc0Txqw5Dg9CgIOldBIiCti6uTSQiTo6uldZon1w==",
       "cpu": [
         "x64"
       ],
@@ -353,9 +353,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.21.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.1-beta.0.tgz",
-      "integrity": "sha512-T+vfr3A/59V8JMB5vonUmFDE8Vcf7Qe+DhQMf6kUlQxx80TujMeTdkaOf9/zBAopN2T8Y2h+GNScjl/WomYOFg==",
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2-beta.0.tgz",
+      "integrity": "sha512-ErS4IQDQVTYVATPeOj/dZXQR34eZQ5rAXm3vJdQi5K6X4zCDaIjOhpmnwzPBGT9W1idaBAoDJhtNfsFaJ6/PQQ==",
       "cpu": [
         "arm64"
       ],
@@ -366,9 +366,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.21.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.1-beta.0.tgz",
-      "integrity": "sha512-FpDd4g2+xGrU41gywx4KFPGOlpBZq3VrE+4BBiTrRW6IO5Kbs2Mmq7ufJuDLlLqPs6ZQ5/Wlbcq5PmdRSoeq8A==",
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2-beta.0.tgz",
+      "integrity": "sha512-ycDpyBGbfxtnGGa/RQo5+So6dHALiem1pbYc/LDKKluUJpadtXtEwC61o6hZTcejoYjhEE8ET7vA3OCEJfMFaw==",
       "cpu": [
         "x64"
       ],
@@ -379,9 +379,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.21.1-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.1-beta.0.tgz",
-      "integrity": "sha512-SEKHecFpgODmrUsAE8pBLu8OMKnAx97Ap0FrH6AGGglJKAVirrrg9BKSPfmHMZCvyPSHzG5TUMxhtNm+Ibg5DQ==",
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2-beta.0.tgz",
+      "integrity": "sha512-IgVkAP/LiNIQD5P6n/9x3bgQOt5pGJarjtSF8r+ialD95QHmo6tcxrwTy/DlA+H1uI6B6h+sbN0c1KXTh1rYcg==",
       "cpu": [
         "x64"
       ],

node/package.json

@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "description": " Serverless, low-latency vector database for AI applications",
   "private": false,
   "main": "dist/index.js",
@@ -89,10 +89,10 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.21.1-beta.0",
-    "@lancedb/vectordb-darwin-arm64": "0.21.1-beta.0",
-    "@lancedb/vectordb-linux-x64-gnu": "0.21.1-beta.0",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.21.1-beta.0",
-    "@lancedb/vectordb-win32-x64-msvc": "0.21.1-beta.0"
+    "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
+    "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
+    "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
+    "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
   }
 }

nodejs/Cargo.toml

@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.21.1-beta.0"
+version = "0.21.2-beta.0"
 license.workspace = true
 description.workspace = true
 repository.workspace = true

@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The LanceDB Authors

-import { Schema } from "apache-arrow";
+import { Bool, Field, Int32, List, Schema, Struct, Utf8 } from "apache-arrow";

 import * as arrow15 from "apache-arrow-15";
 import * as arrow16 from "apache-arrow-16";
@@ -11,10 +11,12 @@ import * as arrow18 from "apache-arrow-18";
 import {
   convertToTable,
   fromBufferToRecordBatch,
+  fromDataToBuffer,
   fromRecordBatchToBuffer,
   fromTableToBuffer,
   makeArrowTable,
   makeEmptyTable,
+  tableFromIPC,
 } from "../lancedb/arrow";
 import {
   EmbeddingFunction,
@@ -375,8 +377,221 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
       expect(table2.schema).toEqual(schema);
     });

+    it("will handle missing columns in schema alignment when using embeddings", async function () {
+      const schema = new Schema(
+        [
+          new Field("domain", new Utf8(), true),
+          new Field("name", new Utf8(), true),
+          new Field("description", new Utf8(), true),
+        ],
+        new Map([["embedding_functions", JSON.stringify([])]]),
+      );
+
+      const data = [
+        { domain: "google.com", name: "Google" },
+        { domain: "facebook.com", name: "Facebook" },
+      ];
+
+      const table = await convertToTable(data, undefined, { schema });
+
+      expect(table.numCols).toBe(3);
+      expect(table.numRows).toBe(2);
+
+      const descriptionColumn = table.getChild("description");
+      expect(descriptionColumn).toBeDefined();
+      expect(descriptionColumn?.nullCount).toBe(2);
+      expect(descriptionColumn?.toArray()).toEqual([null, null]);
+
+      expect(table.getChild("domain")?.toArray()).toEqual([
+        "google.com",
+        "facebook.com",
+      ]);
+      expect(table.getChild("name")?.toArray()).toEqual([
+        "Google",
+        "Facebook",
+      ]);
+    });
+
+    it("will handle completely missing nested struct columns", async function () {
+      const schema = new Schema(
+        [
+          new Field("id", new Utf8(), true),
+          new Field("name", new Utf8(), true),
+          new Field(
+            "metadata",
+            new Struct([
+              new Field("version", new Int32(), true),
+              new Field("author", new Utf8(), true),
+              new Field(
+                "tags",
+                new List(new Field("item", new Utf8(), true)),
+                true,
+              ),
+            ]),
+            true,
+          ),
+        ],
+        new Map([["embedding_functions", JSON.stringify([])]]),
+      );
+
+      const data = [
+        { id: "doc1", name: "Document 1" },
+        { id: "doc2", name: "Document 2" },
+      ];
+
+      const table = await convertToTable(data, undefined, { schema });
+
+      expect(table.numCols).toBe(3);
+      expect(table.numRows).toBe(2);
+
+      const buf = await fromTableToBuffer(table);
+      const retrievedTable = tableFromIPC(buf);
+
+      const rows = [];
+      for (let i = 0; i < retrievedTable.numRows; i++) {
+        rows.push(retrievedTable.get(i));
+      }
+
+      expect(rows[0].metadata.version).toBe(null);
+      expect(rows[0].metadata.author).toBe(null);
+      expect(rows[0].metadata.tags).toBe(null);
+      expect(rows[0].id).toBe("doc1");
+      expect(rows[0].name).toBe("Document 1");
+    });
+
+    it("will handle partially missing nested struct fields", async function () {
+      const schema = new Schema(
+        [
+          new Field("id", new Utf8(), true),
+          new Field(
+            "metadata",
+            new Struct([
+              new Field("version", new Int32(), true),
+              new Field("author", new Utf8(), true),
+              new Field("created_at", new Utf8(), true),
+            ]),
+            true,
+          ),
+        ],
+        new Map([["embedding_functions", JSON.stringify([])]]),
+      );
+
+      const data = [
+        { id: "doc1", metadata: { version: 1, author: "Alice" } },
+        { id: "doc2", metadata: { version: 2 } },
+      ];
+
+      const table = await convertToTable(data, undefined, { schema });
+
+      expect(table.numCols).toBe(2);
+      expect(table.numRows).toBe(2);
+
+      const metadataColumn = table.getChild("metadata");
+      expect(metadataColumn).toBeDefined();
+      expect(metadataColumn?.type.toString()).toBe(
+        "Struct<{version:Int32, author:Utf8, created_at:Utf8}>",
+      );
+    });
+
+    it("will handle multiple levels of nested structures", async function () {
+      const schema = new Schema(
+        [
+          new Field("id", new Utf8(), true),
+          new Field(
+            "config",
+            new Struct([
+              new Field("database", new Utf8(), true),
+              new Field(
+                "connection",
+                new Struct([
+                  new Field("host", new Utf8(), true),
+                  new Field("port", new Int32(), true),
+                  new Field(
+                    "ssl",
+                    new Struct([
+                      new Field("enabled", new Bool(), true),
+                      new Field("cert_path", new Utf8(), true),
+                    ]),
+                    true,
+                  ),
+                ]),
+                true,
+              ),
+            ]),
+            true,
+          ),
+        ],
+        new Map([["embedding_functions", JSON.stringify([])]]),
+      );
+
+      const data = [
+        {
+          id: "config1",
+          config: {
+            database: "postgres",
+            connection: { host: "localhost" },
+          },
+        },
+        {
+          id: "config2",
+          config: { database: "mysql" },
+        },
+        {
+          id: "config3",
+        },
+      ];
+
+      const table = await convertToTable(data, undefined, { schema });
+
+      expect(table.numCols).toBe(2);
+      expect(table.numRows).toBe(3);
+
+      const configColumn = table.getChild("config");
+      expect(configColumn).toBeDefined();
+      expect(configColumn?.type.toString()).toBe(
+        "Struct<{database:Utf8, connection:Struct<{host:Utf8, port:Int32, ssl:Struct<{enabled:Bool, cert_path:Utf8}>}>}>",
+      );
+    });
+
+    it("will handle missing columns in Arrow table input when using embeddings", async function () {
+      const incompleteTable = makeArrowTable([
+        { domain: "google.com", name: "Google" },
+        { domain: "facebook.com", name: "Facebook" },
+      ]);
+
+      const schema = new Schema(
+        [
+          new Field("domain", new Utf8(), true),
+          new Field("name", new Utf8(), true),
+          new Field("description", new Utf8(), true),
+        ],
+        new Map([["embedding_functions", JSON.stringify([])]]),
+      );
+
+      const buf = await fromDataToBuffer(incompleteTable, undefined, schema);
+
+      expect(buf.byteLength).toBeGreaterThan(0);
+
+      const retrievedTable = tableFromIPC(buf);
+      expect(retrievedTable.numCols).toBe(3);
+      expect(retrievedTable.numRows).toBe(2);
+
+      const descriptionColumn = retrievedTable.getChild("description");
+      expect(descriptionColumn).toBeDefined();
+      expect(descriptionColumn?.nullCount).toBe(2);
+      expect(descriptionColumn?.toArray()).toEqual([null, null]);
+
+      expect(retrievedTable.getChild("domain")?.toArray()).toEqual([
+        "google.com",
+        "facebook.com",
+      ]);
+      expect(retrievedTable.getChild("name")?.toArray()).toEqual([
+        "Google",
+        "Facebook",
+      ]);
+    });
+
     it("should correctly retain values in nested struct fields", async function () {
-      // Define test data with nested struct
       const testData = [
         {
           id: "doc1",
@@ -400,10 +615,8 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
         },
       ];

-      // Create Arrow table from the data
       const table = makeArrowTable(testData);

-      // Verify schema has the nested struct fields
       const metadataField = table.schema.fields.find(
         (f) => f.name === "metadata",
       );
@@ -417,23 +630,17 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
         "text",
       ]);

-      // Convert to buffer and back (simulating storage and retrieval)
       const buf = await fromTableToBuffer(table);
       const retrievedTable = tableFromIPC(buf);

-      // Verify the retrieved table has the same structure
       const rows = [];
       for (let i = 0; i < retrievedTable.numRows; i++) {
         rows.push(retrievedTable.get(i));
       }

-      // Check values in the first row
       const firstRow = rows[0];
       expect(firstRow.id).toBe("doc1");
       expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);

-      // Verify metadata values are preserved (this is where the bug is)
-      expect(firstRow.metadata).toBeDefined();
       expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
       expect(firstRow.metadata.startLine).toBe(10);
       expect(firstRow.metadata.endLine).toBe(20);

@@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
     expect(mustNotResults.length).toBe(1);
   });

+  test("full text search ngram", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [
+      { text: "hello world", vector: [0.1, 0.2, 0.3] },
+      { text: "lance database", vector: [0.4, 0.5, 0.6] },
+      { text: "lance is cool", vector: [0.7, 0.8, 0.9] },
+    ];
+    const table = await db.createTable("test", data);
+    await table.createIndex("text", {
+      config: Index.fts({ baseTokenizer: "ngram" }),
+    });
+
+    const results = await table.search("lan").toArray();
+    expect(results.length).toBe(2);
+    const resultSet = new Set(results.map((r) => r.text));
+    expect(resultSet.has("lance database")).toBe(true);
+    expect(resultSet.has("lance is cool")).toBe(true);
+
+    const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
+    expect(results2.length).toBe(2);
+    const resultSet2 = new Set(results2.map((r) => r.text));
+    expect(resultSet2.has("lance database")).toBe(true);
+    expect(resultSet2.has("lance is cool")).toBe(true);
+
+    // the default min_ngram_length is 3, so "la" should not match
+    const results3 = await table.search("la").toArray();
+    expect(results3.length).toBe(0);
+
+    // test setting min_ngram_length and prefix_only
+    await table.createIndex("text", {
+      config: Index.fts({
+        baseTokenizer: "ngram",
+        ngramMinLength: 2,
+        prefixOnly: true,
+      }),
+      replace: true,
+    });
+
+    const results4 = await table.search("lan").toArray();
+    expect(results4.length).toBe(2);
+    const resultSet4 = new Set(results4.map((r) => r.text));
+    expect(resultSet4.has("lance database")).toBe(true);
+    expect(resultSet4.has("lance is cool")).toBe(true);
+
+    const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
+    expect(results5.length).toBe(0);
+
+    const results6 = await table.search("la").toArray();
+    expect(results6.length).toBe(2);
+    const resultSet6 = new Set(results6.map((r) => r.text));
+    expect(resultSet6.has("lance database")).toBe(true);
+    expect(resultSet6.has("lance is cool")).toBe(true);
+  });
+
   test.each([
     [0.4, 0.5, 0.599], // number[]
     Float32Array.of(0.4, 0.5, 0.599), // Float32Array
@@ -1809,4 +1863,43 @@ describe("column name options", () => {
     expect(results[0].query_index).toBe(0);
     expect(results[1].query_index).toBe(1);
   });
+
+  test("index and search multivectors", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [];
+    // generate 512 random multivectors
+    for (let i = 0; i < 256; i++) {
+      data.push({
+        multivector: Array.from({ length: 10 }, () =>
+          Array(2).fill(Math.random()),
+        ),
+      });
+    }
+    const table = await db.createTable("multivectors", data, {
+      schema: new Schema([
+        new Field(
+          "multivector",
+          new List(
+            new Field(
+              "item",
+              new FixedSizeList(2, new Field("item", new Float32())),
+            ),
+          ),
+        ),
+      ]),
+    });
+
+    const results = await table.search(data[0].multivector).limit(10).toArray();
+    expect(results.length).toBe(10);
+
+    await table.createIndex("multivector", {
+      config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
+    });
+
+    const results2 = await table
+      .search(data[0].multivector)
+      .limit(10)
+      .toArray();
+    expect(results2.length).toBe(10);
+  });
 });

@@ -107,6 +107,20 @@ export type IntoVector =
   | number[]
   | Promise<Float32Array | Float64Array | number[]>;

+export type MultiVector = IntoVector[];
+
+export function isMultiVector(value: unknown): value is MultiVector {
+  return Array.isArray(value) && isIntoVector(value[0]);
+}
+
+export function isIntoVector(value: unknown): value is IntoVector {
+  return (
+    value instanceof Float32Array ||
+    value instanceof Float64Array ||
+    (Array.isArray(value) && !Array.isArray(value[0]))
+  );
+}
+
 export function isArrowTable(value: object): value is TableLike {
   if (value instanceof ArrowTable) return true;
   return "schema" in value && "batches" in value;
@@ -839,6 +853,15 @@ async function applyEmbeddingsFromMetadata(
     const vector = makeVector(vectors, destType);
     columns[destColumn] = vector;
   }
+
+  // Add any missing columns from the schema as null vectors
+  for (const field of schema.fields) {
+    if (!(field.name in columns)) {
+      const nullValues = new Array(table.numRows).fill(null);
+      columns[field.name] = makeVector(nullValues, field.type);
+    }
+  }
+
   const newTable = new ArrowTable(columns);
   return alignTable(newTable, schema);
 }
@@ -987,7 +1010,21 @@ export async function convertToTable(
   embeddings?: EmbeddingFunctionConfig,
   makeTableOptions?: Partial<MakeArrowTableOptions>,
 ): Promise<ArrowTable> {
-  const table = makeArrowTable(data, makeTableOptions);
+  let processedData = data;
+
+  // If we have a schema with embedding metadata, we need to preprocess the data
+  // to ensure all nested fields are present
+  if (
+    makeTableOptions?.schema &&
+    makeTableOptions.schema.metadata?.has("embedding_functions")
+  ) {
+    processedData = ensureNestedFieldsExist(
+      data,
+      makeTableOptions.schema as Schema,
+    );
+  }
+
+  const table = makeArrowTable(processedData, makeTableOptions);
   return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
 }

@@ -1080,7 +1117,16 @@ export async function fromDataToBuffer(
     schema = sanitizeSchema(schema);
   }
   if (isArrowTable(data)) {
-    return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
+    const table = sanitizeTable(data);
+    // If we have a schema with embedding functions, we need to ensure all columns exist
+    // before applying embeddings, since applyEmbeddingsFromMetadata expects all columns
+    // to be present in the table
+    if (schema && schema.metadata?.has("embedding_functions")) {
+      const alignedTable = alignTableToSchema(table, schema);
+      return fromTableToBuffer(alignedTable, embeddings, schema);
+    } else {
+      return fromTableToBuffer(table, embeddings, schema);
+    }
   } else {
     const table = await convertToTable(data, embeddings, { schema });
     return fromTableToBuffer(table);
@@ -1149,7 +1195,7 @@ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
     type: new Struct(schema.fields),
     length: batch.numRows,
     nullCount: batch.nullCount,
-    children: alignedChildren,
+    children: alignedChildren as unknown as ArrowData<DataType>[],
   });
   return new RecordBatch(schema, newData);
 }
@@ -1221,6 +1267,79 @@ function validateSchemaEmbeddings(
   return new Schema(fields, schema.metadata);
 }

+/**
+ * Ensures that all nested fields defined in the schema exist in the data,
+ * filling missing fields with null values.
+ */
+export function ensureNestedFieldsExist(
+  data: Array<Record<string, unknown>>,
+  schema: Schema,
+): Array<Record<string, unknown>> {
+  return data.map((row) => {
+    const completeRow: Record<string, unknown> = {};
+
+    for (const field of schema.fields) {
+      if (field.name in row) {
+        if (
+          field.type.constructor.name === "Struct" &&
+          row[field.name] !== null &&
+          row[field.name] !== undefined
+        ) {
+          // Handle nested struct
+          const nestedValue = row[field.name] as Record<string, unknown>;
+          completeRow[field.name] = ensureStructFieldsExist(
+            nestedValue,
+            field.type,
+          );
+        } else {
+          // Non-struct field or null struct value
+          completeRow[field.name] = row[field.name];
+        }
+      } else {
+        // Field is missing from the data - set to null
+        completeRow[field.name] = null;
+      }
+    }
+
+    return completeRow;
+  });
+}
+
+/**
+ * Recursively ensures that all fields in a struct type exist in the data,
+ * filling missing fields with null values.
+ */
+function ensureStructFieldsExist(
+  data: Record<string, unknown>,
+  structType: Struct,
+): Record<string, unknown> {
+  const completeStruct: Record<string, unknown> = {};
+
+  for (const childField of structType.children) {
+    if (childField.name in data) {
+      if (
+        childField.type.constructor.name === "Struct" &&
+        data[childField.name] !== null &&
+        data[childField.name] !== undefined
+      ) {
+        // Recursively handle nested struct
+        completeStruct[childField.name] = ensureStructFieldsExist(
+          data[childField.name] as Record<string, unknown>,
+          childField.type,
+        );
+      } else {
+        // Non-struct field or null struct value
+        completeStruct[childField.name] = data[childField.name];
+      }
+    } else {
+      // Field is missing - set to null
+      completeStruct[childField.name] = null;
+    }
+  }
+
+  return completeStruct;
+}
+
 interface JsonDataType {
   type: string;
   fields?: JsonField[];
@@ -1354,3 +1473,64 @@ function fieldToJson(field: Field): JsonField {
     metadata: field.metadata,
   };
 }
+
+function alignTableToSchema(
+  table: ArrowTable,
+  targetSchema: Schema,
+): ArrowTable {
+  const existingColumns = new Map<string, Vector>();
+
+  // Map existing columns
+  for (const field of table.schema.fields) {
+    existingColumns.set(field.name, table.getChild(field.name)!);
+  }
+
+  // Create vectors for all fields in target schema
+  const alignedColumns: Record<string, Vector> = {};
+
+  for (const field of targetSchema.fields) {
+    if (existingColumns.has(field.name)) {
+      // Column exists, use it
+      alignedColumns[field.name] = existingColumns.get(field.name)!;
+    } else {
+      // Column missing, create null vector
+      alignedColumns[field.name] = createNullVector(field, table.numRows);
+    }
+  }
+
+  // Create new table with aligned schema and columns
+  return new ArrowTable(targetSchema, alignedColumns);
+}
+
+function createNullVector(field: Field, numRows: number): Vector {
+  if (field.type.constructor.name === "Struct") {
+    // For struct types, create a struct with null fields
+    const structType = field.type as Struct;
+    const childVectors = structType.children.map((childField) =>
+      createNullVector(childField, numRows),
+    );
+
+    // Create struct data
+    const structData = makeData({
+      type: structType,
+      length: numRows,
+      nullCount: 0,
+      children: childVectors.map((v) => v.data[0]),
+    });
+
+    return arrowMakeVector(structData);
+  } else {
+    // For other types, create a vector of nulls
+    const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));
+    // All bits are 0, meaning all values are null
+
+    const data = makeData({
+      type: field.type,
+      length: numRows,
+      nullCount: numRows,
+      nullBitmap,
+    });
+
+    return arrowMakeVector(data);
+  }
+}

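To make the intent of the helpers above concrete, here is a small, hedged sketch of the alignment behavior they implement. It is not part of the diff; `schema` is assumed to be an Arrow Schema with fields domain, name, and description:

// ensureNestedFieldsExist (exported above) fills columns that are missing
// from the input rows with explicit nulls, so the rows conform to the schema
// before an Arrow table is built; alignTableToSchema does the equivalent for
// Arrow table input by materializing null vectors.
const rows = [{ domain: "google.com", name: "Google" }];
const filled = ensureNestedFieldsExist(rows, schema);
// filled[0] -> { domain: "google.com", name: "Google", description: null }
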
@@ -100,6 +100,7 @@ export {
   RecordBatchLike,
   DataLike,
   IntoVector,
+  MultiVector,
 } from "./arrow";
 export { IntoSql, packBits } from "./util";

@@ -439,7 +439,7 @@ export interface FtsOptions {
    *
    * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
    */
-  baseTokenizer?: "simple" | "whitespace" | "raw";
+  baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";

   /**
    * language for stemming and stop words
@@ -472,6 +472,21 @@ export interface FtsOptions {
    * whether to remove punctuation
    */
   asciiFolding?: boolean;
+
+  /**
+   * ngram min length
+   */
+  ngramMinLength?: number;
+
+  /**
+   * ngram max length
+   */
+  ngramMaxLength?: number;
+
+  /**
+   * whether to only index the prefix of the token for ngram tokenizer
+   */
+  prefixOnly?: boolean;
 }

 export class Index {
@@ -608,6 +623,9 @@ export class Index {
         options?.stem,
         options?.removeStopWords,
         options?.asciiFolding,
+        options?.ngramMinLength,
+        options?.ngramMaxLength,
+        options?.prefixOnly,
       ),
     );
   }

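For reference, a minimal sketch of how the new ngram tokenizer options are exercised from the TypeScript SDK, mirroring the test added earlier in this change (the connect path, table name, and data are illustrative):

import { connect, Index } from "@lancedb/lancedb";

// Substring-style matching: with the ngram base tokenizer, a query such as
// "lan" can match "lance database" once it meets ngramMinLength.
const db = await connect("/tmp/lancedb-demo"); // illustrative path
const table = await db.createTable("docs", [
  { text: "lance database", vector: [0.4, 0.5, 0.6] },
]);
await table.createIndex("text", {
  config: Index.fts({
    baseTokenizer: "ngram",
    ngramMinLength: 2,
    ngramMaxLength: 3,
    prefixOnly: false, // true would index/match token prefixes only
  }),
});
const hits = await table.search("lan").toArray();
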
@@ -6,9 +6,11 @@ import {
   Data,
   DataType,
   IntoVector,
+  MultiVector,
   Schema,
   dataTypeToJson,
   fromDataToBuffer,
+  isMultiVector,
   tableFromIPC,
 } from "./arrow";

@@ -75,10 +77,10 @@ export interface OptimizeOptions {
    * // Delete all versions older than 1 day
    * const olderThan = new Date();
    * olderThan.setDate(olderThan.getDate() - 1));
-   * tbl.cleanupOlderVersions(olderThan);
+   * tbl.optimize({cleanupOlderThan: olderThan});
    *
    * // Delete all versions except the current version
-   * tbl.cleanupOlderVersions(new Date());
+   * tbl.optimize({cleanupOlderThan: new Date()});
    */
   cleanupOlderThan: Date;
   deleteUnverified: boolean;
@@ -346,7 +348,7 @@ export abstract class Table {
    * if the query is a string and no embedding function is defined, it will be treated as a full text search query
    */
   abstract search(
-    query: string | IntoVector | FullTextQuery,
+    query: string | IntoVector | MultiVector | FullTextQuery,
     queryType?: string,
     ftsColumns?: string | string[],
   ): VectorQuery | Query;
@@ -357,7 +359,7 @@ export abstract class Table {
    * is the same thing as calling `nearestTo` on the builder returned
    * by `query`. @see {@link Query#nearestTo} for more details.
    */
-  abstract vectorSearch(vector: IntoVector): VectorQuery;
+  abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
   /**
    * Add new columns with defined values.
    * @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
@@ -668,7 +670,7 @@ export class LocalTable extends Table {
   }

   search(
-    query: string | IntoVector | FullTextQuery,
+    query: string | IntoVector | MultiVector | FullTextQuery,
     queryType: string = "auto",
     ftsColumns?: string | string[],
   ): VectorQuery | Query {
@@ -715,7 +717,15 @@ export class LocalTable extends Table {
     return this.query().nearestTo(queryPromise);
   }

-  vectorSearch(vector: IntoVector): VectorQuery {
+  vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
+    if (isMultiVector(vector)) {
+      const query = this.query().nearestTo(vector[0]);
+      for (const v of vector.slice(1)) {
+        query.addQueryVector(v);
+      }
+      return query;
+    }
+
     return this.query().nearestTo(vector);
   }

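A short usage sketch of the multi-vector search path added above, following the new test in this change (the connect path, table name, and query vectors are illustrative):

import { connect } from "@lancedb/lancedb";

// vectorSearch now also accepts a list of query vectors: the first one seeds
// the query and the rest are attached via addQueryVector under the hood.
const db = await connect("/tmp/lancedb-demo"); // illustrative path
const tbl = await db.openTable("multivectors");
const queryVectors = [
  [0.1, 0.2],
  [0.3, 0.4],
];
const results = await tbl.vectorSearch(queryVectors).limit(10).toArray();
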
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": [
     "win32"
   ],

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.21.1-beta.0",
+  "version": "0.21.2-beta.0",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",

nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.21.1-beta.0",
|
"version": "0.21.2-beta.0",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.21.1-beta.0",
|
"version": "0.21.2-beta.0",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"private": false,
|
"private": false,
|
||||||
"version": "0.21.1-beta.0",
|
"version": "0.21.2-beta.0",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
|
|||||||
@@ -123,6 +123,9 @@ impl Index {
         stem: Option<bool>,
         remove_stop_words: Option<bool>,
         ascii_folding: Option<bool>,
+        ngram_min_length: Option<u32>,
+        ngram_max_length: Option<u32>,
+        prefix_only: Option<bool>,
     ) -> Self {
         let mut opts = FtsIndexBuilder::default();
         if let Some(with_position) = with_position {
@@ -149,6 +152,15 @@ impl Index {
         if let Some(ascii_folding) = ascii_folding {
             opts = opts.ascii_folding(ascii_folding);
         }
+        if let Some(ngram_min_length) = ngram_min_length {
+            opts = opts.ngram_min_length(ngram_min_length);
+        }
+        if let Some(ngram_max_length) = ngram_max_length {
+            opts = opts.ngram_max_length(ngram_max_length);
+        }
+        if let Some(prefix_only) = prefix_only {
+            opts = opts.ngram_prefix_only(prefix_only);
+        }

         Self {
             inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.24.1-beta.1"
+current_version = "0.24.2-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.24.1-beta.1"
+version = "0.24.2-beta.1"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true

@@ -137,6 +137,9 @@ class FTS:
     stem: bool = True
     remove_stop_words: bool = True
    ascii_folding: bool = True
+    ngram_min_length: int = 3
+    ngram_max_length: int = 3
+    prefix_only: bool = False


 @dataclass

@@ -1374,6 +1374,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         if query_string is not None and not isinstance(query_string, str):
             raise ValueError("Reranking currently only supports string queries")
         self._str_query = query_string if query_string is not None else self._str_query
+        if reranker.score == "all":
+            self.with_row_id(True)
         return self

     def bypass_vector_index(self) -> LanceVectorQueryBuilder:
@@ -1569,6 +1571,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
             The LanceQueryBuilder object.
         """
         self._reranker = reranker
+        if reranker.score == "all":
+            self.with_row_id(True)
         return self


@@ -1845,6 +1849,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):

         self._norm = normalize
         self._reranker = reranker
+        if reranker.score == "all":
+            self.with_row_id(True)

         return self

@@ -89,7 +89,7 @@ class RemoteTable(Table):

     def to_pandas(self):
         """to_pandas() is not yet supported on LanceDB cloud."""
-        return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
+        raise NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")

     def checkout(self, version: Union[int, str]):
         return LOOP.run(self._table.checkout(version))
@@ -158,6 +158,9 @@ class RemoteTable(Table):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
     ):
         config = FTS(
             with_position=with_position,
@@ -168,6 +171,9 @@ class RemoteTable(Table):
             stem=stem,
             remove_stop_words=remove_stop_words,
             ascii_folding=ascii_folding,
+            ngram_min_length=ngram_min_length,
+            ngram_max_length=ngram_max_length,
+            prefix_only=prefix_only,
         )
         LOOP.run(
             self._table.create_index(

@@ -74,9 +74,7 @@ class AnswerdotaiRerankers(Reranker):
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
         elif self.score == "all":
-            raise NotImplementedError(
-                "Answerdotai Reranker does not support score='all' yet"
-            )
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
         combined_results = combined_results.sort_by(
             [("_relevance_score", "descending")]
         )

@@ -232,6 +232,39 @@ class Reranker(ABC):

         return deduped_table

+    def _merge_and_keep_scores(self, vector_results: pa.Table, fts_results: pa.Table):
+        """
+        Merge the results from the vector and FTS search and keep the scores.
+        This op is slower than just keeping relevance score but can be useful
+        for debugging.
+        """
+        # add nulls to fts results for _distance
+        if "_distance" not in fts_results.column_names:
+            fts_results = fts_results.append_column(
+                "_distance",
+                pa.array([None] * len(fts_results), type=pa.float32()),
+            )
+        # add nulls to vector results for _score
+        if "_score" not in vector_results.column_names:
+            vector_results = vector_results.append_column(
+                "_score",
+                pa.array([None] * len(vector_results), type=pa.float32()),
+            )
+
+        # combine them and fill the scores
+        vector_results_dict = {row["_rowid"]: row for row in vector_results.to_pylist()}
+        fts_results_dict = {row["_rowid"]: row for row in fts_results.to_pylist()}
+
+        # merge them into vector_results
+        for key, value in fts_results_dict.items():
+            if key in vector_results_dict:
+                vector_results_dict[key]["_score"] = value["_score"]
+            else:
+                vector_results_dict[key] = value
+
+        combined = pa.Table.from_pylist(list(vector_results_dict.values()))
+        return combined
+
     def _keep_relevance_score(self, combined_results: pa.Table):
         if self.score == "relevance":
             if "_score" in combined_results.column_names:

@@ -92,14 +92,14 @@ class CohereReranker(Reranker):
         vector_results: pa.Table,
         fts_results: pa.Table,
     ):
-        combined_results = self.merge_results(vector_results, fts_results)
+        if self.score == "all":
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
+        else:
+            combined_results = self.merge_results(vector_results, fts_results)
         combined_results = self._rerank(combined_results, query)
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
-        elif self.score == "all":
-            raise NotImplementedError(
-                "return_score='all' not implemented for cohere reranker"
-            )
         return combined_results

     def rerank_vector(self, query: str, vector_results: pa.Table):

@@ -81,15 +81,15 @@ class CrossEncoderReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
|
if self.score == "all":
|
||||||
|
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||||
|
else:
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
# sort the results by _score
|
# sort the results by _score
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
elif self.score == "all":
|
|
||||||
raise NotImplementedError(
|
|
||||||
"return_score='all' not implemented for CrossEncoderReranker"
|
|
||||||
)
|
|
||||||
combined_results = combined_results.sort_by(
|
combined_results = combined_results.sort_by(
|
||||||
[("_relevance_score", "descending")]
|
[("_relevance_score", "descending")]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -97,14 +97,14 @@ class JinaReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
|
if self.score == "all":
|
||||||
|
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||||
|
else:
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
elif self.score == "all":
|
|
||||||
raise NotImplementedError(
|
|
||||||
"return_score='all' not implemented for JinaReranker"
|
|
||||||
)
|
|
||||||
return combined_results
|
return combined_results
|
||||||
|
|
||||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||||
|
|||||||
@@ -88,14 +88,13 @@ class OpenaiReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
|
if self.score == "all":
|
||||||
|
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||||
|
else:
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
elif self.score == "all":
|
|
||||||
raise NotImplementedError(
|
|
||||||
"OpenAI Reranker does not support score='all' yet"
|
|
||||||
)
|
|
||||||
|
|
||||||
combined_results = combined_results.sort_by(
|
combined_results = combined_results.sort_by(
|
||||||
[("_relevance_score", "descending")]
|
[("_relevance_score", "descending")]
|
||||||
|
|||||||
@@ -94,14 +94,14 @@ class VoyageAIReranker(Reranker):
|
|||||||
vector_results: pa.Table,
|
vector_results: pa.Table,
|
||||||
fts_results: pa.Table,
|
fts_results: pa.Table,
|
||||||
):
|
):
|
||||||
|
if self.score == "all":
|
||||||
|
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||||
|
else:
|
||||||
combined_results = self.merge_results(vector_results, fts_results)
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
combined_results = self._rerank(combined_results, query)
|
combined_results = self._rerank(combined_results, query)
|
||||||
if self.score == "relevance":
|
if self.score == "relevance":
|
||||||
combined_results = self._keep_relevance_score(combined_results)
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
elif self.score == "all":
|
|
||||||
raise NotImplementedError(
|
|
||||||
"return_score='all' not implemented for voyageai reranker"
|
|
||||||
)
|
|
||||||
return combined_results
|
return combined_results
|
||||||
|
|
||||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||||
|
|||||||
@@ -838,6 +838,9 @@ class Table(ABC):
|
|||||||
stem: bool = True,
|
stem: bool = True,
|
||||||
remove_stop_words: bool = True,
|
remove_stop_words: bool = True,
|
||||||
ascii_folding: bool = True,
|
ascii_folding: bool = True,
|
||||||
|
ngram_min_length: int = 3,
|
||||||
|
ngram_max_length: int = 3,
|
||||||
|
prefix_only: bool = False,
|
||||||
wait_timeout: Optional[timedelta] = None,
|
wait_timeout: Optional[timedelta] = None,
|
||||||
):
|
):
|
||||||
"""Create a full-text search index on the table.
|
"""Create a full-text search index on the table.
|
||||||
@@ -877,6 +880,7 @@ class Table(ABC):
|
|||||||
- "simple": Splits text by whitespace and punctuation.
|
- "simple": Splits text by whitespace and punctuation.
|
||||||
- "whitespace": Split text by whitespace, but not punctuation.
|
- "whitespace": Split text by whitespace, but not punctuation.
|
||||||
- "raw": No tokenization. The entire text is treated as a single token.
|
- "raw": No tokenization. The entire text is treated as a single token.
|
||||||
|
- "ngram": N-Gram tokenizer.
|
||||||
language : str, default "English"
|
language : str, default "English"
|
||||||
The language to use for tokenization.
|
The language to use for tokenization.
|
||||||
max_token_length : int, default 40
|
max_token_length : int, default 40
|
||||||
@@ -894,6 +898,12 @@ class Table(ABC):
|
|||||||
ascii_folding : bool, default True
|
ascii_folding : bool, default True
|
||||||
Whether to fold ASCII characters. This converts accented characters to
|
Whether to fold ASCII characters. This converts accented characters to
|
||||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||||
|
ngram_min_length: int, default 3
|
||||||
|
The minimum length of an n-gram.
|
||||||
|
ngram_max_length: int, default 3
|
||||||
|
The maximum length of an n-gram.
|
||||||
|
prefix_only: bool, default False
|
||||||
|
Whether to only index the prefix of the token for ngram tokenizer.
|
||||||
wait_timeout: timedelta, optional
|
wait_timeout: timedelta, optional
|
||||||
The timeout to wait if indexing is asynchronous.
|
The timeout to wait if indexing is asynchronous.
|
||||||
"""
|
"""
|
||||||
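The three new parameters only take effect when base_tokenizer="ngram" is selected on the native (use_tantivy=False) indexing path. A minimal sketch of the documented defaults, assuming an existing table with a "text" column (this mirrors the test_fts_ngram test added later in this diff):

    # Default 3-character grams: any 3+ character substring of a token matches.
    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")

    # Shorter grams anchored to token starts: "la" now matches "lance", while a
    # mid-token substring such as "nce" no longer does.
    table.create_fts_index(
        "text",
        use_tantivy=False,
        base_tokenizer="ngram",
        replace=True,
        ngram_min_length=2,
        prefix_only=True,
    )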
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
     ):
         if not use_tantivy:
             if not isinstance(field_names, str):
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
                     "stem": stem,
                     "remove_stop_words": remove_stop_words,
                     "ascii_folding": ascii_folding,
+                    "ngram_min_length": ngram_min_length,
+                    "ngram_max_length": ngram_max_length,
+                    "prefix_only": prefix_only,
                 }
             else:
                 tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
         elif tokenizer_name == "raw":
             return {
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
         elif tokenizer_name == "whitespace":
             return {
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }

         # or it's with language stemming with pattern like "en_stem"
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
                 "stem": True,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }

     def add(
@@ -25,4 +25,4 @@ IndexType = Literal[
 ]

 # Tokenizer literals
-BaseTokenizerType = Literal["simple", "raw", "whitespace"]
+BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
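With these changes every branch of infer_tokenizer_configs carries the three n-gram keys, so downstream config handling can assume they are present. A brief sketch of the two ways the native path can be configured (table and column names are illustrative):

    # Composite-name path: configs are inferred from tokenizer_name, e.g.
    # "en_stem" selects English stemming per the pattern handled above.
    table.create_fts_index("text", use_tantivy=False, tokenizer_name="en_stem")

    # Explicit path: pass the base tokenizer and its options directly.
    table.create_fts_index(
        "text",
        use_tantivy=False,
        base_tokenizer="ngram",
        ngram_min_length=3,
        ngram_max_length=3,
        replace=True,
    )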
@@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):

     res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
     assert len(res) == 2
+
+
+def test_fts_ngram(mem_db: DBConnection):
+    data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
+    table = mem_db.create_table("test", data=data)
+    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
+
+    results = table.search("lan", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    results = (
+        table.search("nce", query_type="fts").limit(10).to_list()
+    )  # spellchecker:disable-line
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    # the default min_ngram_length is 3, so "la" should not match
+    results = table.search("la", query_type="fts").limit(10).to_list()
+    assert len(results) == 0
+
+    # test setting min_ngram_length and prefix_only
+    table.create_fts_index(
+        "text",
+        use_tantivy=False,
+        base_tokenizer="ngram",
+        replace=True,
+        ngram_min_length=2,
+        prefix_only=True,
+    )
+
+    results = table.search("lan", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    results = (
+        table.search("nce", query_type="fts").limit(10).to_list()
+    )  # spellchecker:disable-line
+    assert len(results) == 0
+
+    results = table.search("la", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
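The assertions in test_fts_ngram follow directly from how character n-grams are generated. An illustrative helper, not the actual tokenizer (which lives in the Rust crates), that reproduces the matching behaviour the test expects:

    def char_ngrams(token, min_len=3, max_len=3, prefix_only=False):
        # Enumerate the substrings the ngram tokenizer would index for a token.
        starts = [0] if prefix_only else range(len(token))
        return {
            token[i : i + n]
            for i in starts
            for n in range(min_len, max_len + 1)
            if i + n <= len(token)
        }

    # Defaults (3..3): {"lan", "anc", "nce"}, so "nce" can match but "la" cannot.
    print(char_ngrams("lance"))
    # ngram_min_length=2 with prefix_only=True: {"la", "lan"}, so "la" matches
    # while "nce" no longer does.
    print(char_ngrams("lance", min_len=2, prefix_only=True))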
@@ -272,7 +272,9 @@ async def test_distance_range_with_new_rows_async():
     # append more rows so that execution plan would be mixed with ANN & Flat KNN
     new_data = pa.table(
         {
-            "vector": pa.FixedShapeTensorArray.from_numpy_ndarray(np.random.rand(4, 2)),
+            "vector": pa.FixedShapeTensorArray.from_numpy_ndarray(
+                np.random.rand(4, 2) + 1
+            ),
         }
     )
     await table.add(new_data)
@@ -210,6 +210,25 @@ async def test_retry_error():
     assert cause.status_code == 429


+def test_table_unimplemented_functions():
+    def handler(request):
+        if request.path == "/v1/table/test/create/?mode=create":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            request.wfile.write(b"{}")
+        else:
+            request.send_response(404)
+            request.end_headers()
+
+    with mock_lancedb_connection(handler) as db:
+        table = db.create_table("test", [{"id": 1}])
+        with pytest.raises(NotImplementedError):
+            table.to_arrow()
+        with pytest.raises(NotImplementedError):
+            table.to_pandas()
+
+
 def test_table_add_in_threadpool():
     def handler(request):
         if request.path == "/v1/table/test/insert/":
@@ -499,3 +499,19 @@ def test_empty_result_reranker():
         .rerank(reranker)
         .to_arrow()
     )
+
+
+@pytest.mark.parametrize("use_tantivy", [True, False])
+def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
+    pytest.importorskip("sentence_transformers")
+    reranker = CrossEncoderReranker(return_score="all")
+    table, schema = get_test_table(tmp_path, use_tantivy)
+    query = "single player experience"
+    result = (
+        table.search(query, query_type="hybrid", vector_column_name="vector")
+        .rerank(reranker=reranker)
+        .to_arrow()
+    )
+    assert "_relevance_score" in result.column_names
+    assert "_score" in result.column_names
+    assert "_distance" in result.column_names
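Since _merge_and_keep_scores fills the missing side of each row with nulls, the columns asserted above also tell you which leg of the hybrid search produced a row. A follow-up sketch in pure PyArrow, reusing the result table from the test:

    import pyarrow.compute as pc

    # A null _score marks a vector-only hit; a null _distance marks an FTS-only hit.
    vector_only_hits = result.filter(pc.is_null(result["_score"]))
    fts_only_hits = result.filter(pc.is_null(result["_distance"]))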
@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
                 .max_token_length(params.max_token_length)
                 .remove_stop_words(params.remove_stop_words)
                 .stem(params.stem)
-                .ascii_folding(params.ascii_folding);
+                .ascii_folding(params.ascii_folding)
+                .ngram_min_length(params.ngram_min_length)
+                .ngram_max_length(params.ngram_max_length)
+                .ngram_prefix_only(params.prefix_only);
             Ok(LanceDbIndex::FTS(inner_opts))
         },
         "IvfFlat" => {
@@ -130,6 +133,9 @@ struct FtsParams {
     stem: bool,
     remove_stop_words: bool,
     ascii_folding: bool,
+    ngram_min_length: u32,
+    ngram_max_length: u32,
+    prefix_only: bool,
 }

 #[derive(FromPyObject)]
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.21.1-beta.0"
+version = "0.21.2-beta.0"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.21.1-beta.0"
+version = "0.21.2-beta.0"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -216,6 +216,7 @@ impl Catalog for ListingCatalog {
             client_config: Default::default(),
             read_consistency_interval: None,
             options: Default::default(),
+            session: None,
         };

         // Add the db options to the connect request
@@ -243,6 +244,7 @@ impl Catalog for ListingCatalog {
             client_config: Default::default(),
             read_consistency_interval: None,
             options: Default::default(),
+            session: None,
         };

         // Add the db options to the connect request
@@ -312,6 +314,7 @@ mod tests {
             client_config: Default::default(),
             options: Default::default(),
             read_consistency_interval: None,
+            session: None,
         };

         let catalog = ListingCatalog::connect(&request).await.unwrap();
@@ -573,6 +576,7 @@ mod tests {
             client_config: Default::default(),
             options: Default::default(),
             read_consistency_interval: None,
+            session: None,
         };

         let catalog = ListingCatalog::connect(&request).await.unwrap();
@@ -592,6 +596,7 @@ mod tests {
             client_config: Default::default(),
             options: Default::default(),
             read_consistency_interval: None,
+            session: None,
         };

         let catalog = ListingCatalog::connect(&request).await.unwrap();
@@ -608,6 +613,7 @@ mod tests {
             client_config: Default::default(),
             options: Default::default(),
             read_consistency_interval: None,
+            session: None,
         };

         let result = ListingCatalog::connect(&request).await;
@@ -627,6 +627,12 @@ pub struct ConnectRequest {
     /// consistency only applies to read operations. Write operations are
     /// always consistent.
     pub read_consistency_interval: Option<std::time::Duration>,
+
+    /// Optional session for object stores and caching
+    ///
+    /// If provided, this session will be used instead of creating a default one.
+    /// This allows for custom configuration of object store registries, caching, etc.
+    pub session: Option<Arc<lance::session::Session>>,
 }

 #[derive(Debug)]
@@ -645,6 +651,7 @@ impl ConnectBuilder {
                 client_config: Default::default(),
                 read_consistency_interval: None,
                 options: HashMap::new(),
+                session: None,
             },
             embedding_registry: None,
         }
@@ -802,6 +809,20 @@ impl ConnectBuilder {
         self
     }

+    /// Set a custom session for object stores and caching.
+    ///
+    /// By default, a new session with default configuration will be created.
+    /// This method allows you to provide a custom session with your own
+    /// configuration for object store registries, caching, etc.
+    ///
+    /// # Arguments
+    ///
+    /// * `session` - A custom session to use for this connection
+    pub fn session(mut self, session: Arc<lance::session::Session>) -> Self {
+        self.request.session = Some(session);
+        self
+    }
+
     #[cfg(feature = "remote")]
     fn execute_remote(self) -> Result<Connection> {
         use crate::remote::db::RemoteDatabaseOptions;
@@ -884,6 +905,7 @@ impl CatalogConnectBuilder {
                 client_config: Default::default(),
                 read_consistency_interval: None,
                 options: HashMap::new(),
+                session: None,
             },
         }
     }
@@ -8,7 +8,7 @@ use std::path::Path;
 use std::{collections::HashMap, sync::Arc};

 use lance::dataset::{ReadParams, WriteMode};
-use lance::io::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore};
+use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore};
 use lance_datafusion::utils::StreamingWriteSource;
 use lance_encoding::version::LanceFileVersion;
 use lance_table::io::commit::commit_handler_from_url;
@@ -217,6 +217,9 @@ pub struct ListingDatabase {

     // Options for tables created by this connection
     new_table_config: NewTableConfig,
+
+    // Session for object stores and caching
+    session: Arc<lance::session::Session>,
 }

 impl std::fmt::Display for ListingDatabase {
@@ -262,6 +265,7 @@ impl ListingDatabase {
                     uri,
                     request.read_consistency_interval,
                     options.new_table_config,
+                    request.session.clone(),
                 )
                 .await
             }
@@ -313,13 +317,20 @@ impl ListingDatabase {

         let plain_uri = url.to_string();

-        let registry = Arc::new(ObjectStoreRegistry::default());
+        let session = request
+            .session
+            .clone()
+            .unwrap_or_else(|| Arc::new(lance::session::Session::default()));
         let os_params = ObjectStoreParams {
             storage_options: Some(options.storage_options.clone()),
             ..Default::default()
         };
-        let (object_store, base_path) =
-            ObjectStore::from_uri_and_params(registry, &plain_uri, &os_params).await?;
+        let (object_store, base_path) = ObjectStore::from_uri_and_params(
+            session.store_registry(),
+            &plain_uri,
+            &os_params,
+        )
+        .await?;
         if object_store.is_local() {
             Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
         }
@@ -342,6 +353,7 @@ impl ListingDatabase {
                     read_consistency_interval: request.read_consistency_interval,
                     storage_options: options.storage_options,
                     new_table_config: options.new_table_config,
+                    session,
                 })
             }
             Err(_) => {
@@ -349,6 +361,7 @@ impl ListingDatabase {
                     uri,
                     request.read_consistency_interval,
                     options.new_table_config,
+                    request.session.clone(),
                 )
                 .await
             }
@@ -359,8 +372,15 @@ impl ListingDatabase {
         path: &str,
         read_consistency_interval: Option<std::time::Duration>,
         new_table_config: NewTableConfig,
+        session: Option<Arc<lance::session::Session>>,
     ) -> Result<Self> {
-        let (object_store, base_path) = ObjectStore::from_uri(path).await?;
+        let session = session.unwrap_or_else(|| Arc::new(lance::session::Session::default()));
+        let (object_store, base_path) = ObjectStore::from_uri_and_params(
+            session.store_registry(),
+            path,
+            &ObjectStoreParams::default(),
+        )
+        .await?;
         if object_store.is_local() {
             Self::try_create_dir(path).context(CreateDirSnafu { path })?;
         }
@@ -374,6 +394,7 @@ impl ListingDatabase {
             read_consistency_interval,
             storage_options: HashMap::new(),
             new_table_config,
+            session,
         })
     }

@@ -441,6 +462,128 @@ impl ListingDatabase {
         }
         Ok(())
     }
+
+    /// Inherit storage options from the connection into the target map
+    fn inherit_storage_options(&self, target: &mut HashMap<String, String>) {
+        for (key, value) in self.storage_options.iter() {
+            if !target.contains_key(key) {
+                target.insert(key.clone(), value.clone());
+            }
+        }
+    }
+
+    /// Extract storage option overrides from the request
+    fn extract_storage_overrides(
+        &self,
+        request: &CreateTableRequest,
+    ) -> Result<(Option<LanceFileVersion>, Option<bool>)> {
+        let storage_options = request
+            .write_options
+            .lance_write_params
+            .as_ref()
+            .and_then(|p| p.store_params.as_ref())
+            .and_then(|sp| sp.storage_options.as_ref());

+        let storage_version_override = storage_options
+            .and_then(|opts| opts.get(OPT_NEW_TABLE_STORAGE_VERSION))
+            .map(|s| s.parse::<LanceFileVersion>())
+            .transpose()?;
+
+        let v2_manifest_override = storage_options
+            .and_then(|opts| opts.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS))
+            .map(|s| s.parse::<bool>())
+            .transpose()
+            .map_err(|_| Error::InvalidInput {
+                message: "enable_v2_manifest_paths must be a boolean".to_string(),
+            })?;
+
+        Ok((storage_version_override, v2_manifest_override))
+    }
+
+    /// Prepare write parameters for table creation
+    fn prepare_write_params(
+        &self,
+        request: &CreateTableRequest,
+        storage_version_override: Option<LanceFileVersion>,
+        v2_manifest_override: Option<bool>,
+    ) -> lance::dataset::WriteParams {
+        let mut write_params = request
+            .write_options
+            .lance_write_params
+            .clone()
+            .unwrap_or_default();
+
+        // Only modify the storage options if we actually have something to
+        // inherit. There is a difference between storage_options=None and
+        // storage_options=Some({}). Using storage_options=None will cause the
+        // connection's session store registry to be used. Supplying Some({})
+        // will cause a new connection to be created, and that connection will
+        // be dropped from the cache when python GCs the table object, which
+        // confounds reuse across tables.
+        if !self.storage_options.is_empty() {
+            let storage_options = write_params
+                .store_params
+                .get_or_insert_with(Default::default)
+                .storage_options
+                .get_or_insert_with(Default::default);
+            self.inherit_storage_options(storage_options);
+        }
+
+        write_params.data_storage_version = self
+            .new_table_config
+            .data_storage_version
+            .or(storage_version_override);
+
+        if let Some(enable_v2_manifest_paths) = self
+            .new_table_config
+            .enable_v2_manifest_paths
+            .or(v2_manifest_override)
+        {
+            write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
+        }
+
+        if matches!(&request.mode, CreateTableMode::Overwrite) {
+            write_params.mode = WriteMode::Overwrite;
+        }
+
+        write_params.session = Some(self.session.clone());
+
+        write_params
+    }
+
+    /// Handle the case where table already exists based on the create mode
+    async fn handle_table_exists(
+        &self,
+        table_name: &str,
+        mode: CreateTableMode,
+        data_schema: &arrow_schema::Schema,
+    ) -> Result<Arc<dyn BaseTable>> {
+        match mode {
+            CreateTableMode::Create => Err(Error::TableAlreadyExists {
+                name: table_name.to_string(),
+            }),
+            CreateTableMode::ExistOk(callback) => {
+                let req = OpenTableRequest {
+                    name: table_name.to_string(),
+                    index_cache_size: None,
+                    lance_read_params: None,
+                };
+                let req = (callback)(req);
+                let table = self.open_table(req).await?;
+
+                let table_schema = table.schema().await?;
+
+                if table_schema.as_ref() != data_schema {
+                    return Err(Error::Schema {
+                        message: "Provided schema does not match existing table schema".to_string(),
+                    });
+                }
+
+                Ok(table)
+            }
+            CreateTableMode::Overwrite => unreachable!(),
+        }
+    }
 }

 #[async_trait::async_trait]
@@ -475,50 +618,14 @@ impl Database for ListingDatabase {
         Ok(f)
     }

-    async fn create_table(&self, mut request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
+    async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
         let table_uri = self.table_uri(&request.name)?;
-        // Inherit storage options from the connection
-        let storage_options = request
-            .write_options
-            .lance_write_params
-            .get_or_insert_with(Default::default)
-            .store_params
-            .get_or_insert_with(Default::default)
-            .storage_options
-            .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
-        }
-
-        let storage_options = storage_options.clone();
-
-        let mut write_params = request.write_options.lance_write_params.unwrap_or_default();
-        if let Some(storage_version) = &self.new_table_config.data_storage_version {
-            write_params.data_storage_version = Some(*storage_version);
-        } else {
-            // Allow the user to override the storage version via storage options (backwards compatibility)
-            if let Some(data_storage_version) = storage_options.get(OPT_NEW_TABLE_STORAGE_VERSION) {
-                write_params.data_storage_version = Some(data_storage_version.parse()?);
-            }
-        }
-        if let Some(enable_v2_manifest_paths) = self.new_table_config.enable_v2_manifest_paths {
-            write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
-        } else {
-            // Allow the user to override the storage version via storage options (backwards compatibility)
-            if let Some(enable_v2_manifest_paths) = storage_options
-                .get(OPT_NEW_TABLE_V2_MANIFEST_PATHS)
-                .map(|s| s.parse::<bool>().unwrap())
-            {
-                write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
-            }
-        }
-
-        if matches!(&request.mode, CreateTableMode::Overwrite) {
-            write_params.mode = WriteMode::Overwrite;
-        }
+        let (storage_version_override, v2_manifest_override) =
+            self.extract_storage_overrides(&request)?;
+
+        let write_params =
+            self.prepare_write_params(&request, storage_version_override, v2_manifest_override);

         let data_schema = request.data.arrow_schema();

@@ -533,30 +640,10 @@ impl Database for ListingDatabase {
         .await
         {
             Ok(table) => Ok(Arc::new(table)),
-            Err(Error::TableAlreadyExists { name }) => match request.mode {
-                CreateTableMode::Create => Err(Error::TableAlreadyExists { name }),
-                CreateTableMode::ExistOk(callback) => {
-                    let req = OpenTableRequest {
-                        name: request.name.clone(),
-                        index_cache_size: None,
-                        lance_read_params: None,
-                    };
-                    let req = (callback)(req);
-                    let table = self.open_table(req).await?;
-
-                    let table_schema = table.schema().await?;
-
-                    if table_schema != data_schema {
-                        return Err(Error::Schema {
-                            message: "Provided schema does not match existing table schema"
-                                .to_string(),
-                        });
-                    }
-
-                    Ok(table)
-                }
-                CreateTableMode::Overwrite => unreachable!(),
-            },
+            Err(Error::TableAlreadyExists { .. }) => {
+                self.handle_table_exists(&request.name, request.mode, &data_schema)
+                    .await
+            }
             Err(err) => Err(err),
         }
     }
@@ -564,7 +651,14 @@ impl Database for ListingDatabase {
     async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
         let table_uri = self.table_uri(&request.name)?;

-        // Inherit storage options from the connection
+        // Only modify the storage options if we actually have something to
+        // inherit. There is a difference between storage_options=None and
+        // storage_options=Some({}). Using storage_options=None will cause the
+        // connection's session store registry to be used. Supplying Some({})
+        // will cause a new connection to be created, and that connection will
+        // be dropped from the cache when python GCs the table object, which
+        // confounds reuse across tables.
+        if !self.storage_options.is_empty() {
             let storage_options = request
                 .lance_read_params
                 .get_or_insert_with(Default::default)
@@ -572,10 +666,7 @@ impl Database for ListingDatabase {
                 .get_or_insert_with(Default::default)
                 .storage_options
                 .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
-        }
+            self.inherit_storage_options(storage_options);
         }

         // Some ReadParams are exposed in the OpenTableBuilder, but we also
@@ -584,13 +675,14 @@ impl Database for ListingDatabase {
         // If we have a user provided ReadParams use that
         // If we don't then start with the default ReadParams and customize it with
         // the options from the OpenTableBuilder
-        let read_params = request.lance_read_params.unwrap_or_else(|| {
+        let mut read_params = request.lance_read_params.unwrap_or_else(|| {
            let mut default_params = ReadParams::default();
            if let Some(index_cache_size) = request.index_cache_size {
                default_params.index_cache_size = index_cache_size as usize;
            }
            default_params
        });
+        read_params.session(self.session.clone());

         let native_table = Arc::new(
             NativeTable::open_with_params(
@@ -281,6 +281,46 @@ async fn test_encryption() -> Result<()> {
     Ok(())
 }

+#[tokio::test]
+async fn test_table_storage_options_override() -> Result<()> {
+    // Test that table-level storage options override connection-level options
+    let bucket = S3Bucket::new("test-override").await;
+    let key1 = KMSKey::new().await;
+    let key2 = KMSKey::new().await;
+
+    let uri = format!("s3://{}", bucket.0);
+
+    // Create connection with key1 encryption
+    let db = lancedb::connect(&uri)
+        .storage_options(CONFIG.iter().cloned())
+        .storage_option("aws_server_side_encryption", "aws:kms")
+        .storage_option("aws_sse_kms_key_id", &key1.0)
+        .execute()
+        .await?;
+
+    // Create table overriding with key2 encryption
+    let data = test_data();
+    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
+    let _table = db
+        .create_table("test_override", data)
+        .storage_option("aws_sse_kms_key_id", &key2.0)
+        .execute()
+        .await?;
+
+    // Verify objects are encrypted with key2, not key1
+    validate_objects_encrypted(&bucket.0, "test_override", &key2.0).await;
+
+    // Also test that a table created without override uses connection settings
+    let data = test_data();
+    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
+    let _table2 = db.create_table("test_inherit", data).execute().await?;
+
+    // Verify this table uses key1 from connection
+    validate_objects_encrypted(&bucket.0, "test_inherit", &key1.0).await;
+
+    Ok(())
+}
+
 struct DynamoDBCommitTable(String);

 impl DynamoDBCommitTable {