Mirror of https://github.com/lancedb/lancedb.git, synced 2025-12-23 13:29:57 +00:00.
Compare commits: `python-v0.` ... `python-v0.` (456 commits; both tag names are truncated in the source).
Commit list: 456 commits, from `05f484b716` (newest listed) down to `a1377afcaa` (oldest listed); only the abbreviated SHA1 hashes are preserved — the author, date, and message columns are empty.
```diff
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.17
+current_version = 0.4.15
 commit = True
 message = Bump version: {current_version} → {new_version}
 tag = True
```
```diff
@@ -14,10 +14,6 @@ inputs:
     # Note: this does *not* mean the host is arm64, since we might be cross-compiling.
     required: false
     default: "false"
-  manylinux:
-    description: "The manylinux version to build for"
-    required: false
-    default: "2_17"
runs:
  using: "composite"
  steps:
@@ -32,7 +28,7 @@ runs:
         command: build
         working-directory: python
         target: x86_64-unknown-linux-gnu
-        manylinux: ${{ inputs.manylinux }}
+        manylinux: "2_17"
         args: ${{ inputs.args }}
         before-script-linux: |
           set -e
@@ -47,7 +43,7 @@ runs:
         command: build
         working-directory: python
         target: aarch64-unknown-linux-gnu
-        manylinux: ${{ inputs.manylinux }}
+        manylinux: "2_24"
         args: ${{ inputs.args }}
         before-script-linux: |
           set -e
```
**.github/workflows/node.yml** (vendored; 1 changed line)

```diff
@@ -107,7 +107,6 @@ jobs:
       AWS_ENDPOINT: http://localhost:4566
       # this one is for dynamodb
       DYNAMODB_ENDPOINT: http://localhost:4566
-      ALLOW_HTTP: true
     steps:
       - uses: actions/checkout@v4
         with:
```
**.github/workflows/nodejs.yml** (vendored; 9 changed lines)

```diff
@@ -28,10 +28,6 @@ jobs:
       run:
         shell: bash
         working-directory: nodejs
-    env:
-      # Need up-to-date compilers for kernels
-      CC: gcc-12
-      CXX: g++-12
     steps:
       - uses: actions/checkout@v4
         with:
@@ -85,12 +81,7 @@ jobs:
         run: |
           npm ci
           npm run build
-      - name: Setup localstack
-        working-directory: .
-        run: docker compose up --detach --wait
       - name: Test
-        env:
-          S3_TEST: "1"
         run: npm run test
   macos:
     timeout-minutes: 30
```
**.github/workflows/pypi-publish.yml** (vendored; 23 changed lines)

```diff
@@ -6,23 +6,13 @@ on:
 
 jobs:
   linux:
-    name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
     timeout-minutes: 60
     strategy:
       matrix:
         python-minor-version: ["8"]
-        config:
-          - platform: x86_64
-            manylinux: "2_17"
-            extra_args: ""
-          - platform: x86_64
-            manylinux: "2_28"
-            extra_args: "--features fp16kernels"
-          - platform: aarch64
-            manylinux: "2_24"
-            extra_args: ""
-        # We don't build fp16 kernels for aarch64, because it uses
-        # cross compilation image, which doesn't have a new enough compiler.
+        platform:
+          - x86_64
+          - aarch64
     runs-on: "ubuntu-22.04"
     steps:
       - uses: actions/checkout@v4
@@ -36,9 +26,8 @@ jobs:
       - uses: ./.github/workflows/build_linux_wheel
         with:
           python-minor-version: ${{ matrix.python-minor-version }}
-          args: "--release --strip ${{ matrix.config.extra_args }}"
-          arm-build: ${{ matrix.config.platform == 'aarch64' }}
-          manylinux: ${{ matrix.config.manylinux }}
+          args: "--release --strip"
+          arm-build: ${{ matrix.platform == 'aarch64' }}
      - uses: ./.github/workflows/upload_wheel
        with:
          token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
@@ -69,7 +58,7 @@ jobs:
       - uses: ./.github/workflows/build_mac_wheel
         with:
           python-minor-version: ${{ matrix.python-minor-version }}
-          args: "--release --strip --target ${{ matrix.config.target }} --features fp16kernels"
+          args: "--release --strip --target ${{ matrix.config.target }}"
       - uses: ./.github/workflows/upload_wheel
         with:
           python-minor-version: ${{ matrix.python-minor-version }}
```
**.github/workflows/python.yml** (vendored; 4 changed lines)

```diff
@@ -99,8 +99,6 @@ jobs:
           workspaces: python
       - uses: ./.github/workflows/build_linux_wheel
       - uses: ./.github/workflows/run_tests
-        with:
-          integration: true
       # Make sure wheels are not included in the Rust cache
       - name: Delete wheels
         run: rm -rf target/wheels
@@ -192,4 +190,4 @@ jobs:
           pip install -e .[tests]
           pip install tantivy
       - name: Run tests
-        run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/tests
+        run: pytest -m "not slow" -x -v --durations=30 python/tests
```
**.github/workflows/run_tests/action.yml** (vendored; 16 changed lines)

```diff
@@ -5,10 +5,6 @@ inputs:
   python-minor-version:
     required: true
     description: "8 9 10 11 12"
-  integration:
-    required: false
-    description: "Run integration tests"
-    default: "false"
 runs:
   using: "composite"
   steps:
@@ -16,16 +12,6 @@ runs:
       shell: bash
       run: |
         pip3 install $(ls target/wheels/lancedb-*.whl)[tests,dev]
-    - name: Setup localstack for integration tests
-      if: ${{ inputs.integration == 'true' }}
+    - name: pytest
       shell: bash
-      working-directory: .
-      run: docker compose up --detach --wait
-    - name: pytest (with integration)
-      shell: bash
-      if: ${{ inputs.integration == 'true' }}
       run: pytest -m "not slow" -x -v --durations=30 python/python/tests
-    - name: pytest (no integration tests)
-      shell: bash
-      if: ${{ inputs.integration != 'true' }}
-      run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/python/tests
```
**.github/workflows/rust.yml** (vendored; 14 changed lines)

```diff
@@ -31,10 +31,6 @@ jobs:
       run:
         shell: bash
         working-directory: rust
-    env:
-      # Need up-to-date compilers for kernels
-      CC: gcc-12
-      CXX: g++-12
     steps:
       - uses: actions/checkout@v4
         with:
@@ -58,10 +54,6 @@ jobs:
       run:
         shell: bash
         working-directory: rust
-    env:
-      # Need up-to-date compilers for kernels
-      CC: gcc-12
-      CXX: g++-12
     steps:
       - uses: actions/checkout@v4
         with:
@@ -76,9 +68,6 @@ jobs:
           sudo apt install -y protobuf-compiler libssl-dev
       - name: Build
         run: cargo build --all-features
-      - name: Start S3 integration test environment
-        working-directory: .
-        run: docker compose up --detach --wait
       - name: Run tests
         run: cargo test --all-features
       - name: Run examples
@@ -108,8 +97,7 @@ jobs:
       - name: Build
         run: cargo build --all-features
       - name: Run tests
-        # Run with everything except the integration tests.
-        run: cargo test --features remote,fp16kernels
+        run: cargo test --all-features
   windows:
     runs-on: windows-2022
     steps:
```
```diff
@@ -14,10 +14,10 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]
 
 [workspace.dependencies]
-lance = { "version" = "=0.10.10", "features" = ["dynamodb"] }
-lance-index = { "version" = "=0.10.10" }
-lance-linalg = { "version" = "=0.10.10" }
-lance-testing = { "version" = "=0.10.10" }
+lance = { "version" = "=0.10.6", "features" = ["dynamodb"] }
+lance-index = { "version" = "=0.10.6" }
+lance-linalg = { "version" = "=0.10.6" }
+lance-testing = { "version" = "=0.10.6" }
 # Note that this one does not include pyarrow
 arrow = { version = "50.0", optional = false }
 arrow-array = "50.0"
```
```diff
@@ -1,18 +1,18 @@
 version: "3.9"
 services:
   localstack:
-    image: localstack/localstack:3.3
+    image: localstack/localstack:0.14
     ports:
       - 4566:4566
     environment:
-      - SERVICES=s3,dynamodb,kms
+      - SERVICES=s3,dynamodb
       - DEBUG=1
       - LS_LOG=trace
       - DOCKER_HOST=unix:///var/run/docker.sock
       - AWS_ACCESS_KEY_ID=ACCESSKEY
       - AWS_SECRET_ACCESS_KEY=SECRETKEY
     healthcheck:
-      test: [ "CMD", "curl", "-s", "http://localhost:4566/_localstack/health" ]
+      test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ]
       interval: 5s
       retries: 3
       start_period: 10s
```
```diff
@@ -143,6 +143,7 @@ nav:
       - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
     - 🦀 Rust:
       - Overview: examples/examples_rust.md
+  - 🔧 CLI & Config: cli_config.md
   - 💭 FAQs: faq.md
   - ⚙️ API reference:
     - 🐍 Python: python/python.md
@@ -225,10 +226,3 @@ extra:
   analytics:
     provider: google
     property: G-B7NFM40W74
-  social:
-    - icon: fontawesome/brands/github
-      link: https://github.com/lancedb/lancedb
-    - icon: fontawesome/brands/x-twitter
-      link: https://twitter.com/lancedb
-    - icon: fontawesome/brands/linkedin
-      link: https://www.linkedin.com/company/lancedb
```
**docs/src/cli_config.md** (new file; 51 lines)

```diff
@@ -0,0 +1,51 @@
+
+# CLI & Config
+
+## LanceDB CLI
+Once lanceDB is installed, you can access the CLI using `lancedb` command on the console.
+
+```
+lancedb
+```
+
+This lists out all the various command-line options available. You can get the usage or help for a particular command.
+
+```
+lancedb {command} --help
+```
+
+## LanceDB config
+LanceDB uses a global config file to store certain settings. These settings are configurable using the lanceDB cli.
+To view your config settings, you can use:
+
+```
+lancedb config
+```
+
+These config parameters can be tuned using the cli.
+
+```
+lancedb {config_name} --{argument}
+```
+
+## LanceDB Opt-in Diagnostics
+When enabled, LanceDB will send anonymous events to help us improve LanceDB. These diagnostics are used only for error reporting and no data is collected. Error & stats allow us to automate certain aspects of bug reporting, prioritization of fixes and feature requests.
+These diagnostics are opt-in and can be enabled or disabled using the `lancedb diagnostics` command. These are enabled by default.
+
+### Get usage help
+
+```
+lancedb diagnostics --help
+```
+
+### Disable diagnostics
+
+```
+lancedb diagnostics --disabled
+```
+
+### Enable diagnostics
+
+```
+lancedb diagnostics --enabled
+```
```
```diff
@@ -177,32 +177,6 @@ Allows you to set parameters when registering a `sentence-transformers` object.
 Visit sentence-transformers [HuggingFace HUB](https://huggingface.co/sentence-transformers) page for more information on the available models.
 
 
-### Huggingface embedding models
-We offer support for all huggingface models (which can be loaded via [transformers](https://huggingface.co/docs/transformers/en/index) library). The default model is `colbert-ir/colbertv2.0` which also has its own special callout - `registry.get("colbert")`
-
-Example usage -
-```python
-import lancedb
-import pandas as pd
-
-from lancedb.embeddings import get_registry
-from lancedb.pydantic import LanceModel, Vector
-
-model = get_registry().get("huggingface").create(name='facebook/bart-base')
-
-class TextModel(LanceModel):
-    text: str = model.SourceField()
-    vector: Vector(model.ndims()) = model.VectorField()
-
-df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]})
-table = db.create_table("greets", schema=Words)
-table.add()
-query = "old greeting"
-actual = table.search(query).limit(1).to_pydantic(Words)[0]
-print(actual.text)
-```
-
-
 ### OpenAI embeddings
 LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances:
 
```
```diff
@@ -55,139 +55,18 @@ LanceDB OSS supports object stores such as AWS S3 (and compatible stores), Azure
     const db = await lancedb.connect("az://bucket/path");
     ```
 
-In most cases, when running in the respective cloud and permissions are set up correctly, no additional configuration is required. When running outside of the respective cloud, authentication credentials must be provided. Credentials and other configuration options can be set in two ways: first, by setting environment variables. And second, by passing a `storage_options` object to the `connect` function. For example, to increase the request timeout to 60 seconds, you can set the `TIMEOUT` environment variable to `60s`:
+In most cases, when running in the respective cloud and permissions are set up correctly, no additional configuration is required. When running outside of the respective cloud, authentication credentials must be provided using environment variables. In general, these environment variables are the same as those used by the respective cloud SDKs. The sections below describe the environment variables that can be used to configure each object store.
 
-```bash
-export TIMEOUT=60s
-```
-
-!!! note "`storage_options` availability"
-
-    The `storage_options` parameter is only available in Python *async* API and JavaScript API.
-    It is not yet supported in the Python synchronous API.
-
-If you only want this to apply to one particular connection, you can pass the `storage_options` argument when opening the connection:
-
-=== "Python"
-
-    ```python
-    import lancedb
-    db = await lancedb.connect_async(
-        "s3://bucket/path",
-        storage_options={"timeout": "60s"}
-    )
-    ```
-
-=== "JavaScript"
-
-    ```javascript
-    const lancedb = require("lancedb");
-    const db = await lancedb.connect("s3://bucket/path",
-        {storageOptions: {timeout: "60s"}});
-    ```
-
-Getting even more specific, you can set the `timeout` for only a particular table:
-
-=== "Python"
-
-    <!-- skip-test -->
-    ```python
-    import lancedb
-    db = await lancedb.connect_async("s3://bucket/path")
-    table = await db.create_table(
-        "table",
-        [{"a": 1, "b": 2}],
-        storage_options={"timeout": "60s"}
-    )
-    ```
-
-=== "JavaScript"
-
-    <!-- skip-test -->
-    ```javascript
-    const lancedb = require("lancedb");
-    const db = await lancedb.connect("s3://bucket/path");
-    const table = db.createTable(
-        "table",
-        [{ a: 1, b: 2}],
-        {storageOptions: {timeout: "60s"}}
-    );
-    ```
-
-!!! info "Storage option casing"
-
-    The storage option keys are case-insensitive. So `connect_timeout` and `CONNECT_TIMEOUT` are the same setting. Usually lowercase is used in the `storage_options` argument and uppercase is used for environment variables. In the `lancedb` Node package, the keys can also be provided in `camelCase` capitalization. For example, `connectTimeout` is equivalent to `connect_timeout`.
-
-### General configuration
-
-There are several options that can be set for all object stores, mostly related to network client configuration.
-
-<!-- from here: https://docs.rs/object_store/latest/object_store/enum.ClientConfigKey.html -->
-
-| Key | Description |
-|----------------------------|--------------------------------------------------------------------------------------------------|
-| `allow_http` | Allow non-TLS, i.e. non-HTTPS connections. Default: `False`. |
-| `allow_invalid_certificates`| Skip certificate validation on HTTPS connections. Default: `False`. |
-| `connect_timeout` | Timeout for only the connect phase of a Client. Default: `5s`. |
-| `timeout` | Timeout for the entire request, from connection until the response body has finished. Default: `30s`. |
-| `user_agent` | User agent string to use in requests. |
-| `proxy_url` | URL of a proxy server to use for requests. Default: `None`. |
-| `proxy_ca_certificate` | PEM-formatted CA certificate for proxy connections. |
-| `proxy_excludes` | List of hosts that bypass the proxy. This is a comma-separated list of domains and IP masks. Any subdomain of the provided domain will be bypassed. For example, `example.com, 192.168.1.0/24` would bypass `https://api.example.com`, `https://www.example.com`, and any IP in the range `192.168.1.0/24`. |
-
+LanceDB OSS uses the [object-store](https://docs.rs/object_store/latest/object_store/) Rust crate for object store access. There are general environment variables that can be used to configure the object store, such as the request timeout and proxy configuration. See the [object_store ClientConfigKey](https://docs.rs/object_store/latest/object_store/enum.ClientConfigKey.html) doc for available configuration options. The environment variables that can be set are the snake-cased versions of these variable names. For example, to set `ProxyUrl` use the environment variable `PROXY_URL`. (Don't let the Rust docs intimidate you! We link to them so you can see an up-to-date list of the available options.)
 
 ### AWS S3
 
-To configure credentials for AWS S3, you can use the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` keys. Region can also be set, but it is not mandatory when using AWS.
-These can be set as environment variables or passed in the `storage_options` parameter:
-
-=== "Python"
-
-    ```python
-    import lancedb
-    db = await lancedb.connect_async(
-        "s3://bucket/path",
-        storage_options={
-            "aws_access_key_id": "my-access-key",
-            "aws_secret_access_key": "my-secret-key",
-            "aws_session_token": "my-session-token",
-        }
-    )
-    ```
-
-=== "JavaScript"
-
-    ```javascript
-    const lancedb = require("lancedb");
-    const db = await lancedb.connect(
-        "s3://bucket/path",
-        {
-            storageOptions: {
-                awsAccessKeyId: "my-access-key",
-                awsSecretAccessKey: "my-secret-key",
-                awsSessionToken: "my-session-token",
-            }
-        }
-    );
-    ```
-
+To configure credentials for AWS S3, you can use the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables.
 Alternatively, if you are using AWS SSO, you can use the `AWS_PROFILE` and `AWS_DEFAULT_REGION` environment variables.
 
-The following keys can be used as both environment variables or keys in the `storage_options` parameter:
-
-| Key | Description |
-|------------------------------------|------------------------------------------------------------------------------------------------------|
-| `aws_region` / `region` | The AWS region the bucket is in. This can be automatically detected when using AWS S3, but must be specified for S3-compatible stores. |
-| `aws_access_key_id` / `access_key_id` | The AWS access key ID to use. |
-| `aws_secret_access_key` / `secret_access_key` | The AWS secret access key to use. |
-| `aws_session_token` / `session_token` | The AWS session token to use. |
-| `aws_endpoint` / `endpoint` | The endpoint to use for S3-compatible stores. |
-| `aws_virtual_hosted_style_request` / `virtual_hosted_style_request` | Whether to use virtual hosted-style requests, where the bucket name is part of the endpoint. Meant to be used with `aws_endpoint`. Default: `False`. |
-| `aws_s3_express` / `s3_express` | Whether to use S3 Express One Zone endpoints. Default: `False`. See more details below. |
-| `aws_server_side_encryption` | The server-side encryption algorithm to use. Must be one of `"AES256"`, `"aws:kms"`, or `"aws:kms:dsse"`. Default: `None`. |
-| `aws_sse_kms_key_id` | The KMS key ID to use for server-side encryption. If set, `aws_server_side_encryption` must be `"aws:kms"` or `"aws:kms:dsse"`. |
-| `aws_sse_bucket_key_enabled` | Whether to use bucket keys for server-side encryption. |
-
+You can see a full list of environment variables [here](https://docs.rs/object_store/latest/object_store/aws/struct.AmazonS3Builder.html#method.from_env).
 
 !!! tip "Automatic cleanup for failed writes"
@@ -267,174 +146,22 @@ For **read-only access**, LanceDB will need a policy such as:
 
 #### S3-compatible stores
 
-LanceDB can also connect to S3-compatible stores, such as MinIO. To do so, you must specify both region and endpoint:
-
-=== "Python"
-
-    ```python
-    import lancedb
-    db = await lancedb.connect_async(
-        "s3://bucket/path",
-        storage_options={
-            "region": "us-east-1",
-            "endpoint": "http://minio:9000",
-        }
-    )
-    ```
-
-=== "JavaScript"
-
-    ```javascript
-    const lancedb = require("lancedb");
-    const db = await lancedb.connect(
-        "s3://bucket/path",
-        {
-            storageOptions: {
-                region: "us-east-1",
-                endpoint: "http://minio:9000",
-            }
-        }
-    );
-    ```
-
-This can also be done with the ``AWS_ENDPOINT`` and ``AWS_DEFAULT_REGION`` environment variables.
-
-#### S3 Express
-
-LanceDB supports [S3 Express One Zone](https://aws.amazon.com/s3/storage-classes/express-one-zone/) endpoints, but requires additional configuration. Also, S3 Express endpoints only support connecting from an EC2 instance within the same region.
-
-To configure LanceDB to use an S3 Express endpoint, you must set the storage option `s3_express`. The bucket name in your table URI should **include the suffix**.
-
-=== "Python"
-
-    ```python
-    import lancedb
-    db = await lancedb.connect_async(
-        "s3://my-bucket--use1-az4--x-s3/path",
-        storage_options={
-            "region": "us-east-1",
-            "s3_express": "true",
-        }
-    )
-    ```
-
-=== "JavaScript"
-
-    ```javascript
-    const lancedb = require("lancedb");
-    const db = await lancedb.connect(
-        "s3://my-bucket--use1-az4--x-s3/path",
-        {
-            storageOptions: {
-                region: "us-east-1",
-                s3Express: "true",
-            }
-        }
-    );
-    ```
-
+LanceDB can also connect to S3-compatible stores, such as MinIO. To do so, you must specify two environment variables: `AWS_ENDPOINT` and `AWS_DEFAULT_REGION`. `AWS_ENDPOINT` should be the URL of the S3-compatible store, and `AWS_DEFAULT_REGION` should be the region to use.
+
+<!-- TODO: we should also document the use of S3 Express once we fully support it -->
 
 ### Google Cloud Storage
 
-GCS credentials are configured by setting the `GOOGLE_SERVICE_ACCOUNT` environment variable to the path of a JSON file containing the service account credentials. Alternatively, you can pass the path to the JSON file in the `storage_options`:
-
-=== "Python"
-
-    <!-- skip-test -->
-    ```python
-    import lancedb
-    db = await lancedb.connect_async(
-        "gs://my-bucket/my-database",
-        storage_options={
-            "service_account": "path/to/service-account.json",
-        }
-    )
-    ```
-
-=== "JavaScript"
-
-    ```javascript
-    const lancedb = require("lancedb");
-    const db = await lancedb.connect(
-        "gs://my-bucket/my-database",
-        {
-            storageOptions: {
-                serviceAccount: "path/to/service-account.json",
-            }
-        }
-    );
-    ```
-
+GCS credentials are configured by setting the `GOOGLE_SERVICE_ACCOUNT` environment variable to the path of a JSON file containing the service account credentials. There are several aliases for this environment variable, documented [here](https://docs.rs/object_store/latest/object_store/gcp/struct.GoogleCloudStorageBuilder.html#method.from_env).
 
 !!! info "HTTP/2 support"
 
     By default, GCS uses HTTP/1 for communication, as opposed to HTTP/2. This improves maximum throughput significantly. However, if you wish to use HTTP/2 for some reason, you can set the environment variable `HTTP1_ONLY` to `false`.
 
-The following keys can be used as both environment variables or keys in the `storage_options` parameter:
-<!-- source: https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html -->
-
-| Key | Description |
-|---------------------------------------|----------------------------------------------|
-| ``google_service_account`` / `service_account` | Path to the service account JSON file. |
-| ``google_service_account_key`` | The serialized service account key. |
-| ``google_application_credentials`` | Path to the application credentials. |
-
-
 ### Azure Blob Storage
 
-Azure Blob Storage credentials can be configured by setting the `AZURE_STORAGE_ACCOUNT_NAME`and `AZURE_STORAGE_ACCOUNT_KEY` environment variables. Alternatively, you can pass the account name and key in the `storage_options` parameter:
-
-=== "Python"
-
-    <!-- skip-test -->
-    ```python
-    import lancedb
-    db = await lancedb.connect_async(
-        "az://my-container/my-database",
-        storage_options={
-            account_name: "some-account",
-            account_key: "some-key",
-        }
-    )
-    ```
-
-=== "JavaScript"
-
-    ```javascript
-    const lancedb = require("lancedb");
-    const db = await lancedb.connect(
-        "az://my-container/my-database",
-        {
-            storageOptions: {
-                accountName: "some-account",
-                accountKey: "some-key",
-            }
-        }
-    );
-    ```
-
-These keys can be used as both environment variables or keys in the `storage_options` parameter:
-
-<!-- source: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html -->
-
-| Key | Description |
-|---------------------------------------|--------------------------------------------------------------------------------------------------|
-| ``azure_storage_account_name`` | The name of the azure storage account. |
-| ``azure_storage_account_key`` | The serialized service account key. |
-| ``azure_client_id`` | Service principal client id for authorizing requests. |
-| ``azure_client_secret`` | Service principal client secret for authorizing requests. |
-| ``azure_tenant_id`` | Tenant id used in oauth flows. |
-| ``azure_storage_sas_key`` | Shared access signature. The signature is expected to be percent-encoded, much like they are provided in the azure storage explorer or azure portal. |
-| ``azure_storage_token`` | Bearer token. |
-| ``azure_storage_use_emulator`` | Use object store with azurite storage emulator. |
-| ``azure_endpoint`` | Override the endpoint used to communicate with blob storage. |
-| ``azure_use_fabric_endpoint`` | Use object store with url scheme account.dfs.fabric.microsoft.com. |
-| ``azure_msi_endpoint`` | Endpoint to request a imds managed identity token. |
-| ``azure_object_id`` | Object id for use with managed identity authentication. |
-| ``azure_msi_resource_id`` | Msi resource id for use with managed identity authentication. |
-| ``azure_federated_token_file`` | File containing token for Azure AD workload identity federation. |
-| ``azure_use_azure_cli`` | Use azure cli for acquiring access token. |
-| ``azure_disable_tagging`` | Disables tagging objects. This can be desirable if not supported by the backing store. |
+Azure Blob Storage credentials can be configured by setting the `AZURE_STORAGE_ACCOUNT_NAME` and ``AZURE_STORAGE_ACCOUNT_KEY`` environment variables. The full list of environment variables that can be set are documented [here](https://docs.rs/object_store/latest/object_store/azure/struct.MicrosoftAzureBuilder.html#method.from_env).
 
 <!-- TODO: demonstrate how to configure networked file systems for optimal performance -->
```
```diff
@@ -142,7 +142,6 @@ rules are as follows:
 
 **`Example`**
 
-```ts
 import { fromTableToBuffer, makeArrowTable } from "../arrow";
 import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
 
```
```diff
@@ -66,7 +66,6 @@ Currently, Lance supports a growing list of SQL expressions.
 - `LIKE`, `NOT LIKE`
 - `CAST`
 - `regexp_match(column, pattern)`
-- [DataFusion Functions](https://arrow.apache.org/datafusion/user-guide/sql/scalar_functions.html)
 
 For example, the following filter string is acceptable:
 
```
```diff
@@ -1,5 +1,5 @@
 import glob
-from typing import Iterator, List
+from typing import Iterator
 from pathlib import Path
 
 glob_string = "../src/**/*.md"
@@ -50,24 +50,11 @@ def yield_lines(lines: Iterator[str], prefix: str, suffix: str):
             yield line[strip_length:]
 
 
-def wrap_async(lines: List[str]) -> List[str]:
-    # Indent all the lines
-    lines = ["    " + line for line in lines]
-    # Put all lines in `async def main():`
-    lines = ["async def main():\n"] + lines
-    # Put `import asyncio\n asyncio.run(main())` at the end
-    lines = lines + ["\n", "import asyncio\n", "asyncio.run(main())\n"]
-    return lines
-
-
 for file in filter(lambda file: file not in excluded_files, files):
     with open(file, "r") as f:
         lines = list(yield_lines(iter(f), "```", "```"))
 
         if len(lines) > 0:
-            if any("await" in line for line in lines):
-                lines = wrap_async(lines)
-
             print(lines)
             out_path = (
                 Path(python_folder)
```
**node/package-lock.json** (generated; 14 changed lines)

```diff
@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.4.17",
+  "version": "0.4.15",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.4.17",
+      "version": "0.4.15",
       "cpu": [
         "x64",
         "arm64"
@@ -52,11 +52,11 @@
       "uuid": "^9.0.0"
     },
     "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.4.17",
-      "@lancedb/vectordb-darwin-x64": "0.4.17",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.4.17",
-      "@lancedb/vectordb-linux-x64-gnu": "0.4.17",
-      "@lancedb/vectordb-win32-x64-msvc": "0.4.17"
+      "@lancedb/vectordb-darwin-arm64": "0.4.15",
+      "@lancedb/vectordb-darwin-x64": "0.4.15",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.4.15",
+      "@lancedb/vectordb-linux-x64-gnu": "0.4.15",
+      "@lancedb/vectordb-win32-x64-msvc": "0.4.15"
     },
     "peerDependencies": {
       "@apache-arrow/ts": "^14.0.2",
```
```diff
@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.4.17",
+  "version": "0.4.15",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -88,10 +88,10 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-arm64": "0.4.17",
-    "@lancedb/vectordb-darwin-x64": "0.4.17",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.4.17",
-    "@lancedb/vectordb-linux-x64-gnu": "0.4.17",
-    "@lancedb/vectordb-win32-x64-msvc": "0.4.17"
+    "@lancedb/vectordb-darwin-arm64": "0.4.15",
+    "@lancedb/vectordb-darwin-x64": "0.4.15",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.4.15",
+    "@lancedb/vectordb-linux-x64-gnu": "0.4.15",
+    "@lancedb/vectordb-win32-x64-msvc": "0.4.15"
   }
 }
```
```diff
@@ -78,25 +78,12 @@ export interface ConnectionOptions {
   /** User provided AWS crednetials.
    *
    * If not provided, LanceDB will use the default credentials provider chain.
-   *
-   * @deprecated Pass `aws_access_key_id`, `aws_secret_access_key`, and `aws_session_token`
-   * through `storageOptions` instead.
    */
   awsCredentials?: AwsCredentials
 
-  /** AWS region to connect to. Default is {@link defaultAwsRegion}
-   *
-   * @deprecated Pass `region` through `storageOptions` instead.
-   */
+  /** AWS region to connect to. Default is {@link defaultAwsRegion}. */
   awsRegion?: string
 
-  /**
-   * User provided options for object storage. For example, S3 credentials or request timeouts.
-   *
-   * The various options are described at https://lancedb.github.io/lancedb/guides/storage/
-   */
-  storageOptions?: Record<string, string>
-
   /**
    * API key for the remote connections
    *
@@ -189,6 +176,7 @@ export async function connect (
   if (typeof arg === 'string') {
     opts = { uri: arg }
   } else {
+    // opts = { uri: arg.uri, awsCredentials = arg.awsCredentials }
     const keys = Object.keys(arg)
     if (keys.length === 1 && keys[0] === 'uri' && typeof arg.uri === 'string') {
       opts = { uri: arg.uri }
@@ -210,26 +198,12 @@ export async function connect (
     // Remote connection
     return new RemoteConnection(opts)
   }
 
-  const storageOptions = opts.storageOptions ?? {};
-  if (opts.awsCredentials?.accessKeyId !== undefined) {
-    storageOptions.aws_access_key_id = opts.awsCredentials.accessKeyId
-  }
-  if (opts.awsCredentials?.secretKey !== undefined) {
-    storageOptions.aws_secret_access_key = opts.awsCredentials.secretKey
-  }
-  if (opts.awsCredentials?.sessionToken !== undefined) {
-    storageOptions.aws_session_token = opts.awsCredentials.sessionToken
-  }
-  if (opts.awsRegion !== undefined) {
-    storageOptions.region = opts.awsRegion
-  }
-  // It's a pain to pass a record to Rust, so we convert it to an array of key-value pairs
-  const storageOptionsArr = Object.entries(storageOptions);
-
   const db = await databaseNew(
     opts.uri,
-    storageOptionsArr,
+    opts.awsCredentials?.accessKeyId,
+    opts.awsCredentials?.secretKey,
+    opts.awsCredentials?.sessionToken,
+    opts.awsRegion,
     opts.readConsistencyInterval
   )
   return new LocalConnection(db, opts)
@@ -746,6 +720,7 @@ export class LocalConnection implements Connection {
     const tbl = await databaseOpenTable.call(
       this._db,
       name,
+      ...getAwsArgs(this._options())
     )
     if (embeddings !== undefined) {
       return new LocalTable(tbl, name, this._options(), embeddings)
```
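Taken together, the hunks above show one side of this compare folding the deprecated `awsCredentials`/`awsRegion` options into a single `storageOptions` record before handing it to `databaseNew`. A minimal sketch of what calling that `storageOptions`-based surface might look like — the bucket URI and credential values here are illustrative placeholders, not taken from this diff:

```typescript
// Sketch of the storageOptions-based connect() shown above.
// Bucket URI and credential values are invented for illustration.
import * as lancedb from "vectordb";

async function main(): Promise<void> {
  const db = await lancedb.connect({
    uri: "s3://example-bucket/path",
    storageOptions: {
      // Same keys the compatibility shim above derives from awsCredentials:
      aws_access_key_id: "my-access-key",
      aws_secret_access_key: "my-secret-key",
      region: "us-east-1",
    },
  });
  console.log(await db.tableNames());
}

main().catch(console.error);
```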
```diff
@@ -38,7 +38,7 @@ export class Query<T = number[]> {
   constructor (query?: T, tbl?: any, embeddings?: EmbeddingFunction<T>) {
     this._tbl = tbl
     this._query = query
-    this._limit = 10
+    this._limit = undefined
     this._nprobes = 20
     this._refineFactor = undefined
     this._select = undefined
@@ -50,7 +50,6 @@ export class Query<T = number[]> {
 
   /***
    * Sets the number of results that will be returned
-   * default value is 10
    * @param value number of results
    */
   limit (value: number): Query<T> {
```
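One behavioral consequence of the `_limit` change above: on the side that initializes `_limit = 10`, an unadorned search returns at most ten rows, while the other side leaves the limit unset. A small sketch that pins the limit explicitly so a query behaves the same on either side — the table name and query vector are made up, and `db` is assumed to be a connection handle like the one in the earlier sketch:

```typescript
// Pin the limit explicitly rather than relying on the constructor default,
// which differs between the two sides of the diff above.
const table = await db.openTable("my_vectors"); // hypothetical table name
const results = await table
  .search([0.1, 0.2, 0.3]) // illustrative query embedding
  .limit(10)               // explicit: identical behavior on both versions
  .execute();
console.log(results.length);
```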
```diff
@@ -103,22 +103,6 @@ function toLanceRes (res: AxiosResponse): RemoteResponse {
   }
 }
 
-async function decodeErrorData(
-  res: RemoteResponse,
-  responseType?: ResponseType
-): Promise<string> {
-  const errorData = await res.body()
-  if (responseType === 'arraybuffer') {
-    return new TextDecoder().decode(errorData)
-  } else {
-    if (typeof errorData === 'object') {
-      return JSON.stringify(errorData)
-    }
-
-    return errorData
-  }
-}
-
 export class HttpLancedbClient {
   private readonly _url: string
   private readonly _apiKey: () => string
@@ -196,7 +180,7 @@ export class HttpLancedbClient {
     }
 
     if (response.status !== 200) {
-      const errorData = await decodeErrorData(response)
+      const errorData = new TextDecoder().decode(await response.body())
       throw new Error(
         `Server Error, status: ${response.status}, ` +
         `message: ${response.statusText}: ${errorData}`
@@ -242,7 +226,7 @@ export class HttpLancedbClient {
     }
 
     if (response.status !== 200) {
-      const errorData = await decodeErrorData(response, responseType)
+      const errorData = new TextDecoder().decode(await response.body())
       throw new Error(
         `Server Error, status: ${response.status}, ` +
         `message: ${response.statusText}: ${errorData}`
```
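The `decodeErrorData` helper added above picks a decoding strategy by response type: `arraybuffer` bodies are raw bytes that need a `TextDecoder`, while JSON bodies arrive as already-parsed objects that need `JSON.stringify`. A quick illustration of the two shapes it handles — the payloads here are invented:

```typescript
// Illustration of the two error-body shapes decodeErrorData distinguishes.
const binaryBody = new TextEncoder().encode("table not found");
console.log(new TextDecoder().decode(binaryBody)); // "table not found"

const jsonBody = { message: "table not found", code: 404 };
console.log(JSON.stringify(jsonBody)); // {"message":"table not found","code":404}
// Interpolating jsonBody directly into a template string would yield
// "[object Object]", which is why the helper stringifies objects first.
```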
```diff
@@ -38,7 +38,7 @@ import {
   fromRecordsToStreamBuffer,
   fromTableToStreamBuffer
 } from '../arrow'
-import { toSQL, TTLCache } from '../util'
+import { toSQL } from '../util'
 import { type HttpMiddleware } from '../middleware'
 
 /**
@@ -47,7 +47,6 @@ import { type HttpMiddleware } from '../middleware'
 export class RemoteConnection implements Connection {
   private _client: HttpLancedbClient
   private readonly _dbName: string
-  private readonly _tableCache = new TTLCache(300_000)
 
   constructor (opts: ConnectionOptions) {
     if (!opts.uri.startsWith('db://')) {
@@ -90,9 +89,6 @@ export class RemoteConnection implements Connection {
       page_token: pageToken
     })
     const body = await response.body()
-    for (const table of body.tables) {
-      this._tableCache.set(table, true)
-    }
     return body.tables
   }
 
@@ -105,12 +101,6 @@
     name: string,
     embeddings?: EmbeddingFunction<T>
   ): Promise<Table<T>> {
-    // check if the table exists
-    if (this._tableCache.get(name) === undefined) {
-      await this._client.post(`/v1/table/${encodeURIComponent(name)}/describe/`)
-      this._tableCache.set(name, true)
-    }
-
     if (embeddings !== undefined) {
       return new RemoteTable(this._client, name, embeddings)
     } else {
```
@@ -166,7 +156,7 @@ export class RemoteConnection implements Connection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const res = await this._client.post(
|
const res = await this._client.post(
|
||||||
`/v1/table/${encodeURIComponent(tableName)}/create/`,
|
`/v1/table/${tableName}/create/`,
|
||||||
buffer,
|
buffer,
|
||||||
undefined,
|
undefined,
|
||||||
'application/vnd.apache.arrow.stream'
|
'application/vnd.apache.arrow.stream'
|
||||||
@@ -179,7 +169,6 @@ export class RemoteConnection implements Connection {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
this._tableCache.set(tableName, true)
|
|
||||||
if (embeddings === undefined) {
|
if (embeddings === undefined) {
|
||||||
return new RemoteTable(this._client, tableName)
|
return new RemoteTable(this._client, tableName)
|
||||||
} else {
|
} else {
|
||||||
@@ -188,8 +177,7 @@ export class RemoteConnection implements Connection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async dropTable (name: string): Promise<void> {
|
async dropTable (name: string): Promise<void> {
|
||||||
await this._client.post(`/v1/table/${encodeURIComponent(name)}/drop/`)
|
await this._client.post(`/v1/table/${name}/drop/`)
|
||||||
this._tableCache.delete(name)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
withMiddleware (middleware: HttpMiddleware): Connection {
|
withMiddleware (middleware: HttpMiddleware): Connection {
|
||||||
@@ -280,7 +268,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
 
   get schema (): Promise<any> {
     return this._client
-      .post(`/v1/table/${encodeURIComponent(this._name)}/describe/`)
+      .post(`/v1/table/${this._name}/describe/`)
       .then(async (res) => {
         if (res.status !== 200) {
           throw new Error(
@@ -294,7 +282,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
   }
 
   search (query: T): Query<T> {
-    return new RemoteQuery(query, this._client, encodeURIComponent(this._name)) //, this._embeddings_new)
+    return new RemoteQuery(query, this._client, this._name) //, this._embeddings_new)
   }
 
   filter (where: string): Query<T> {
@@ -336,7 +324,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
 
     const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
     const res = await this._client.post(
-      `/v1/table/${encodeURIComponent(this._name)}/merge_insert/`,
+      `/v1/table/${this._name}/merge_insert/`,
       buffer,
       queryParams,
       'application/vnd.apache.arrow.stream'
@@ -360,7 +348,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
 
     const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
     const res = await this._client.post(
-      `/v1/table/${encodeURIComponent(this._name)}/insert/`,
+      `/v1/table/${this._name}/insert/`,
       buffer,
       {
         mode: 'append'
@@ -386,7 +374,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
     }
     const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
     const res = await this._client.post(
-      `/v1/table/${encodeURIComponent(this._name)}/insert/`,
+      `/v1/table/${this._name}/insert/`,
       buffer,
       {
         mode: 'overwrite'
@@ -433,7 +421,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
       index_cache_size: indexCacheSize
     }
     const res = await this._client.post(
-      `/v1/table/${encodeURIComponent(this._name)}/create_index/`,
+      `/v1/table/${this._name}/create_index/`,
       data
     )
     if (res.status !== 200) {
@@ -454,7 +442,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
       replace: true
     }
     const res = await this._client.post(
-      `/v1/table/${encodeURIComponent(this._name)}/create_scalar_index/`,
+      `/v1/table/${this._name}/create_scalar_index/`,
       data
     )
     if (res.status !== 200) {
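For context, a minimal sketch (not part of the diff) of what the `encodeURIComponent` wrapping seen on one side of these hunks buys: a table name containing URL-significant characters stays a single path segment instead of mangling the route. The table name below is hypothetical.

```typescript
// Sketch: percent-encoding a table name before splicing it into a URL path.
const name = 'my table/v2' // hypothetical name with a space and a slash

// Unencoded, the slash introduces an extra path segment and the space is invalid:
//   /v1/table/my table/v2/describe/
// Encoded, the whole name survives as one segment:
//   /v1/table/my%20table%2Fv2/describe/
const path = `/v1/table/${encodeURIComponent(name)}/describe/`
console.log(path)
```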
@@ -466,15 +454,13 @@ export class RemoteTable<T = number[]> implements Table<T> {
     }
   }
 
-  async countRows (filter?: string): Promise<number> {
-    const result = await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/count_rows/`, {
-      predicate: filter
-    })
-    return (await result.body())
+  async countRows (): Promise<number> {
+    const result = await this._client.post(`/v1/table/${this._name}/describe/`)
+    return (await result.body())?.stats?.num_rows
   }
 
   async delete (filter: string): Promise<void> {
-    await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/delete/`, {
+    await this._client.post(`/v1/table/${this._name}/delete/`, {
       predicate: filter
     })
   }
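For context, a minimal sketch (not part of the diff) of the behavioral difference between the two `countRows` shapes above: the `count_rows` endpoint can count rows matching an optional predicate, while the `describe`-based variant reads only the table-wide `num_rows` statistic and cannot filter. The interface below is an assumption for illustration.

```typescript
// Sketch: the two variants answer different questions.
interface CountingTable {
  countRows: (filter?: string) => Promise<number>
}

async function report (table: CountingTable): Promise<void> {
  const total = await table.countRows() // supported by both variants
  const matching = await table.countRows('id > 10') // needs the count_rows endpoint
  console.log(`total=${total} matching=${matching}`)
}
```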
@@ -493,7 +479,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
         updates[key] = toSQL(value)
       }
     }
-    await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/update/`, {
+    await this._client.post(`/v1/table/${this._name}/update/`, {
       predicate: filter,
       updates: Object.entries(updates).map(([key, value]) => [key, value])
     })
@@ -501,7 +487,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
 
   async listIndices (): Promise<VectorIndex[]> {
     const results = await this._client.post(
-      `/v1/table/${encodeURIComponent(this._name)}/index/list/`
+      `/v1/table/${this._name}/index/list/`
     )
     return (await results.body()).indexes?.map((index: any) => ({
       columns: index.columns,
@@ -512,7 +498,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
 
   async indexStats (indexUuid: string): Promise<IndexStats> {
     const results = await this._client.post(
-      `/v1/table/${encodeURIComponent(this._name)}/index/${indexUuid}/stats/`
+      `/v1/table/${this._name}/index/${indexUuid}/stats/`
     )
     const body = await results.body()
     return {
@@ -42,7 +42,6 @@ import {
   Float16,
   Int64
 } from 'apache-arrow'
-import type { RemoteRequest, RemoteResponse } from '../middleware'
 
 const expect = chai.expect
 const assert = chai.assert
@@ -75,19 +74,6 @@ describe('LanceDB client', function () {
     assert.equal(con.uri, uri)
   })
 
-  it('should accept custom storage options', async function () {
-    const uri = await createTestDB()
-    const storageOptions = {
-      region: 'us-west-2',
-      timeout: '30s'
-    };
-    const con = await lancedb.connect({
-      uri,
-      storageOptions
-    })
-    assert.equal(con.uri, uri)
-  })
-
   it('should return the existing table names', async function () {
     const uri = await createTestDB()
     const con = await lancedb.connect(uri)
@@ -138,9 +124,9 @@ describe('LanceDB client', function () {
     const uri = await createTestDB(2, 100)
     const con = await lancedb.connect(uri)
     const table = (await con.openTable('vectors')) as LocalTable
-    let results = await table.filter('id % 2 = 0').limit(100).execute()
+    let results = await table.filter('id % 2 = 0').execute()
     assertResults(results)
-    results = await table.where('id % 2 = 0').limit(100).execute()
+    results = await table.where('id % 2 = 0').execute()
     assertResults(results)
 
     // Should reject a bad filter
@@ -927,22 +913,7 @@ describe('Remote LanceDB client', function () {
     }
 
     // Search
-    const table = await con.withMiddleware(new (class {
-      async onRemoteRequest(req: RemoteRequest, next: (req: RemoteRequest) => Promise<RemoteResponse>) {
-        // intercept call to check if the table exists and make the call succeed
-        if (req.uri.endsWith('/describe/')) {
-          return {
-            status: 200,
-            statusText: 'OK',
-            headers: new Map(),
-            body: async () => ({})
-          }
-        }
-
-        return await next(req)
-      }
-    })()).openTable('vectors')
-
+    const table = await con.openTable('vectors')
     try {
       await table.search([0.1, 0.3]).execute()
     } catch (err) {
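For context, a minimal sketch (not part of the diff) of the `withMiddleware` hook the removed test exercised: a middleware observes each outgoing request and may either forward it with `next` or answer it directly, as the deleted `/describe/` interceptor did. The shapes below are inferred from that test code and are approximate.

```typescript
// Sketch: a pass-through middleware that logs every remote request.
interface RemoteRequest { uri: string }
interface RemoteResponse { status: number, statusText: string, body: () => Promise<any> }

class LoggingMiddleware {
  async onRemoteRequest (
    req: RemoteRequest,
    next: (req: RemoteRequest) => Promise<RemoteResponse>
  ): Promise<RemoteResponse> {
    console.log(`-> ${req.uri}`)
    const res = await next(req) // forward the request unchanged
    console.log(`<- ${res.status} ${res.statusText}`)
    return res
  }
}

// Hypothetical usage: const conWithLogs = con.withMiddleware(new LoggingMiddleware())
```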
@@ -42,36 +42,3 @@ export function toSQL (value: Literal): string {
   // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
   throw new Error(`Unsupported value type: ${typeof value} value: (${value})`)
 }
-
-export class TTLCache {
-  private readonly cache: Map<string, { value: any, expires: number }>
-
-  /**
-   * @param ttl Time to live in milliseconds
-   */
-  constructor (private readonly ttl: number) {
-    this.cache = new Map()
-  }
-
-  get (key: string): any | undefined {
-    const entry = this.cache.get(key)
-    if (entry === undefined) {
-      return undefined
-    }
-
-    if (entry.expires < Date.now()) {
-      this.cache.delete(key)
-      return undefined
-    }
-
-    return entry.value
-  }
-
-  set (key: string, value: any): void {
-    this.cache.set(key, { value, expires: Date.now() + this.ttl })
-  }
-
-  delete (key: string): void {
-    this.cache.delete(key)
-  }
-}
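For context, a minimal sketch (not part of the diff) of how `RemoteConnection` used the `TTLCache` removed above: a successful `/describe/` probe was remembered for five minutes (the 300_000 ms TTL), so repeated `openTable` calls skipped the existence check. The `probe` callback is hypothetical.

```typescript
// Sketch: memoizing a table-existence probe with the TTLCache shown above.
const tableCache = new TTLCache(300_000) // entries expire after 5 minutes

async function ensureTableExists (
  name: string,
  probe: (name: string) => Promise<void> // e.g. POST /v1/table/<name>/describe/
): Promise<void> {
  if (tableCache.get(name) === undefined) {
    await probe(name) // throws if the table does not exist
    tableCache.set(name, true) // remember the positive answer until the TTL lapses
  }
}
```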
@@ -1,219 +0,0 @@
-// Copyright 2024 Lance Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/* eslint-disable @typescript-eslint/naming-convention */
-
-import { connect } from "../dist";
-import {
-  CreateBucketCommand,
-  DeleteBucketCommand,
-  DeleteObjectCommand,
-  HeadObjectCommand,
-  ListObjectsV2Command,
-  S3Client,
-} from "@aws-sdk/client-s3";
-import {
-  CreateKeyCommand,
-  ScheduleKeyDeletionCommand,
-  KMSClient,
-} from "@aws-sdk/client-kms";
-
-// Skip these tests unless the S3_TEST environment variable is set
-const maybeDescribe = process.env.S3_TEST ? describe : describe.skip;
-
-// These are all keys that are accepted by storage_options
-const CONFIG = {
-  allowHttp: "true",
-  awsAccessKeyId: "ACCESSKEY",
-  awsSecretAccessKey: "SECRETKEY",
-  awsEndpoint: "http://127.0.0.1:4566",
-  awsRegion: "us-east-1",
-};
-
-class S3Bucket {
-  name: string;
-  constructor(name: string) {
-    this.name = name;
-  }
-
-  static s3Client() {
-    return new S3Client({
-      region: CONFIG.awsRegion,
-      credentials: {
-        accessKeyId: CONFIG.awsAccessKeyId,
-        secretAccessKey: CONFIG.awsSecretAccessKey,
-      },
-      endpoint: CONFIG.awsEndpoint,
-    });
-  }
-
-  public static async create(name: string): Promise<S3Bucket> {
-    const client = this.s3Client();
-    // Delete the bucket if it already exists
-    try {
-      await this.deleteBucket(client, name);
-    } catch (e) {
-      // It's fine if the bucket doesn't exist
-    }
-    await client.send(new CreateBucketCommand({ Bucket: name }));
-    return new S3Bucket(name);
-  }
-
-  public async delete() {
-    const client = S3Bucket.s3Client();
-    await S3Bucket.deleteBucket(client, this.name);
-  }
-
-  static async deleteBucket(client: S3Client, name: string) {
-    // Must delete all objects before we can delete the bucket
-    const objects = await client.send(
-      new ListObjectsV2Command({ Bucket: name }),
-    );
-    if (objects.Contents) {
-      for (const object of objects.Contents) {
-        await client.send(
-          new DeleteObjectCommand({ Bucket: name, Key: object.Key }),
-        );
-      }
-    }
-
-    await client.send(new DeleteBucketCommand({ Bucket: name }));
-  }
-
-  public async assertAllEncrypted(path: string, keyId: string) {
-    const client = S3Bucket.s3Client();
-    const objects = await client.send(
-      new ListObjectsV2Command({ Bucket: this.name, Prefix: path }),
-    );
-    if (objects.Contents) {
-      for (const object of objects.Contents) {
-        const metadata = await client.send(
-          new HeadObjectCommand({ Bucket: this.name, Key: object.Key }),
-        );
-        expect(metadata.ServerSideEncryption).toBe("aws:kms");
-        expect(metadata.SSEKMSKeyId).toContain(keyId);
-      }
-    }
-  }
-}
-
-class KmsKey {
-  keyId: string;
-  constructor(keyId: string) {
-    this.keyId = keyId;
-  }
-
-  static kmsClient() {
-    return new KMSClient({
-      region: CONFIG.awsRegion,
-      credentials: {
-        accessKeyId: CONFIG.awsAccessKeyId,
-        secretAccessKey: CONFIG.awsSecretAccessKey,
-      },
-      endpoint: CONFIG.awsEndpoint,
-    });
-  }
-
-  public static async create(): Promise<KmsKey> {
-    const client = this.kmsClient();
-    const key = await client.send(new CreateKeyCommand({}));
-    const keyId = key?.KeyMetadata?.KeyId;
-    if (!keyId) {
-      throw new Error("Failed to create KMS key");
-    }
-    return new KmsKey(keyId);
-  }
-
-  public async delete() {
-    const client = KmsKey.kmsClient();
-    await client.send(new ScheduleKeyDeletionCommand({ KeyId: this.keyId }));
-  }
-}
-
-maybeDescribe("storage_options", () => {
-  let bucket: S3Bucket;
-  let kmsKey: KmsKey;
-  beforeAll(async () => {
-    bucket = await S3Bucket.create("lancedb");
-    kmsKey = await KmsKey.create();
-  });
-  afterAll(async () => {
-    await kmsKey.delete();
-    await bucket.delete();
-  });
-
-  it("can be used to configure auth and endpoints", async () => {
-    const uri = `s3://${bucket.name}/test`;
-    const db = await connect(uri, { storageOptions: CONFIG });
-
-    let table = await db.createTable("test", [{ a: 1, b: 2 }]);
-
-    let rowCount = await table.countRows();
-    expect(rowCount).toBe(1);
-
-    let tableNames = await db.tableNames();
-    expect(tableNames).toEqual(["test"]);
-
-    table = await db.openTable("test");
-    rowCount = await table.countRows();
-    expect(rowCount).toBe(1);
-
-    await table.add([
-      { a: 2, b: 3 },
-      { a: 3, b: 4 },
-    ]);
-    rowCount = await table.countRows();
-    expect(rowCount).toBe(3);
-
-    await db.dropTable("test");
-
-    tableNames = await db.tableNames();
-    expect(tableNames).toEqual([]);
-  });
-
-  it("can configure encryption at connection and table level", async () => {
-    const uri = `s3://${bucket.name}/test`;
-    let db = await connect(uri, { storageOptions: CONFIG });
-
-    let table = await db.createTable("table1", [{ a: 1, b: 2 }], {
-      storageOptions: {
-        awsServerSideEncryption: "aws:kms",
-        awsSseKmsKeyId: kmsKey.keyId,
-      },
-    });
-
-    let rowCount = await table.countRows();
-    expect(rowCount).toBe(1);
-
-    await table.add([{ a: 2, b: 3 }]);
-
-    await bucket.assertAllEncrypted("test/table1.lance", kmsKey.keyId);
-
-    // Now with encryption settings at connection level
-    db = await connect(uri, {
-      storageOptions: {
-        ...CONFIG,
-        awsServerSideEncryption: "aws:kms",
-        awsSseKmsKeyId: kmsKey.keyId,
-      },
-    });
-    table = await db.createTable("table2", [{ a: 1, b: 2 }]);
-    rowCount = await table.countRows();
-    expect(rowCount).toBe(1);
-
-    await table.add([{ a: 2, b: 3 }]);
-
-    await bucket.assertAllEncrypted("test/table2.lance", kmsKey.keyId);
-  });
-});
@@ -13,32 +13,10 @@
 // limitations under the License.
 
 import { fromTableToBuffer, makeArrowTable, makeEmptyTable } from "./arrow";
-import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
+import { Connection as LanceDbConnection } from "./native";
 import { Table } from "./table";
 import { Table as ArrowTable, Schema } from "apache-arrow";
 
-/**
- * Connect to a LanceDB instance at the given URI.
- *
- * Accpeted formats:
- *
- * - `/path/to/database` - local database
- * - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
- * - `db://host:port` - remote database (LanceDB cloud)
- * @param {string} uri - The uri of the database. If the database uri starts
- * with `db://` then it connects to a remote database.
- * @see {@link ConnectionOptions} for more details on the URI format.
- */
-export async function connect(
-  uri: string,
-  opts?: Partial<ConnectionOptions>,
-): Promise<Connection> {
-  opts = opts ?? {};
-  opts.storageOptions = cleanseStorageOptions(opts.storageOptions);
-  const nativeConn = await LanceDbConnection.new(uri, opts);
-  return new Connection(nativeConn);
-}
-
 export interface CreateTableOptions {
   /**
    * The mode to use when creating the table.
@@ -55,28 +33,6 @@ export interface CreateTableOptions {
   * then no error will be raised.
   */
  existOk: boolean;
-
-  /**
-   * Configuration for object storage.
-   *
-   * Options already set on the connection will be inherited by the table,
-   * but can be overridden here.
-   *
-   * The available options are described at https://lancedb.github.io/lancedb/guides/storage/
-   */
-  storageOptions?: Record<string, string>;
-}
-
-export interface OpenTableOptions {
-  /**
-   * Configuration for object storage.
-   *
-   * Options already set on the connection will be inherited by the table,
-   * but can be overridden here.
-   *
-   * The available options are described at https://lancedb.github.io/lancedb/guides/storage/
-   */
-  storageOptions?: Record<string, string>;
 }
 
 export interface TableNamesOptions {
@@ -153,14 +109,8 @@ export class Connection {
   * Open a table in the database.
   * @param {string} name - The name of the table
   */
-  async openTable(
-    name: string,
-    options?: Partial<OpenTableOptions>,
-  ): Promise<Table> {
-    const innerTable = await this.inner.openTable(
-      name,
-      cleanseStorageOptions(options?.storageOptions),
-    );
+  async openTable(name: string): Promise<Table> {
+    const innerTable = await this.inner.openTable(name);
     return new Table(innerTable);
   }
 
@@ -189,12 +139,7 @@ export class Connection {
       table = makeArrowTable(data);
     }
     const buf = await fromTableToBuffer(table);
-    const innerTable = await this.inner.createTable(
-      name,
-      buf,
-      mode,
-      cleanseStorageOptions(options?.storageOptions),
-    );
+    const innerTable = await this.inner.createTable(name, buf, mode);
     return new Table(innerTable);
   }
 
@@ -217,12 +162,7 @@ export class Connection {
 
     const table = makeEmptyTable(schema);
     const buf = await fromTableToBuffer(table);
-    const innerTable = await this.inner.createEmptyTable(
-      name,
-      buf,
-      mode,
-      cleanseStorageOptions(options?.storageOptions),
-    );
+    const innerTable = await this.inner.createEmptyTable(name, buf, mode);
     return new Table(innerTable);
   }
 
@@ -234,43 +174,3 @@ export class Connection {
     return this.inner.dropTable(name);
   }
 }
-
-/**
- * Takes storage options and makes all the keys snake case.
- */
-function cleanseStorageOptions(
-  options?: Record<string, string>,
-): Record<string, string> | undefined {
-  if (options === undefined) {
-    return undefined;
-  }
-  const result: Record<string, string> = {};
-  for (const [key, value] of Object.entries(options)) {
-    if (value !== undefined) {
-      const newKey = camelToSnakeCase(key);
-      result[newKey] = value;
-    }
-  }
-  return result;
-}
-
-/**
- * Convert a string to snake case. It might already be snake case, in which case it is
- * returned unchanged.
- */
-function camelToSnakeCase(camel: string): string {
-  if (camel.includes("_")) {
-    // Assume if there is at least one underscore, it is already snake case
-    return camel;
-  }
-  if (camel.toLocaleUpperCase() === camel) {
-    // Assume if the string is all uppercase, it is already snake case
-    return camel;
-  }
-
-  let result = camel.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
-  if (result.startsWith("_")) {
-    result = result.slice(1);
-  }
-  return result;
-}
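For context, a minimal sketch (not part of the diff) of what the removed `cleanseStorageOptions`/`camelToSnakeCase` pair did to option keys before they crossed into the native layer: camelCase keys become snake_case, while keys that already contain an underscore or are entirely uppercase pass through unchanged. The option values below are hypothetical.

```typescript
// Sketch: expected key-cleansing behavior, assuming the functions defined above.
const cleansed = cleanseStorageOptions({
  awsRegion: "us-west-2",                // camelCase      -> aws_region
  aws_endpoint: "http://127.0.0.1:4566", // has underscore -> unchanged
  TIMEOUT: "30s",                        // all uppercase  -> unchanged
});
console.log(cleansed);
// { aws_region: "us-west-2", aws_endpoint: "http://127.0.0.1:4566", TIMEOUT: "30s" }
```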
@@ -12,6 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+import { Connection } from "./connection";
+import {
+  Connection as LanceDbConnection,
+  ConnectionOptions,
+} from "./native.js";
+
 export {
   WriteOptions,
   WriteMode,
@@ -26,7 +32,6 @@ export {
   VectorColumnOptions,
 } from "./arrow";
 export {
-  connect,
   Connection,
   CreateTableOptions,
   TableNamesOptions,
@@ -41,3 +46,24 @@ export {
 export { Index, IndexOptions, IvfPqOptions } from "./indices";
 export { Table, AddDataOptions, IndexConfig, UpdateOptions } from "./table";
 export * as embedding from "./embedding";
+
+/**
+ * Connect to a LanceDB instance at the given URI.
+ *
+ * Accpeted formats:
+ *
+ * - `/path/to/database` - local database
+ * - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
+ * - `db://host:port` - remote database (LanceDB cloud)
+ * @param {string} uri - The uri of the database. If the database uri starts
+ * with `db://` then it connects to a remote database.
+ * @see {@link ConnectionOptions} for more details on the URI format.
+ */
+export async function connect(
+  uri: string,
+  opts?: Partial<ConnectionOptions>,
+): Promise<Connection> {
+  opts = opts ?? {};
+  const nativeConn = await LanceDbConnection.new(uri, opts);
+  return new Connection(nativeConn);
+}
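For context, a minimal sketch (not part of the diff) exercising the `connect` entry point with the three URI shapes its docstring lists; the paths, bucket, and host below are placeholders.

```typescript
// Sketch: connecting to local, object-storage, and remote databases.
import { connect } from "@lancedb/lancedb";

async function demo(): Promise<void> {
  const local = await connect("/tmp/lancedb-demo"); // local directory
  const cloud = await connect("s3://my-bucket/path/to/database"); // cloud storage (placeholder bucket)
  const remote = await connect("db://host:port"); // LanceDB cloud (placeholder host)
  console.log(await local.tableNames());
  void cloud;
  void remote;
}
```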
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.4.17",
+  "version": "0.4.15",
   "os": [
     "darwin"
   ],
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.4.17",
+  "version": "0.4.15",
   "os": [
     "darwin"
   ],
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.4.17",
+  "version": "0.4.15",
   "os": [
     "linux"
   ],
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.4.17",
+  "version": "0.4.15",
   "os": [
     "linux"
   ],
nodejs/package-lock.json (generated, 1689 changed lines): file diff suppressed because it is too large.
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.4.17",
+  "version": "0.4.15",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
   "napi": {
@@ -18,8 +18,6 @@
   },
   "license": "Apache 2.0",
   "devDependencies": {
-    "@aws-sdk/client-s3": "^3.33.0",
-    "@aws-sdk/client-kms": "^3.33.0",
     "@napi-rs/cli": "^2.18.0",
     "@types/jest": "^29.1.2",
     "@types/tmp": "^0.2.6",
@@ -65,16 +63,15 @@
     "lint": "eslint lancedb && eslint __test__",
     "prepublishOnly": "napi prepublish -t npm",
     "test": "npm run build && jest --verbose",
-    "integration": "S3_TEST=1 npm run test",
     "universal": "napi universal",
     "version": "napi version"
   },
   "optionalDependencies": {
-    "@lancedb/lancedb-darwin-arm64": "0.4.17",
-    "@lancedb/lancedb-darwin-x64": "0.4.17",
-    "@lancedb/lancedb-linux-arm64-gnu": "0.4.17",
-    "@lancedb/lancedb-linux-x64-gnu": "0.4.17",
-    "@lancedb/lancedb-win32-x64-msvc": "0.4.17"
+    "@lancedb/lancedb-darwin-arm64": "0.4.15",
+    "@lancedb/lancedb-darwin-x64": "0.4.15",
+    "@lancedb/lancedb-linux-arm64-gnu": "0.4.15",
+    "@lancedb/lancedb-linux-x64-gnu": "0.4.15",
+    "@lancedb/lancedb-win32-x64-msvc": "0.4.15"
   },
   "dependencies": {
     "openai": "^4.29.2",
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::HashMap;
-
 use napi::bindgen_prelude::*;
 use napi_derive::*;
 
@@ -66,11 +64,6 @@ impl Connection {
             builder =
                 builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));
         }
-        if let Some(storage_options) = options.storage_options {
-            for (key, value) in storage_options {
-                builder = builder.storage_option(key, value);
-            }
-        }
         Ok(Self::inner_new(
             builder
                 .execute()
@@ -125,18 +118,14 @@ impl Connection {
         name: String,
         buf: Buffer,
         mode: String,
-        storage_options: Option<HashMap<String, String>>,
     ) -> napi::Result<Table> {
         let batches = ipc_file_to_batches(buf.to_vec())
             .map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
         let mode = Self::parse_create_mode_str(&mode)?;
-        let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);
-        if let Some(storage_options) = storage_options {
-            for (key, value) in storage_options {
-                builder = builder.storage_option(key, value);
-            }
-        }
-        let tbl = builder
+        let tbl = self
+            .get_inner()?
+            .create_table(&name, batches)
+            .mode(mode)
             .execute()
             .await
             .map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
@@ -149,22 +138,15 @@ impl Connection {
         name: String,
         schema_buf: Buffer,
         mode: String,
-        storage_options: Option<HashMap<String, String>>,
     ) -> napi::Result<Table> {
         let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
             napi::Error::from_reason(format!("Failed to marshal schema from JS to Rust: {}", e))
         })?;
         let mode = Self::parse_create_mode_str(&mode)?;
-        let mut builder = self
+        let tbl = self
             .get_inner()?
             .create_empty_table(&name, schema)
-            .mode(mode);
-        if let Some(storage_options) = storage_options {
-            for (key, value) in storage_options {
-                builder = builder.storage_option(key, value);
-            }
-        }
-        let tbl = builder
+            .mode(mode)
             .execute()
             .await
             .map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
@@ -172,18 +154,10 @@ impl Connection {
     }
 
     #[napi]
-    pub async fn open_table(
-        &self,
-        name: String,
-        storage_options: Option<HashMap<String, String>>,
-    ) -> napi::Result<Table> {
-        let mut builder = self.get_inner()?.open_table(&name);
-        if let Some(storage_options) = storage_options {
-            for (key, value) in storage_options {
-                builder = builder.storage_option(key, value);
-            }
-        }
-        let tbl = builder
+    pub async fn open_table(&self, name: String) -> napi::Result<Table> {
+        let tbl = self
+            .get_inner()?
+            .open_table(&name)
             .execute()
             .await
             .map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::HashMap;
+use connection::Connection;
 
 use napi_derive::*;
 
 mod connection;
@@ -39,10 +38,6 @@ pub struct ConnectionOptions {
     /// Note: this consistency only applies to read operations. Write operations are
     /// always consistent.
     pub read_consistency_interval: Option<f64>,
-    /// (For LanceDB OSS only): configuration for object storage.
-    ///
-    /// The available options are described at https://lancedb.github.io/lancedb/guides/storage/
-    pub storage_options: Option<HashMap<String, String>>,
 }
 
 /// Write mode for writing a table.
@@ -59,7 +54,7 @@ pub struct WriteOptions {
     pub mode: Option<WriteMode>,
 }
 
-#[napi(object)]
-pub struct OpenTableOptions {
-    pub storage_options: Option<HashMap<String, String>>,
+#[napi]
+pub async fn connect(uri: String, options: ConnectionOptions) -> napi::Result<Connection> {
+    Connection::new(uri, options).await
 }
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.8
+current_version = 0.6.6
 commit = True
 message = [python] Bump version: {current_version} → {new_version}
 tag = True
@@ -31,6 +31,3 @@ pyo3-build-config = { version = "0.20.3", features = [
     "extension-module",
     "abi3-py38",
 ] }
-
-[features]
-fp16kernels = ["lancedb/fp16kernels"]
@@ -41,7 +41,7 @@ To build the python package you can use maturin:
 ```bash
 # This will build the rust bindings and place them in the appropriate place
 # in your venv or conda environment
-maturin develop
+matruin develop
 ```
 
 To run the unit tests:
@@ -1,17 +1,19 @@
 [project]
 name = "lancedb"
-version = "0.6.8"
+version = "0.6.6"
 dependencies = [
     "deprecation",
-    "pylance==0.10.10",
+    "pylance==0.10.6",
     "ratelimiter~=1.0",
-    "requests>=2.31.0",
     "retry>=0.9.2",
     "tqdm>=4.27.0",
     "pydantic>=1.10",
     "attrs>=21.3.0",
     "semver>=3.0",
     "cachetools",
+    "pyyaml>=6.0",
+    "click>=8.1.7",
+    "requests>=2.31.0",
     "overrides>=0.7",
 ]
 description = "lancedb"
@@ -39,7 +41,6 @@ classifiers = [
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
     "Topic :: Scientific/Engineering",
 ]
 
@@ -49,7 +50,6 @@ repository = "https://github.com/lancedb/lancedb"
 [project.optional-dependencies]
 tests = [
     "aiohttp",
-    "boto3",
     "pandas>=1.4",
     "pytest",
     "pytest-mock",
@@ -57,7 +57,6 @@ tests = [
     "duckdb",
     "pytz",
     "polars>=0.19",
-    "tantivy"
 ]
 dev = ["ruff", "pre-commit"]
 docs = [
@@ -88,17 +87,19 @@ azure = ["adlfs>=2024.2.0"]
 python-source = "python"
 module-name = "lancedb._lancedb"
 
+[project.scripts]
+lancedb = "lancedb.cli.cli:cli"
+
 [build-system]
 requires = ["maturin>=1.4"]
 build-backend = "maturin"
 
 [tool.ruff.lint]
-select = ["F", "E", "W", "G", "TCH", "PERF"]
+select = ["F", "E", "W", "I", "G", "TCH", "PERF"]
 
 [tool.pytest.ini_options]
 addopts = "--strict-markers --ignore-glob=lancedb/embeddings/*.py"
 markers = [
     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
     "asyncio",
-    "s3_test"
 ]
@@ -15,7 +15,7 @@ import importlib.metadata
 import os
 from concurrent.futures import ThreadPoolExecutor
 from datetime import timedelta
-from typing import Dict, Optional, Union
+from typing import Optional, Union
 
 __version__ = importlib.metadata.version("lancedb")
 
@@ -25,6 +25,7 @@ from .db import AsyncConnection, DBConnection, LanceDBConnection
 from .remote.db import RemoteDBConnection
 from .schema import vector
 from .table import AsyncTable
+from .utils import sentry_log
 
 
 def connect(
@@ -118,7 +119,6 @@ async def connect_async(
     host_override: Optional[str] = None,
     read_consistency_interval: Optional[timedelta] = None,
     request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
-    storage_options: Optional[Dict[str, str]] = None,
 ) -> AsyncConnection:
     """Connect to a LanceDB database.
 
@@ -145,9 +145,6 @@ async def connect_async(
         the last check, then the table will be checked for updates. Note: this
         consistency only applies to read operations. Write operations are
         always consistent.
-    storage_options: dict, optional
-        Additional options for the storage backend. See available options at
-        https://lancedb.github.io/lancedb/guides/storage/
 
     Examples
     --------
@@ -176,7 +173,6 @@ async def connect_async(
             region,
             host_override,
             read_consistency_interval_secs,
-            storage_options,
         )
     )
 
@@ -188,6 +184,7 @@ __all__ = [
     "AsyncTable",
     "URI",
     "sanitize_uri",
+    "sentry_log",
     "vector",
     "DBConnection",
     "LanceDBConnection",
@@ -19,18 +19,10 @@ class Connection(object):
         self, start_after: Optional[str], limit: Optional[int]
     ) -> list[str]: ...
     async def create_table(
-        self,
-        name: str,
-        mode: str,
-        data: pa.RecordBatchReader,
-        storage_options: Optional[Dict[str, str]] = None,
+        self, name: str, mode: str, data: pa.RecordBatchReader
     ) -> Table: ...
     async def create_empty_table(
-        self,
-        name: str,
-        mode: str,
-        schema: pa.Schema,
-        storage_options: Optional[Dict[str, str]] = None,
+        self, name: str, mode: str, schema: pa.Schema
     ) -> Table: ...
 
 class Table:
python/python/lancedb/cli/__init__.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+# Copyright 2023 LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
python/python/lancedb/cli/cli.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+# Copyright 2023 LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import click
+
+from lancedb.utils import CONFIG
+
+
+@click.group()
+@click.version_option(help="LanceDB command line interface entry point")
+def cli():
+    "LanceDB command line interface"
+
+
+diagnostics_help = """
+Enable or disable LanceDB diagnostics. When enabled, LanceDB will send anonymous events
+to help us improve LanceDB. These diagnostics are used only for error reporting and no
+data is collected. You can find more about diagnosis on our docs:
+https://lancedb.github.io/lancedb/cli_config/
+"""
+
+
+@cli.command(help=diagnostics_help)
+@click.option("--enabled/--disabled", default=True)
+def diagnostics(enabled):
+    CONFIG.update({"diagnostics": True if enabled else False})
+    click.echo("LanceDB diagnostics is %s" % ("enabled" if enabled else "disabled"))
+
+
+@cli.command(help="Show current LanceDB configuration")
+def config():
+    # TODO: pretty print as table with colors and formatting
+    click.echo("Current LanceDB configuration:")
+    cfg = CONFIG.copy()
+    cfg.pop("uuid")  # Don't show uuid as it is not configurable
+    for item, amount in cfg.items():
+        click.echo("{} ({})".format(item, amount))
@@ -18,13 +18,14 @@ import inspect
 import os
 from abc import abstractmethod
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Iterable, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Union
 
 import pyarrow as pa
 from overrides import EnforceOverrides, override
 from pyarrow import fs
 
 from lancedb.common import data_to_reader, validate_schema
+from lancedb.utils.events import register_event
 
 from ._lancedb import connect as lancedb_connect
 from .pydantic import LanceModel
@@ -533,7 +534,6 @@ class AsyncConnection(object):
         exist_ok: Optional[bool] = None,
         on_bad_vectors: Optional[str] = None,
         fill_value: Optional[float] = None,
-        storage_options: Optional[Dict[str, str]] = None,
     ) -> AsyncTable:
         """Create an [AsyncTable][lancedb.table.AsyncTable] in the database.
 
@@ -571,12 +571,6 @@ class AsyncConnection(object):
             One of "error", "drop", "fill".
         fill_value: float
             The value to use when filling vectors. Only used if on_bad_vectors="fill".
-        storage_options: dict, optional
-            Additional options for the storage backend. Options already set on the
-            connection will be inherited by the table, but can be overridden here.
-            See available options at
-            https://lancedb.github.io/lancedb/guides/storage/
-
 
         Returns
         -------
@@ -736,40 +730,32 @@ class AsyncConnection(object):
             mode = "exist_ok"
 
         if data is None:
-            new_table = await self._inner.create_empty_table(
-                name, mode, schema, storage_options=storage_options
-            )
+            new_table = await self._inner.create_empty_table(name, mode, schema)
         else:
             data = data_to_reader(data, schema)
             new_table = await self._inner.create_table(
                 name,
                 mode,
                 data,
-                storage_options=storage_options,
             )
 
+        register_event("create_table")
         return AsyncTable(new_table)
 
-    async def open_table(
-        self, name: str, storage_options: Optional[Dict[str, str]] = None
-    ) -> Table:
+    async def open_table(self, name: str) -> Table:
         """Open a Lance Table in the database.
 
         Parameters
         ----------
         name: str
             The name of the table.
-        storage_options: dict, optional
-            Additional options for the storage backend. Options already set on the
-            connection will be inherited by the table, but can be overridden here.
-            See available options at
-            https://lancedb.github.io/lancedb/guides/storage/
 
         Returns
         -------
         A LanceTable object representing the table.
         """
-        table = await self._inner.open_table(name, storage_options)
+        table = await self._inner.open_table(name)
+        register_event("open_table")
         return AsyncTable(table)
 
     async def drop_table(self, name: str):
@@ -10,6 +10,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 # ruff: noqa: F401
 from .base import EmbeddingFunction, EmbeddingFunctionConfig, TextEmbeddingFunction
 from .bedrock import BedRockText
@@ -20,7 +21,4 @@ from .open_clip import OpenClipEmbeddings
 from .openai import OpenAIEmbeddings
 from .registry import EmbeddingFunctionRegistry, get_registry
 from .sentence_transformers import SentenceTransformerEmbeddings
-from .gte import GteEmbeddings
-from .transformers import TransformersEmbeddingFunction, ColbertEmbeddings
-from .imagebind import ImageBindEmbeddings
 from .utils import with_embeddings
@@ -78,9 +78,6 @@ class BedRockText(TextEmbeddingFunction):
 
         class Config:
             keep_untouched = (cached_property,)
-    else:
-        model_config = dict()
-        model_config["ignored_types"] = (cached_property,)
 
     def ndims(self):
         # return len(self._generate_embedding("test"))
@@ -94,9 +94,6 @@ class GeminiText(TextEmbeddingFunction):
 
         class Config:
             keep_untouched = (cached_property,)
-    else:
-        model_config = dict()
-        model_config["ignored_types"] = (cached_property,)
 
     def ndims(self):
         # TODO: fix hardcoding
@@ -22,8 +22,6 @@ from .base import EmbeddingFunction
 from .registry import register
 from .utils import AUDIO, IMAGES, TEXT
 
-from lancedb.pydantic import PYDANTIC_VERSION
-
 
 @register("imagebind")
 class ImageBindEmbeddings(EmbeddingFunction):
@@ -40,14 +38,6 @@ class ImageBindEmbeddings(EmbeddingFunction):
     device: str = "cpu"
     normalize: bool = False
 
-    if PYDANTIC_VERSION < (2, 0):  # Pydantic 1.x compat
-
-        class Config:
-            keep_untouched = (cached_property,)
-    else:
-        model_config = dict()
-        model_config["ignored_types"] = (cached_property,)
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._ndims = 1024
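The blocks removed from BedRockText, GeminiText, and ImageBindEmbeddings above all shared the same version gate for letting cached_property coexist with a pydantic model. A self-contained sketch of that pattern, independent of lancedb (the class and field names here are illustrative):

    from functools import cached_property

    import pydantic

    PYDANTIC_VERSION = tuple(int(p) for p in pydantic.VERSION.split(".")[:2])


    class Example(pydantic.BaseModel):
        name: str = "demo"

        # Tell pydantic not to treat cached_property descriptors as fields;
        # v1 and v2 spell this differently, which is what the removed code handled.
        if PYDANTIC_VERSION < (2, 0):  # Pydantic 1.x compat

            class Config:
                keep_untouched = (cached_property,)

        else:
            model_config = {"ignored_types": (cached_property,)}

        @cached_property
        def expensive(self) -> int:
            return 42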
@@ -1,106 +0,0 @@
-# Copyright (c) 2023. LanceDB Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import cached_property
-from typing import List, Any
-
-import numpy as np
-from pydantic import PrivateAttr
-from lancedb.pydantic import PYDANTIC_VERSION
-
-from ..util import attempt_import_or_raise
-from .base import EmbeddingFunction
-from .registry import register
-from .utils import TEXT
-
-
-@register("huggingface")
-class TransformersEmbeddingFunction(EmbeddingFunction):
-    """
-    An embedding function that can use any model from the transformers library.
-
-    Parameters
-    ----------
-    name : str
-        The name of the model to use. This should be a model name that can be loaded
-        by transformers.AutoModel.from_pretrained. For example, "bert-base-uncased".
-        default: "colbert-ir/colbertv2.0"
-
-    To download the package, run:
-        `pip install transformers`
-    You may need to install pytorch as well - https://pytorch.org/get-started/locally/
-    """
-
-    name: str = "colbert-ir/colbertv2.0"
-    _tokenizer: Any = PrivateAttr()
-    _model: Any = PrivateAttr()
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._ndims = None
-        transformers = attempt_import_or_raise("transformers")
-        self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.name)
-        self._model = transformers.AutoModel.from_pretrained(self.name)
-
-    if PYDANTIC_VERSION < (2, 0):  # Pydantic 1.x compat
-
-        class Config:
-            keep_untouched = (cached_property,)
-    else:
-        model_config = dict()
-        model_config["ignored_types"] = (cached_property,)
-
-    def ndims(self):
-        self._ndims = self._model.config.hidden_size
-        return self._ndims
-
-    def compute_query_embeddings(self, query: str, *args, **kwargs) -> List[np.array]:
-        return self.compute_source_embeddings(query)
-
-    def compute_source_embeddings(self, texts: TEXT, *args, **kwargs) -> List[np.array]:
-        texts = self.sanitize_input(texts)
-        embedding = []
-        for text in texts:
-            encoding = self._tokenizer(
-                text, return_tensors="pt", padding=True, truncation=True
-            )
-            emb = self._model(**encoding).last_hidden_state.mean(dim=1).squeeze()
-            embedding.append(emb.detach().numpy())
-
-        return embedding
-
-
-@register("colbert")
-class ColbertEmbeddings(TransformersEmbeddingFunction):
-    """
-    An embedding function that uses the colbert model from the huggingface library.
-
-    Parameters
-    ----------
-    name : str
-        The name of the model to use. This should be a model name that can be loaded
-        by transformers.AutoModel.from_pretrained. For example, "bert-base-uncased".
-        default: "colbert-ir/colbertv2.0"
-
-    To download the package, run:
-        `pip install transformers`
-    You may need to install pytorch as well - https://pytorch.org/get-started/locally/
-    """
-
-    name: str = "colbert-ir/colbertv2.0"
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
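The core of the deleted compute_source_embeddings loop is mean-pooling the encoder's last hidden state. A standalone sketch of that pooling, assuming transformers and torch are installed (the model name is the same default the deleted class used):

    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("colbert-ir/colbertv2.0")
    model = AutoModel.from_pretrained("colbert-ir/colbertv2.0")

    def embed(text: str):
        # Tokenize, run the encoder, and average over the sequence dimension,
        # mirroring the deleted loop above.
        encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        hidden = model(**encoding).last_hidden_state  # shape: (1, seq_len, hidden_size)
        return hidden.mean(dim=1).squeeze().detach().numpy()

    print(embed("hello world").shape)  # (768,) for this BERT-based model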
@@ -19,14 +19,15 @@ import sys
 import time
 import urllib.error
 import weakref
-import logging
 from typing import Callable, List, Union
 
 import numpy as np
 import pyarrow as pa
 from lance.vector import vec_to_table
 from retry import retry
 
 from ..util import deprecated, safe_import_pandas
+from ..utils.general import LOGGER
 
 pd = safe_import_pandas()
 
@@ -255,7 +256,7 @@ def retry_with_exponential_backoff(
             )
 
             delay *= exponential_base * (1 + jitter * random.random())
-            logging.info("Retrying in %s seconds...", delay)
+            LOGGER.info(f"Retrying in {delay:.2f} seconds due to {e}")
             time.sleep(delay)
 
     return wrapper
@@ -276,5 +277,5 @@ def url_retrieve(url: str):
 
 
 def api_key_not_found_help(provider):
-    logging.error("Could not find API key for %s", provider)
+    LOGGER.error(f"Could not find API key for {provider}.")
     raise ValueError(f"Please set the {provider.upper()}_API_KEY environment variable.")
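The retry helper touched above grows its sleep by exponential_base * (1 + jitter * random.random()) on every failed attempt and now logs through the shared LOGGER. A minimal standalone sketch of that backoff loop (the function and parameter names here are illustrative, not the library's signature):

    import logging
    import random
    import time

    LOGGER = logging.getLogger("lancedb")


    def call_with_backoff(func, max_retries=5, exponential_base=2.0, jitter=1.0):
        delay = 1.0
        for attempt in range(max_retries):
            try:
                return func()
            except Exception as e:  # the real helper filters on specific error types
                if attempt == max_retries - 1:
                    raise
                # Grow the delay geometrically with a random jitter factor,
                # matching the expression in the hunk above.
                delay *= exponential_base * (1 + jitter * random.random())
                LOGGER.info(f"Retrying in {delay:.2f} seconds due to {e}")
                time.sleep(delay)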
@@ -18,7 +18,6 @@ from concurrent.futures import ThreadPoolExecutor
 from typing import Iterable, List, Optional, Union
 from urllib.parse import urlparse
 
-from cachetools import TTLCache
 import pyarrow as pa
 from overrides import override
 
@@ -30,6 +29,7 @@ from ..table import Table, _sanitize_data
 from ..util import validate_table_name
 from .arrow import to_ipc_binary
 from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
+from .errors import LanceDBClientError
 
 
 class RemoteDBConnection(DBConnection):
@@ -60,7 +60,6 @@ class RemoteDBConnection(DBConnection):
             read_timeout=read_timeout,
         )
         self._request_thread_pool = request_thread_pool
-        self._table_cache = TTLCache(maxsize=10000, ttl=300)
 
     def __repr__(self) -> str:
         return f"RemoteConnect(name={self.db_name})"
@@ -90,7 +89,6 @@ class RemoteDBConnection(DBConnection):
             else:
                 break
             for item in result:
-                self._table_cache[item] = True
                 yield item
 
     @override
@@ -111,10 +109,16 @@ class RemoteDBConnection(DBConnection):
         self._client.mount_retry_adapter_for_table(name)
 
         # check if table exists
-        if self._table_cache.get(name) is None:
+        try:
             self._client.post(f"/v1/table/{name}/describe/")
-            self._table_cache[name] = True
+        except LanceDBClientError as err:
+            if str(err).startswith("Not found"):
+                logging.error(
+                    "Table %s does not exist. Please first call "
+                    "db.create_table(%s, data).",
+                    name,
+                    name,
+                )
         return RemoteTable(self, name)
 
     @override
@@ -263,7 +267,6 @@ class RemoteDBConnection(DBConnection):
             content_type=ARROW_STREAM_CONTENT_TYPE,
         )
 
-        self._table_cache[name] = True
         return RemoteTable(self, name)
 
     @override
@@ -279,7 +282,6 @@ class RemoteDBConnection(DBConnection):
         self._client.post(
             f"/v1/table/{name}/drop/",
         )
-        self._table_cache.pop(name)
 
     async def close(self):
         """Close the connection to the database."""
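The new open_table body swaps a TTL-cache membership test for a describe probe wrapped in try/except, so a missing table is reported immediately instead of being discovered later. A condensed sketch of that control flow, with stubs standing in for the REST client and for LanceDBClientError:

    import logging


    class FakeClientError(Exception):
        """Stand-in for LanceDBClientError in this sketch."""


    class FakeClient:
        def post(self, route):
            raise FakeClientError("Not found: " + route)


    def open_table(client, name):
        try:
            # Probe the describe endpoint instead of consulting a local cache.
            client.post(f"/v1/table/{name}/describe/")
        except FakeClientError as err:
            if str(err).startswith("Not found"):
                logging.error(
                    "Table %s does not exist. Please first call db.create_table(%s, data).",
                    name,
                    name,
                )
        return name  # the real method returns RemoteTable(self, name) regardless


    open_table(FakeClient(), "my_table")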
@@ -499,11 +499,11 @@ class RemoteTable(Table):
         )
 
     def count_rows(self, filter: Optional[str] = None) -> int:
-        payload = {"predicate": filter}
-        resp = self._conn._client.post(
-            f"/v1/table/{self._name}/count_rows/", data=payload
-        )
-        return resp
+        # payload = {"filter": filter}
+        # self._conn._client.post(f"/v1/table/{self._name}/count_rows/", data=payload)
+        return NotImplementedError(
+            "count_rows() is not yet supported on the LanceDB cloud"
+        )
 
     def add_columns(self, transforms: Dict[str, str]):
         raise NotImplementedError(
@@ -53,6 +53,7 @@ from .util import (
     safe_import_polars,
     value_to_sql,
 )
+from .utils.events import register_event
 
 if TYPE_CHECKING:
     import PIL
@@ -95,9 +96,6 @@ def _sanitize_data(
             data.data.to_batches(), schema, metadata, on_bad_vectors, fill_value
         )
 
-    if isinstance(data, LanceModel):
-        raise ValueError("Cannot add a single LanceModel to a table. Use a list.")
-
     if isinstance(data, list):
         # convert to list of dict if data is a bunch of LanceModels
         if isinstance(data[0], LanceModel):
@@ -909,6 +907,7 @@ class LanceTable(Table):
                 f"Table {name} does not exist."
                 f"Please first call db.create_table({name}, data)"
             )
+        register_event("open_table")
 
         return tbl
 
@@ -1152,6 +1151,7 @@ class LanceTable(Table):
             accelerator=accelerator,
             index_cache_size=index_cache_size,
         )
+        register_event("create_index")
 
     def create_scalar_index(self, column: str, *, replace: bool = True):
         self._dataset_mut.create_scalar_index(
@@ -1211,6 +1211,7 @@ class LanceTable(Table):
             ordering_fields=ordering_field_names,
             writer_heap_size=writer_heap_size,
         )
+        register_event("create_fts_index")
 
     def _get_fts_index_path(self):
         return join_uri(self._dataset_uri, "_indices", "tantivy")
@@ -1258,6 +1259,7 @@ class LanceTable(Table):
         self._ref.dataset = lance.write_dataset(
             data, self._dataset_uri, schema=self.schema, mode=mode
         )
+        register_event("add")
 
     def merge(
         self,
@@ -1320,6 +1322,7 @@ class LanceTable(Table):
         self._ref.dataset = self._dataset_mut.merge(
             other_table, left_on=left_on, right_on=right_on, schema=schema
         )
+        register_event("merge")
 
     @cached_property
     def embedding_functions(self) -> dict:
@@ -1406,14 +1409,8 @@ class LanceTable(Table):
             vector and the returned vector.
         """
         if vector_column_name is None and query is not None:
-            try:
-                vector_column_name = inf_vector_column_query(self.schema)
-            except Exception as e:
-                if query_type == "fts":
-                    vector_column_name = ""
-                else:
-                    raise e
+            vector_column_name = inf_vector_column_query(self.schema)
+        register_event("search_table")
 
         return LanceQueryBuilder.create(
             self,
             query,
@@ -1540,6 +1537,7 @@ class LanceTable(Table):
         if data is not None:
             new_table.add(data)
 
+        register_event("create_table")
         return new_table
 
     def delete(self, where: str):
@@ -1598,6 +1596,7 @@ class LanceTable(Table):
         values_sql = {k: value_to_sql(v) for k, v in values.items()}
 
         self._dataset_mut.update(values_sql, where)
+        register_event("update")
 
     def _execute_query(
         self, query: Query, batch_size: Optional[int] = None
@@ -2114,6 +2113,7 @@ class AsyncTable:
         if isinstance(data, pa.Table):
             data = pa.RecordBatchReader.from_batches(data.schema, data.to_batches())
         await self._inner.add(data, mode)
+        register_event("add")
 
     def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
         """
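Every hunk in this file follows the same instrumentation pattern: perform the operation, then call register_event with the operation's name, so the telemetry call can never affect the result. A toy sketch of that shape (register_event here is a stub; the real one is defined in utils/events.py below):

    def register_event(name: str, **params):
        # Stub: the real register_event batches events and posts them on a
        # background thread, wrapped in @TryExcept so it can never raise.
        print(f"event: {name} {params}")


    class ToyTable:
        def __init__(self):
            self.rows = []

        def add(self, data):
            self.rows.extend(data)   # do the actual work first...
            register_event("add")    # ...then record that it happened


    ToyTable().add([{"x": 1}])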
python/python/lancedb/utils/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+# Copyright 2023 LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .config import Config
+
+CONFIG = Config()
python/python/lancedb/utils/config.py (new file, 118 lines)
@@ -0,0 +1,118 @@
+# Copyright 2023 LanceDB Developers
+# [Apache-2.0 license header omitted; identical to utils/__init__.py above]
+
+import copy
+import hashlib
+import os
+import platform
+import uuid
+from pathlib import Path
+
+from .general import LOGGER, is_dir_writeable, yaml_load, yaml_save
+
+
+def get_user_config_dir(sub_dir="lancedb"):
+    """
+    Get the user config directory.
+
+    Args:
+        sub_dir (str): The name of the subdirectory to create.
+
+    Returns:
+        (Path): The path to the user config directory.
+    """
+    # Return the appropriate config directory for each operating system
+    if platform.system() == "Windows":
+        path = Path.home() / "AppData" / "Roaming" / sub_dir
+    elif platform.system() == "Darwin":
+        path = Path.home() / "Library" / "Application Support" / sub_dir
+    elif platform.system() == "Linux":
+        path = Path.home() / ".config" / sub_dir
+    else:
+        raise ValueError(f"Unsupported operating system: {platform.system()}")
+
+    # GCP and AWS lambda fix, only /tmp is writeable
+    if not is_dir_writeable(path.parent):
+        LOGGER.warning(
+            f"WARNING ⚠️ user config directory '{path}' is not writeable, defaulting "
+            "to '/tmp' or CWD. Alternatively you can define a LANCEDB_CONFIG_DIR "
+            "environment variable for this path."
+        )
+        path = (
+            Path("/tmp") / sub_dir
+            if is_dir_writeable("/tmp")
+            else Path().cwd() / sub_dir
+        )
+
+    # Create the subdirectory if it does not exist
+    path.mkdir(parents=True, exist_ok=True)
+
+    return path
+
+
+USER_CONFIG_DIR = Path(os.getenv("LANCEDB_CONFIG_DIR") or get_user_config_dir())
+CONFIG_FILE = USER_CONFIG_DIR / "config.yaml"
+
+
+class Config(dict):
+    """
+    Manages lancedb config stored in a YAML file.
+
+    Args:
+        file (str | Path): Path to the lancedb config YAML file. Default is
+            USER_CONFIG_DIR / 'config.yaml'.
+    """
+
+    def __init__(self, file=CONFIG_FILE):
+        self.file = Path(file)
+        self.defaults = {  # Default global config values
+            "diagnostics": True,
+            "uuid": hashlib.sha256(str(uuid.getnode()).encode()).hexdigest(),
+        }
+
+        super().__init__(copy.deepcopy(self.defaults))
+
+        if not self.file.exists():
+            self.save()
+
+        self.load()
+        correct_keys = self.keys() == self.defaults.keys()
+        correct_types = all(
+            type(a) is type(b) for a, b in zip(self.values(), self.defaults.values())
+        )
+        if not (correct_keys and correct_types):
+            LOGGER.warning(
+                "WARNING ⚠️ LanceDB settings reset to default values. This may be due "
+                "to a possible problem with your settings or a recent package update. "
+                f"\nView settings & usage with 'lancedb settings' or at '{self.file}'"
+            )
+            self.reset()
+
+    def load(self):
+        """Loads settings from the YAML file."""
+        super().update(yaml_load(self.file))
+
+    def save(self):
+        """Saves the current settings to the YAML file."""
+        yaml_save(self.file, dict(self))
+
+    def update(self, *args, **kwargs):
+        """Updates a setting value in the current settings."""
+        super().update(*args, **kwargs)
+        self.save()
+
+    def reset(self):
+        """Resets the settings to default and saves them."""
+        self.clear()
+        self.update(self.defaults)
+        self.save()
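A short usage sketch for the new Config class: it behaves like a dict backed by config.yaml in the platform config directory resolved above, and every update is written straight back to disk. diagnostics and uuid are the only default keys:

    from lancedb.utils import CONFIG

    print(CONFIG["diagnostics"])       # True by default
    CONFIG.update(diagnostics=False)   # persisted to config.yaml immediately
    CONFIG.reset()                     # restore the defaults and save them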
python/python/lancedb/utils/events.py (new file, 171 lines)
@@ -0,0 +1,171 @@
+# Copyright 2023 LanceDB Developers
+# [Apache-2.0 license header omitted; identical to utils/__init__.py above]
+
+import datetime
+import importlib.metadata
+import platform
+import random
+import sys
+import time
+
+from lancedb.utils import CONFIG
+from lancedb.utils.general import TryExcept
+
+from .general import (
+    PLATFORMS,
+    get_git_origin_url,
+    is_git_dir,
+    is_github_actions_ci,
+    is_online,
+    is_pip_package,
+    is_pytest_running,
+    threaded_request,
+)
+
+
+class _Events:
+    """
+    A class for collecting anonymous event analytics. Event analytics are enabled when
+    ``diagnostics=True`` in config and disabled when ``diagnostics=False``.
+
+    You can enable or disable diagnostics by running ``lancedb diagnostics --enabled``
+    or ``lancedb diagnostics --disabled``.
+
+    Attributes
+    ----------
+    url : str
+        The URL to send anonymous events.
+    rate_limit : float
+        The rate limit in seconds for sending events.
+    metadata : dict
+        A dictionary containing metadata about the environment.
+    enabled : bool
+        A flag to enable or disable Events based on certain conditions.
+    """
+
+    _instance = None
+
+    url = "https://app.posthog.com/capture/"
+    headers = {"Content-Type": "application/json"}
+    api_key = "phc_oENDjGgHtmIDrV6puUiFem2RB4JA8gGWulfdulmMdZP"
+    # This api-key is write only and is safe to expose in the codebase.
+
+    def __init__(self):
+        """
+        Initializes the Events object with default values for events, rate_limit,
+        and metadata.
+        """
+        self.events = []  # events list
+        self.throttled_event_names = ["search_table"]
+        self.throttled_events = set()
+        self.max_events = 5  # max events to store in memory
+        self.rate_limit = 60.0 * 60.0  # rate limit (seconds)
+        self.time = 0.0
+
+        if is_git_dir():
+            install = "git"
+        elif is_pip_package():
+            install = "pip"
+        else:
+            install = "other"
+        self.metadata = {
+            "cli": sys.argv[0],
+            "install": install,
+            "python": ".".join(platform.python_version_tuple()[:2]),
+            "version": importlib.metadata.version("lancedb"),
+            "platforms": PLATFORMS,
+            "session_id": round(random.random() * 1e15),
+            # TODO: In future we might be interested in this metric
+            # 'engagement_time_msec': 1000
+        }
+
+        TESTS_RUNNING = is_pytest_running() or is_github_actions_ci()
+        ONLINE = is_online()
+        self.enabled = (
+            CONFIG["diagnostics"]
+            and not TESTS_RUNNING
+            and ONLINE
+            and (
+                is_pip_package()
+                or get_git_origin_url() == "https://github.com/lancedb/lancedb.git"
+            )
+        )
+
+    def __call__(self, event_name, params={}):
+        """
+        Attempts to add a new event to the events list and send events if the rate
+        limit is reached.
+
+        Args
+        ----
+        event_name : str
+            The name of the event to be logged.
+        params : dict, optional
+            A dictionary of additional parameters to be logged with the event.
+        """
+        ### NOTE: We might need a way to tag a session with a label to check usage
+        ### from a source. Setting label should be exposed to the user.
+        if not self.enabled:
+            return
+        if (
+            len(self.events) < self.max_events
+        ):  # Events list limited to self.max_events (drop any events past this)
+            params.update(self.metadata)
+            event = {
+                "event": event_name,
+                "properties": params,
+                "timestamp": datetime.datetime.now(
+                    tz=datetime.timezone.utc
+                ).isoformat(),
+                "distinct_id": CONFIG["uuid"],
+            }
+            if event_name not in self.throttled_event_names:
+                self.events.append(event)
+            elif event_name not in self.throttled_events:
+                self.throttled_events.add(event_name)
+                self.events.append(event)
+
+        # Check rate limit
+        t = time.time()
+        if (t - self.time) < self.rate_limit:
+            return
+        # Time is over rate limiter, send now
+        data = {
+            "api_key": self.api_key,
+            "distinct_id": CONFIG["uuid"],  # posthog needs this to accept the event
+            "batch": self.events,
+        }
+        # POST equivalent to requests.post(self.url, json=data).
+        # threaded request is used to avoid blocking, retries are disabled, and
+        # verbose is disabled to avoid any possible disruption in the console.
+        threaded_request(
+            method="post",
+            url=self.url,
+            headers=self.headers,
+            json=data,
+            retry=0,
+            verbose=False,
+        )
+
+        # Flush & Reset
+        self.events = []
+        self.throttled_events = set()
+        self.time = t
+
+
+@TryExcept(verbose=False)
+def register_event(name: str, **kwargs):
+    if _Events._instance is None:
+        _Events._instance = _Events()
+
+    _Events._instance(name, **kwargs)
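In practice the class above buffers at most five events (throttled names such as search_table at most once per flush window) and POSTs the whole batch only once the hour-long rate limit has elapsed. A trimmed sketch of just that gating logic, with the network call elided:

    import time


    class EventBuffer:
        def __init__(self, max_events=5, rate_limit=3600.0):
            self.events = []
            self.throttled_names = {"search_table"}  # at most one per flush window
            self.seen_throttled = set()
            self.max_events = max_events
            self.rate_limit = rate_limit
            self.last_flush = 0.0

        def __call__(self, name):
            if len(self.events) < self.max_events:
                if name not in self.throttled_names:
                    self.events.append(name)
                elif name not in self.seen_throttled:
                    self.seen_throttled.add(name)
                    self.events.append(name)
            if time.time() - self.last_flush < self.rate_limit:
                return  # still inside the window: keep buffering
            # ...send self.events in a single request here, then reset...
            self.events, self.seen_throttled = [], set()
            self.last_flush = time.time()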
python/python/lancedb/utils/general.py (new file, 454 lines)
@@ -0,0 +1,454 @@
+# Copyright 2023 LanceDB Developers
+# [Apache-2.0 license header omitted; identical to utils/__init__.py above]
+
+import contextlib
+import importlib
+import logging.config
+import os
+import platform
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+from typing import Union
+
+import requests
+import yaml
+
+LOGGING_NAME = "lancedb"
+VERBOSE = (
+    str(os.getenv("LANCEDB_VERBOSE", True)).lower() == "true"
+)  # global verbose mode
+
+
+def set_logging(name=LOGGING_NAME, verbose=True):
+    """Sets up logging for the given name.
+
+    Parameters
+    ----------
+    name : str, optional
+        The name of the logger. Default is 'lancedb'.
+    verbose : bool, optional
+        Whether to enable verbose logging. Default is True.
+    """
+    rank = int(os.getenv("RANK", -1))  # rank in world for Multi-GPU trainings
+    level = logging.INFO if verbose and rank in {-1, 0} else logging.ERROR
+    logging.config.dictConfig(
+        {
+            "version": 1,
+            "disable_existing_loggers": False,
+            "formatters": {name: {"format": "%(message)s"}},
+            "handlers": {
+                name: {
+                    "class": "logging.StreamHandler",
+                    "formatter": name,
+                    "level": level,
+                }
+            },
+            "loggers": {name: {"level": level, "handlers": [name], "propagate": False}},
+        }
+    )
+
+
+set_logging(LOGGING_NAME, verbose=VERBOSE)
+LOGGER = logging.getLogger(LOGGING_NAME)
+
+
+def is_pip_package(filepath: str = __name__) -> bool:
+    """Determines if the file at the given filepath is part of a pip package.
+
+    Parameters
+    ----------
+    filepath : str, optional
+        The filepath to check. Default is the current file.
+
+    Returns
+    -------
+    bool
+        True if the file is part of a pip package, False otherwise.
+    """
+    # Get the spec for the module
+    spec = importlib.util.find_spec(filepath)
+
+    # Return whether the spec is not None and the origin is not None (indicating
+    # it is a package)
+    return spec is not None and spec.origin is not None
+
+
+def is_pytest_running():
+    """Determines whether pytest is currently running or not.
+
+    Returns
+    -------
+    bool
+        True if pytest is running, False otherwise.
+    """
+    return (
+        ("PYTEST_CURRENT_TEST" in os.environ)
+        or ("pytest" in sys.modules)
+        or ("pytest" in Path(sys.argv[0]).stem)
+    )
+
+
+def is_github_actions_ci() -> bool:
+    """
+    Determine if the current environment is a GitHub Actions CI Python runner.
+
+    Returns
+    -------
+    bool
+        True if the current environment is a GitHub Actions CI Python runner,
+        False otherwise.
+    """
+    return (
+        "GITHUB_ACTIONS" in os.environ
+        and "RUNNER_OS" in os.environ
+        and "RUNNER_TOOL_CACHE" in os.environ
+    )
+
+
+def is_git_dir():
+    """
+    Determines whether the current file is part of a git repository.
+    If the current file is not part of a git repository, returns None.
+
+    Returns
+    -------
+    bool
+        True if current file is part of a git repository.
+    """
+    return get_git_dir() is not None
+
+
+def is_online() -> bool:
+    """
+    Check internet connectivity by attempting to connect to a known online host.
+
+    Returns
+    -------
+    bool
+        True if connection is successful, False otherwise.
+    """
+    import socket
+
+    for host in "1.1.1.1", "8.8.8.8", "223.5.5.5":  # Cloudflare, Google, AliDNS
+        try:
+            test_connection = socket.create_connection(address=(host, 53), timeout=2)
+        except (socket.timeout, socket.gaierror, OSError):  # noqa: PERF203
+            continue
+        else:
+            # If the connection was successful, close it to avoid a ResourceWarning
+            test_connection.close()
+            return True
+    return False
+
+
+def is_dir_writeable(dir_path: Union[str, Path]) -> bool:
+    """Check if a directory is writeable.
+
+    Parameters
+    ----------
+    dir_path : Union[str, Path]
+        The path to the directory.
+
+    Returns
+    -------
+    bool
+        True if the directory is writeable, False otherwise.
+    """
+    return os.access(str(dir_path), os.W_OK)
+
+
+def is_colab():
+    """Check if the current script is running inside a Google Colab notebook.
+
+    Returns
+    -------
+    bool
+        True if running inside a Colab notebook, False otherwise.
+    """
+    return "COLAB_RELEASE_TAG" in os.environ or "COLAB_BACKEND_VERSION" in os.environ
+
+
+def is_kaggle():
+    """Check if the current script is running inside a Kaggle kernel.
+
+    Returns
+    -------
+    bool
+        True if running inside a Kaggle kernel, False otherwise.
+    """
+    return (
+        os.environ.get("PWD") == "/kaggle/working"
+        and os.environ.get("KAGGLE_URL_BASE") == "https://www.kaggle.com"
+    )
+
+
+def is_jupyter():
+    """Check if the current script is running inside a Jupyter Notebook.
+
+    Returns
+    -------
+    bool
+        True if running inside a Jupyter Notebook, False otherwise.
+    """
+    with contextlib.suppress(Exception):
+        from IPython import get_ipython
+
+        return get_ipython() is not None
+    return False
+
+
+def is_docker() -> bool:
+    """Determine if the script is running inside a Docker container.
+
+    Returns
+    -------
+    bool
+        True if the script is running inside a Docker container, False otherwise.
+    """
+    file = Path("/proc/self/cgroup")
+    if file.exists():
+        with open(file) as f:
+            return "docker" in f.read()
+    else:
+        return False
+
+
+def get_git_dir():
+    """Determine whether the current file is part of a git repository and if so,
+    returns the repository root directory.
+    If the current file is not part of a git repository, returns None.
+
+    Returns
+    -------
+    Path | None
+        Git root directory if found or None if not found.
+    """
+    for d in Path(__file__).parents:
+        if (d / ".git").is_dir():
+            return d
+
+
+def get_git_origin_url():
+    """Retrieve the origin URL of a git repository.
+
+    Returns
+    -------
+    str | None
+        The origin URL of the git repository or None if not a git directory.
+    """
+    if is_git_dir():
+        with contextlib.suppress(subprocess.CalledProcessError):
+            origin = subprocess.check_output(
+                ["git", "config", "--get", "remote.origin.url"]
+            )
+            return origin.decode().strip()
+
+
+def yaml_save(file="data.yaml", data=None, header=""):
+    """Save YAML data to a file.
+
+    Parameters
+    ----------
+    file : str, optional
+        File name, by default 'data.yaml'.
+    data : dict, optional
+        Data to save in YAML format, by default None.
+    header : str, optional
+        YAML header to add, by default "".
+    """
+    if data is None:
+        data = {}
+    file = Path(file)
+    if not file.parent.exists():
+        # Create parent directories if they don't exist
+        file.parent.mkdir(parents=True, exist_ok=True)
+
+    # Convert Path objects to strings
+    for k, v in data.items():
+        if isinstance(v, Path):
+            data[k] = str(v)
+
+    # Dump data to file in YAML format
+    with open(file, "w", errors="ignore", encoding="utf-8") as f:
+        if header:
+            f.write(header)
+        yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True)
+
+
+def yaml_load(file="data.yaml", append_filename=False):
+    """
+    Load YAML data from a file.
+
+    Parameters
+    ----------
+    file : str, optional
+        File name. Default is 'data.yaml'.
+    append_filename : bool, optional
+        Add the YAML filename to the YAML dictionary. Default is False.
+
+    Returns
+    -------
+    dict
+        YAML data and file name.
+    """
+    assert Path(file).suffix in (
+        ".yaml",
+        ".yml",
+    ), f"Attempting to load non-YAML file {file} with yaml_load()"
+    with open(file, errors="ignore", encoding="utf-8") as f:
+        s = f.read()  # string
+
+        # Add YAML filename to dict and return
+        data = (
+            yaml.safe_load(s) or {}
+        )  # always return a dict (yaml.safe_load() may return None for empty files)
+        if append_filename:
+            data["yaml_file"] = str(file)
+        return data
+
+
+def yaml_print(yaml_file: Union[str, Path, dict]) -> None:
+    """
+    Pretty prints a YAML file or a YAML-formatted dictionary.
+
+    Parameters
+    ----------
+    yaml_file : Union[str, Path, dict]
+        The file path of the YAML file or a YAML-formatted dictionary.
+
+    Returns
+    -------
+    None
+    """
+    yaml_dict = (
+        yaml_load(yaml_file) if isinstance(yaml_file, (str, Path)) else yaml_file
+    )
+    dump = yaml.dump(yaml_dict, sort_keys=False, allow_unicode=True)
+    LOGGER.info("Printing '%s'\n\n%s", yaml_file, dump)
+
+
+PLATFORMS = [platform.system()]
+if is_colab():
+    PLATFORMS.append("Colab")
+if is_kaggle():
+    PLATFORMS.append("Kaggle")
+if is_jupyter():
+    PLATFORMS.append("Jupyter")
+if is_docker():
+    PLATFORMS.append("Docker")
+
+PLATFORMS = "|".join(PLATFORMS)
+
+
+class TryExcept(contextlib.ContextDecorator):
+    """
+    TryExcept context manager.
+    Usage: @TryExcept() decorator or 'with TryExcept():' context manager.
+    """
+
+    def __init__(self, msg="", verbose=True):
+        """
+        Parameters
+        ----------
+        msg : str, optional
+            Custom message to display in case of exception, by default "".
+        verbose : bool, optional
+            Whether to display the message, by default True.
+        """
+        self.msg = msg
+        self.verbose = verbose
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, value, traceback):
+        if self.verbose and value:
+            LOGGER.info("%s%s%s", self.msg, ": " if self.msg else "", value)
+        return True
+
+
+def threaded_request(
+    method, url, retry=3, timeout=30, thread=True, code=-1, verbose=True, **kwargs
+):
+    """
+    Makes an HTTP request using the 'requests' library, with exponential backoff
+    retries up to a specified timeout.
+
+    Parameters
+    ----------
+    method : str
+        The HTTP method to use for the request. Choices are 'post' and 'get'.
+    url : str
+        The URL to make the request to.
+    retry : int, optional
+        Number of retries to attempt before giving up, by default 3.
+    timeout : int, optional
+        Timeout in seconds after which the function will give up retrying,
+        by default 30.
+    thread : bool, optional
+        Whether to execute the request in a separate daemon thread, by default True.
+    code : int, optional
+        An identifier for the request, used for logging purposes, by default -1.
+    verbose : bool, optional
+        A flag to determine whether to print out to console or not, by default True.
+
+    Returns
+    -------
+    requests.Response
+        The HTTP response object. If the request is executed in a separate thread,
+        returns the thread itself.
+    """
+    # retry only these codes TODO: add codes if needed in future (500, 408)
+    retry_codes = ()
+
+    @TryExcept(verbose=verbose)
+    def func(method, url, **kwargs):
+        """Make HTTP requests with retries and timeouts, with optional progress
+        tracking.
+        """
+        response = None
+        t0 = time.time()
+        for i in range(retry + 1):
+            if (time.time() - t0) > timeout:
+                break
+            response = requests.request(method, url, **kwargs)
+            if response.status_code < 300:  # good return codes in the 2xx range
+                break
+            try:
+                m = response.json().get("message", "No JSON message.")
+            except AttributeError:
+                m = "Unable to read JSON."
+            if i == 0:
+                if response.status_code in retry_codes:
+                    m += f" Retrying {retry}x for {timeout}s." if retry else ""
+                elif response.status_code == 429:  # rate limit
+                    m = "Rate limit reached"
+                if verbose:
+                    LOGGER.warning("%s #%s", response.status_code, m)
+                if response.status_code not in retry_codes:
+                    return response
+            time.sleep(2**i)  # exponential standoff
+        return response
+
+    args = method, url
+    if thread:
+        return threading.Thread(
+            target=func, args=args, kwargs=kwargs, daemon=True
+        ).start()
+    else:
+        return func(*args, **kwargs)
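Two of the helpers above in action. TryExcept is a ContextDecorator whose __exit__ returns True, so decorated functions log exceptions instead of raising them (this is why register_event can never crash user code), and threaded_request with thread=True fires the request on a daemon thread and returns immediately. The URL is illustrative:

    from lancedb.utils.general import TryExcept, threaded_request


    @TryExcept(msg="telemetry failed", verbose=True)
    def flaky():
        raise RuntimeError("boom")


    flaky()  # logs "telemetry failed: boom" via LOGGER and keeps going

    # Fire-and-forget POST; returns None immediately because the work
    # happens on a daemon thread.
    threaded_request(
        method="post",
        url="https://example.com/ingest",
        json={"ping": 1},
        retry=0,
        verbose=False,
    )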
python/python/lancedb/utils/sentry_log.py (new file, 119 lines)
@@ -0,0 +1,119 @@
+# Copyright 2023 LanceDB Developers
+# [Apache-2.0 license header omitted; identical to utils/__init__.py above]
+
+import bdb
+import importlib.metadata
+import logging
+import sys
+from pathlib import Path
+
+from lancedb.utils import CONFIG
+
+from .general import (
+    PLATFORMS,
+    TryExcept,
+    is_git_dir,
+    is_github_actions_ci,
+    is_online,
+    is_pip_package,
+    is_pytest_running,
+)
+
+
+@TryExcept(verbose=False)
+def set_sentry():
+    """
+    Initialize the Sentry SDK for error tracking and reporting. Only used if
+    sentry_sdk package is installed and sync=True in settings. Run 'lancedb settings'
+    to see and update settings YAML file.
+
+    Conditions required to send errors (ALL conditions must be met or no errors will
+    be reported):
+        - sentry_sdk package is installed
+        - sync=True in settings
+        - pytest is not running
+        - running in a pip package installation
+        - running in a non-git directory
+        - online environment
+
+    The function also configures Sentry SDK to ignore KeyboardInterrupt and
+    FileNotFoundError exceptions for now.
+
+    Additionally, the function sets custom tags and user information for Sentry
+    events.
+    """
+
+    def before_send(event, hint):
+        """
+        Modify the event before sending it to Sentry based on specific exception
+        types and messages.
+
+        Args:
+            event (dict): The event dictionary containing information about the error.
+            hint (dict): A dictionary containing additional information about
+                the error.
+
+        Returns:
+            dict: The modified event or None if the event should not be sent
+                to Sentry.
+        """
+        if "exc_info" in hint:
+            exc_type, exc_value, tb = hint["exc_info"]
+            ignored_errors = ["out of memory", "no space left on device", "testing"]
+            if any(error in str(exc_value).lower() for error in ignored_errors):
+                return None
+
+        if is_git_dir():
+            install = "git"
+        elif is_pip_package():
+            install = "pip"
+        else:
+            install = "other"
+
+        event["tags"] = {
+            "sys_argv": sys.argv[0],
+            "sys_argv_name": Path(sys.argv[0]).name,
+            "install": install,
+            "platforms": PLATFORMS,
+            "version": importlib.metadata.version("lancedb"),
+        }
+        return event
+
+    TESTS_RUNNING = is_pytest_running() or is_github_actions_ci()
+    ONLINE = is_online()
+    if CONFIG["diagnostics"] and not TESTS_RUNNING and ONLINE and is_pip_package():
+        # and not is_git_dir():  # not running inside a git dir. Maybe too restrictive?
+
+        # If sentry_sdk package is not installed then return and do not use Sentry
+        try:
+            import sentry_sdk  # noqa
+        except ImportError:
+            return
+
+        sentry_sdk.init(
+            dsn="https://c63ef8c64e05d1aa1a96513361f3ca2f@o4505950840946688.ingest.sentry.io/4505950933614592",
+            debug=False,
+            include_local_variables=False,
+            traces_sample_rate=0.5,
+            environment="production",  # 'dev' or 'production'
+            before_send=before_send,
+            ignore_errors=[KeyboardInterrupt, FileNotFoundError, bdb.BdbQuit],
+        )
+        sentry_sdk.set_user({"id": CONFIG["uuid"]})  # SHA-256 anonymized UUID hash
+
+        # Disable all sentry logging
+        for logger in "sentry_sdk", "sentry_sdk.errors":
+            logging.getLogger(logger).setLevel(logging.CRITICAL)
+
+
+set_sentry()
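before_send above is sentry_sdk's standard hook for filtering and tagging events: returning None drops the report, returning the (possibly modified) event sends it. A minimal standalone sketch of that contract:

    def before_send(event, hint):
        # Drop reports for errors we consider environmental; tag and keep the rest.
        if "exc_info" in hint:
            _, exc_value, _ = hint["exc_info"]
            if "out of memory" in str(exc_value).lower():
                return None  # suppressed: never reaches Sentry
        event.setdefault("tags", {})["source"] = "sketch"
        return event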
python/python/tests/test_cli.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+from click.testing import CliRunner
+from lancedb.cli.cli import cli
+from lancedb.utils import CONFIG
+
+
+def test_entry():
+    runner = CliRunner()
+    result = runner.invoke(cli)
+    assert result.exit_code == 0  # Main check
+    assert "lancedb" in result.output.lower()  # lazy check
+
+
+def test_diagnostics():
+    runner = CliRunner()
+    result = runner.invoke(cli, ["diagnostics", "--disabled"])
+    assert result.exit_code == 0  # Main check
+    assert not CONFIG["diagnostics"]
+
+    result = runner.invoke(cli, ["diagnostics", "--enabled"])
+    assert result.exit_code == 0  # Main check
+    assert CONFIG["diagnostics"]
+
+
+def test_config():
+    runner = CliRunner()
+    result = runner.invoke(cli, ["config"])
+    assert result.exit_code == 0  # Main check
+    cfg = CONFIG.copy()
+    cfg.pop("uuid")
+    for item in cfg:  # check for keys only as formatting is subject to change
+        assert item in result.output
@@ -28,25 +28,13 @@ def test_basic(tmp_path):
     assert db.uri == str(tmp_path)
     assert db.table_names() == []
 
-    class SimpleModel(LanceModel):
-        item: str
-        price: float
-        vector: Vector(2)
-
     table = db.create_table(
         "test",
         data=[
             {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
             {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
         ],
-        schema=SimpleModel,
     )
 
-    with pytest.raises(
-        ValueError, match="Cannot add a single LanceModel to a table. Use a list."
-    ):
-        table.add(SimpleModel(item="baz", price=30.0, vector=[1.0, 2.0]))
-
     rs = table.search([100, 100]).limit(1).to_pandas()
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "bar"
@@ -55,11 +43,6 @@ def test_basic(tmp_path):
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "foo"
 
-    table.create_fts_index(["item"])
-    rs = table.search("bar", query_type="fts").to_pandas()
-    assert len(rs) == 1
-    assert rs["item"].iloc[0] == "bar"
-
     assert db.table_names() == ["test"]
     assert "test" in db
     assert len(db) == 1
@@ -45,7 +45,7 @@ except Exception:
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("alias", ["sentence-transformers", "openai", "huggingface"])
+@pytest.mark.parametrize("alias", ["sentence-transformers", "openai"])
 def test_basic_text_embeddings(alias, tmp_path):
     db = lancedb.connect(tmp_path)
     registry = get_registry()
@@ -84,7 +84,7 @@ def test_basic_text_embeddings(alias, tmp_path):
         )
     )
 
-    query = "greeting"
+    query = "greetings"
    actual = (
         table.search(query, vector_column_name="vector").limit(1).to_pydantic(Words)[0]
     )
@@ -184,9 +184,9 @@ def test_imagebind(tmp_path):
     import shutil
     import tempfile
 
+    import lancedb.embeddings.imagebind
     import pandas as pd
     import requests
 
     from lancedb.embeddings import get_registry
     from lancedb.pydantic import LanceModel, Vector
 
@@ -321,6 +321,8 @@ def test_gemini_embedding(tmp_path):
     )
 @pytest.mark.slow
 def test_gte_embedding(tmp_path):
+    import lancedb.embeddings.gte
+
     model = get_registry().get("gte-text").create()
 
     class TextModel(LanceModel):
@@ -1,158 +0,0 @@
-# Copyright 2024 Lance Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import asyncio
-import copy
-
-import pytest
-import pyarrow as pa
-import lancedb
-
-
-# These are all keys that are accepted by storage_options
-CONFIG = {
-    "allow_http": "true",
-    "aws_access_key_id": "ACCESSKEY",
-    "aws_secret_access_key": "SECRETKEY",
-    "aws_endpoint": "http://localhost:4566",
-    "aws_region": "us-east-1",
-}
-
-
-def get_boto3_client(*args, **kwargs):
-    import boto3
-
-    return boto3.client(
-        *args,
-        region_name=CONFIG["aws_region"],
-        aws_access_key_id=CONFIG["aws_access_key_id"],
-        aws_secret_access_key=CONFIG["aws_secret_access_key"],
-        **kwargs,
-    )
-
-
-@pytest.fixture(scope="module")
-def s3_bucket():
-    s3 = get_boto3_client("s3", endpoint_url=CONFIG["aws_endpoint"])
-    bucket_name = "lance-integtest"
-    # if bucket exists, delete it
-    try:
-        delete_bucket(s3, bucket_name)
-    except s3.exceptions.NoSuchBucket:
-        pass
-    s3.create_bucket(Bucket=bucket_name)
-    yield bucket_name
-
-    delete_bucket(s3, bucket_name)
-
-
-def delete_bucket(s3, bucket_name):
-    # Delete all objects first
-    for obj in s3.list_objects(Bucket=bucket_name).get("Contents", []):
-        s3.delete_object(Bucket=bucket_name, Key=obj["Key"])
-    s3.delete_bucket(Bucket=bucket_name)
-
-
-@pytest.mark.s3_test
-def test_s3_lifecycle(s3_bucket: str):
-    storage_options = copy.copy(CONFIG)
-
-    uri = f"s3://{s3_bucket}/test_lifecycle"
-    data = pa.table({"x": [1, 2, 3]})
-
-    async def test():
-        db = await lancedb.connect_async(uri, storage_options=storage_options)
-
-        table = await db.create_table("test", schema=data.schema)
-        assert await table.count_rows() == 0
-
-        table = await db.create_table("test", data, mode="overwrite")
-        assert await table.count_rows() == 3
-
-        await table.add(data, mode="append")
-        assert await table.count_rows() == 6
-
-        table = await db.open_table("test")
-        assert await table.count_rows() == 6
-
-        await db.drop_table("test")
-
-        await db.drop_database()
-
-    asyncio.run(test())
-
-
-@pytest.fixture()
-def kms_key():
-    kms = get_boto3_client("kms", endpoint_url=CONFIG["aws_endpoint"])
-    key_id = kms.create_key()["KeyMetadata"]["KeyId"]
-    yield key_id
-    kms.schedule_key_deletion(KeyId=key_id, PendingWindowInDays=7)
-
-
-def validate_objects_encrypted(bucket: str, path: str, kms_key: str):
-    s3 = get_boto3_client("s3", endpoint_url=CONFIG["aws_endpoint"])
-    objects = s3.list_objects_v2(Bucket=bucket, Prefix=path)["Contents"]
-    for obj in objects:
-        info = s3.head_object(Bucket=bucket, Key=obj["Key"])
-        assert info["ServerSideEncryption"] == "aws:kms", (
-            "object %s not encrypted" % obj["Key"]
-        )
-        assert info["SSEKMSKeyId"].endswith(kms_key), (
-            "object %s not encrypted with correct key" % obj["Key"]
-        )
-
-
-@pytest.mark.s3_test
-def test_s3_sse(s3_bucket: str, kms_key: str):
-    storage_options = copy.copy(CONFIG)
-
-    uri = f"s3://{s3_bucket}/test_lifecycle"
-    data = pa.table({"x": [1, 2, 3]})
-
-    async def test():
-        # Create a table with SSE
-        db = await lancedb.connect_async(uri, storage_options=storage_options)
-
-        table = await db.create_table(
-            "table1",
-            schema=data.schema,
-            storage_options={
-                "aws_server_side_encryption": "aws:kms",
-                "aws_sse_kms_key_id": kms_key,
-            },
-        )
-        await table.add(data)
-        await table.update({"x": "1"})
-
-        path = "test_lifecycle/table1.lance"
-        validate_objects_encrypted(s3_bucket, path, kms_key)
-
-        # Test we can set encryption at connection level too.
-        db = await lancedb.connect_async(
-            uri,
-            storage_options=dict(
-                aws_server_side_encryption="aws:kms",
-                aws_sse_kms_key_id=kms_key,
-                **storage_options,
-            ),
-        )
-
-        table = await db.create_table("table2", schema=data.schema)
-        await table.add(data)
-        await table.update({"x": "1"})
-
-        path = "test_lifecycle/table2.lance"
-        validate_objects_encrypted(s3_bucket, path, kms_key)
-
-    asyncio.run(test())
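The deleted file above is the boto3/localstack S3 integration test. Its core usage pattern is worth keeping in view; a condensed sketch, using the same storage_options keys (the bucket URI and KMS key id below are placeholders, and the credentials are the test's localstack dummies):

    import asyncio
    import lancedb
    import pyarrow as pa

    # Connection-level options: credentials, endpoint, region (localstack values).
    options = {
        "allow_http": "true",
        "aws_access_key_id": "ACCESSKEY",
        "aws_secret_access_key": "SECRETKEY",
        "aws_endpoint": "http://localhost:4566",
        "aws_region": "us-east-1",
    }

    async def main():
        db = await lancedb.connect_async("s3://my-bucket/demo", storage_options=options)
        # Table-level options layer over the connection-level ones.
        table = await db.create_table(
            "encrypted",
            schema=pa.schema([pa.field("x", pa.int64())]),
            storage_options={
                "aws_server_side_encryption": "aws:kms",
                "aws_sse_kms_key_id": "<kms-key-id>",  # placeholder
            },
        )
        await table.add(pa.table({"x": [1, 2, 3]}))

    asyncio.run(main())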
python/python/tests/test_telemetry.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+import json
+
+import lancedb
+import pytest
+from lancedb.utils.events import _Events
+
+
+@pytest.fixture(autouse=True)
+def request_log_path(tmp_path):
+    return tmp_path / "request.json"
+
+
+def mock_register_event(name: str, **kwargs):
+    if _Events._instance is None:
+        _Events._instance = _Events()
+
+    _Events._instance.enabled = True
+    _Events._instance.rate_limit = 0
+    _Events._instance(name, **kwargs)
+
+
+def test_event_reporting(monkeypatch, request_log_path, tmp_path) -> None:
+    def mock_request(**kwargs):
+        json_data = kwargs.get("json", {})
+        with open(request_log_path, "w") as f:
+            json.dump(json_data, f)
+
+    monkeypatch.setattr(
+        lancedb.table, "register_event", mock_register_event
+    )  # Force enable registering events and strip exception handling
+    monkeypatch.setattr(lancedb.utils.events, "threaded_request", mock_request)
+
+    db = lancedb.connect(tmp_path)
+    db.create_table(
+        "test",
+        data=[
+            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
+            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
+        ],
+        mode="overwrite",
+    )
+
+    assert request_log_path.exists()  # test if event was registered
+
+    with open(request_log_path, "r") as f:
+        json_data = json.load(f)
+
+    # TODO: don't hardcode these here. Instead create a module level json schema in
+    # lancedb.utils.events for better evolvability
+    batch_keys = ["api_key", "distinct_id", "batch"]
+    event_keys = ["event", "properties", "timestamp", "distinct_id"]
+    property_keys = ["cli", "install", "platforms", "version", "session_id"]
+
+    assert all([key in json_data for key in batch_keys])
+    assert all([key in json_data["batch"][0] for key in event_keys])
+    assert all([key in json_data["batch"][0]["properties"] for key in property_keys])
+
+    # cleanup & reset
+    monkeypatch.undo()
+    _Events._instance = None
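Pieced together from the three key lists asserted above, the reported payload has roughly this shape. This is a reconstruction from the assertions, not a documented schema, and the values are elided on purpose:

    payload = {
        "api_key": "...",
        "distinct_id": "...",
        "batch": [
            {
                "event": "...",
                "timestamp": "...",
                "distinct_id": "...",
                "properties": {
                    "cli": "...",
                    "install": "...",
                    "platforms": "...",
                    "version": "...",
                    "session_id": "...",
                },
            }
        ],
    }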
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use std::{collections::HashMap, sync::Arc, time::Duration};
+use std::{sync::Arc, time::Duration};

 use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
 use lancedb::connection::{Connection as LanceConnection, CreateTableMode};

@@ -90,21 +90,19 @@ impl Connection {
         name: String,
         mode: &str,
         data: &PyAny,
-        storage_options: Option<HashMap<String, String>>,
     ) -> PyResult<&'a PyAny> {
         let inner = self_.get_inner()?.clone();

         let mode = Self::parse_create_mode_str(mode)?;

         let batches = ArrowArrayStreamReader::from_pyarrow(data)?;
-        let mut builder = inner.create_table(name, batches).mode(mode);
-
-        if let Some(storage_options) = storage_options {
-            builder = builder.storage_options(storage_options);
-        }
-
         future_into_py(self_.py(), async move {
-            let table = builder.execute().await.infer_error()?;
+            let table = inner
+                .create_table(name, batches)
+                .mode(mode)
+                .execute()
+                .await
+                .infer_error()?;
             Ok(Table::new(table))
         })
     }

@@ -114,7 +112,6 @@ impl Connection {
         name: String,
         mode: &str,
         schema: &PyAny,
-        storage_options: Option<HashMap<String, String>>,
     ) -> PyResult<&'a PyAny> {
         let inner = self_.get_inner()?.clone();

@@ -122,31 +119,21 @@ impl Connection {
         let schema = Schema::from_pyarrow(schema)?;

-        let mut builder = inner.create_empty_table(name, Arc::new(schema)).mode(mode);
-
-        if let Some(storage_options) = storage_options {
-            builder = builder.storage_options(storage_options);
-        }
-
         future_into_py(self_.py(), async move {
-            let table = builder.execute().await.infer_error()?;
+            let table = inner
+                .create_empty_table(name, Arc::new(schema))
+                .mode(mode)
+                .execute()
+                .await
+                .infer_error()?;
             Ok(Table::new(table))
         })
     }

-    #[pyo3(signature = (name, storage_options = None))]
-    pub fn open_table(
-        self_: PyRef<'_, Self>,
-        name: String,
-        storage_options: Option<HashMap<String, String>>,
-    ) -> PyResult<&PyAny> {
+    pub fn open_table(self_: PyRef<'_, Self>, name: String) -> PyResult<&PyAny> {
         let inner = self_.get_inner()?.clone();
-        let mut builder = inner.open_table(name);
-        if let Some(storage_options) = storage_options {
-            builder = builder.storage_options(storage_options);
-        }
         future_into_py(self_.py(), async move {
-            let table = builder.execute().await.infer_error()?;
+            let table = inner.open_table(&name).execute().await.infer_error()?;
             Ok(Table::new(table))
         })
     }

@@ -175,7 +162,6 @@ pub fn connect(
     region: Option<String>,
     host_override: Option<String>,
     read_consistency_interval: Option<f64>,
-    storage_options: Option<HashMap<String, String>>,
 ) -> PyResult<&PyAny> {
     future_into_py(py, async move {
         let mut builder = lancedb::connect(&uri);

@@ -192,9 +178,6 @@ pub fn connect(
             let read_consistency_interval = Duration::from_secs_f64(read_consistency_interval);
             builder = builder.read_consistency_interval(read_consistency_interval);
         }
-        if let Some(storage_options) = storage_options {
-            builder = builder.storage_options(storage_options);
-        }
         Ok(Connection::new(builder.execute().await.infer_error()?))
     })
 }
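On the side of this compare that carries the storage_options plumbing, the #[pyo3(signature = (name, storage_options = None))] bindings surface in Python roughly as below. This is a sketch that assumes the Python wrapper passes the keyword straight through to these bindings; the URI and option values are illustrative:

    import asyncio
    import lancedb

    async def main():
        db = await lancedb.connect_async(
            "s3://my-bucket/db", storage_options={"aws_region": "us-east-1"}
        )
        # Per-call options layer over the connection-level ones.
        table = await db.open_table("my_table", storage_options={"allow_http": "true"})
        print(await table.count_rows())

    asyncio.run(main())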
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.4.17"
+version = "0.4.15"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
@@ -12,12 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::sync::Arc;
+
+use async_trait::async_trait;
+use lance::io::ObjectStoreParams;
 use neon::prelude::*;
+use object_store::aws::{AwsCredential, AwsCredentialProvider};
+use object_store::CredentialProvider;
 use once_cell::sync::OnceCell;
 use tokio::runtime::Runtime;

 use lancedb::connect;
 use lancedb::connection::Connection;
+use lancedb::table::ReadParams;

 use crate::error::ResultExt;
 use crate::query::JsQuery;

@@ -37,6 +44,33 @@ struct JsDatabase {
 impl Finalize for JsDatabase {}

+// TODO: object_store didn't export this type so I copied it.
+// Make a request to object_store to export this type
+#[derive(Debug)]
+pub struct StaticCredentialProvider<T> {
+    credential: Arc<T>,
+}
+
+impl<T> StaticCredentialProvider<T> {
+    pub fn new(credential: T) -> Self {
+        Self {
+            credential: Arc::new(credential),
+        }
+    }
+}
+
+#[async_trait]
+impl<T> CredentialProvider for StaticCredentialProvider<T>
+where
+    T: std::fmt::Debug + Send + Sync,
+{
+    type Credential = T;
+
+    async fn get_credential(&self) -> object_store::Result<Arc<T>> {
+        Ok(Arc::clone(&self.credential))
+    }
+}
+
 fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
     static RUNTIME: OnceCell<Runtime> = OnceCell::new();
     static LOG: OnceCell<()> = OnceCell::new();

@@ -48,28 +82,29 @@ fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
 fn database_new(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let path = cx.argument::<JsString>(0)?.value(&mut cx);
+    let aws_creds = get_aws_creds(&mut cx, 1)?;
+    let region = get_aws_region(&mut cx, 4)?;
     let read_consistency_interval = cx
         .argument_opt(5)
         .and_then(|arg| arg.downcast::<JsNumber, _>(&mut cx).ok())
         .map(|v| v.value(&mut cx))
         .map(std::time::Duration::from_secs_f64);

-    let storage_options_js = cx.argument::<JsArray>(1)?.to_vec(&mut cx)?;
-    let mut storage_options: Vec<(String, String)> = Vec::with_capacity(storage_options_js.len());
-    for handle in storage_options_js {
-        let obj = handle.downcast::<JsArray, _>(&mut cx).unwrap();
-        let key = obj.get::<JsString, _, _>(&mut cx, 0)?.value(&mut cx);
-        let value = obj.get::<JsString, _, _>(&mut cx, 0)?.value(&mut cx);
-
-        storage_options.push((key, value));
-    }
-
     let rt = runtime(&mut cx)?;
     let channel = cx.channel();
     let (deferred, promise) = cx.promise();

-    let mut conn_builder = connect(&path).storage_options(storage_options);
+    let mut conn_builder = connect(&path);
+    if let Some(region) = region {
+        conn_builder = conn_builder.region(&region);
+    }
+    if let Some(aws_creds) = aws_creds {
+        conn_builder = conn_builder.aws_creds(AwsCredential {
+            key_id: aws_creds.key_id,
+            secret_key: aws_creds.secret_key,
+            token: aws_creds.token,
+        });
+    }
     if let Some(interval) = read_consistency_interval {
         conn_builder = conn_builder.read_consistency_interval(interval);
     }

@@ -108,19 +143,93 @@ fn database_table_names(mut cx: FunctionContext) -> JsResult<JsPromise> {
     Ok(promise)
 }

+/// Get AWS creds arguments from the context
+/// Consumes 3 arguments
+fn get_aws_creds(
+    cx: &mut FunctionContext,
+    arg_starting_location: i32,
+) -> NeonResult<Option<AwsCredential>> {
+    let secret_key_id = cx
+        .argument_opt(arg_starting_location)
+        .filter(|arg| arg.is_a::<JsString, _>(cx))
+        .and_then(|arg| arg.downcast_or_throw::<JsString, FunctionContext>(cx).ok())
+        .map(|v| v.value(cx));
+
+    let secret_key = cx
+        .argument_opt(arg_starting_location + 1)
+        .filter(|arg| arg.is_a::<JsString, _>(cx))
+        .and_then(|arg| arg.downcast_or_throw::<JsString, FunctionContext>(cx).ok())
+        .map(|v| v.value(cx));
+
+    let temp_token = cx
+        .argument_opt(arg_starting_location + 2)
+        .filter(|arg| arg.is_a::<JsString, _>(cx))
+        .and_then(|arg| arg.downcast_or_throw::<JsString, FunctionContext>(cx).ok())
+        .map(|v| v.value(cx));
+
+    match (secret_key_id, secret_key, temp_token) {
+        (Some(key_id), Some(key), optional_token) => Ok(Some(AwsCredential {
+            key_id,
+            secret_key: key,
+            token: optional_token,
+        })),
+        (None, None, None) => Ok(None),
+        _ => cx.throw_error("Invalid credentials configuration"),
+    }
+}
+
+fn get_aws_credential_provider(
+    cx: &mut FunctionContext,
+    arg_starting_location: i32,
+) -> NeonResult<Option<AwsCredentialProvider>> {
+    Ok(get_aws_creds(cx, arg_starting_location)?.map(|aws_cred| {
+        Arc::new(StaticCredentialProvider::new(aws_cred))
+            as Arc<dyn CredentialProvider<Credential = AwsCredential>>
+    }))
+}
+
+/// Get AWS region arguments from the context
+fn get_aws_region(cx: &mut FunctionContext, arg_location: i32) -> NeonResult<Option<String>> {
+    let region = cx
+        .argument_opt(arg_location)
+        .filter(|arg| arg.is_a::<JsString, _>(cx))
+        .map(|arg| arg.downcast_or_throw::<JsString, FunctionContext>(cx));
+
+    match region {
+        Some(Ok(region)) => Ok(Some(region.value(cx))),
+        None => Ok(None),
+        Some(Err(e)) => Err(e),
+    }
+}
+
 fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let db = cx
         .this()
         .downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
     let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
+
+    let aws_creds = get_aws_credential_provider(&mut cx, 1)?;
+    let aws_region = get_aws_region(&mut cx, 4)?;
+
+    let params = ReadParams {
+        store_options: Some(ObjectStoreParams::with_aws_credentials(
+            aws_creds, aws_region,
+        )),
+        ..ReadParams::default()
+    };
+
     let rt = runtime(&mut cx)?;
     let channel = cx.channel();
     let database = db.database.clone();

     let (deferred, promise) = cx.promise();
     rt.spawn(async move {
-        let table_rst = database.open_table(&table_name).execute().await;
+        let table_rst = database
+            .open_table(&table_name)
+            .lance_read_params(params)
+            .execute()
+            .await;

         deferred.settle_with(&channel, move |mut cx| {
             let js_table = JsTable::from(table_rst.or_throw(&mut cx)?);
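The argument handling in get_aws_creds above is all-or-nothing: key id and secret must appear together, the session token is optional, and anything else is rejected. Restated in Python for clarity (a hypothetical helper mirroring the match arms, not part of any API):

    def parse_creds(key_id, secret_key, token=None):
        # Both key id and secret present: token may or may not be set.
        if key_id is not None and secret_key is not None:
            return {"key_id": key_id, "secret_key": secret_key, "token": token}
        # Nothing at all: no credentials were passed.
        if key_id is None and secret_key is None and token is None:
            return None
        raise ValueError("Invalid credentials configuration")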
@@ -17,6 +17,7 @@ use std::ops::Deref;
 use arrow_array::{RecordBatch, RecordBatchIterator};
 use lance::dataset::optimize::CompactionOptions;
 use lance::dataset::{ColumnAlteration, NewColumnTransform, WriteMode, WriteParams};
+use lance::io::ObjectStoreParams;
 use lancedb::table::{OptimizeAction, WriteOptions};

 use crate::arrow::{arrow_buffer_to_record_batch, record_batch_to_buffer};

@@ -25,7 +26,7 @@ use neon::prelude::*;
 use neon::types::buffer::TypedArray;

 use crate::error::ResultExt;
-use crate::{convert, runtime, JsDatabase};
+use crate::{convert, get_aws_credential_provider, get_aws_region, runtime, JsDatabase};

 pub struct JsTable {
     pub table: LanceDbTable,

@@ -58,10 +59,6 @@ impl JsTable {
             return cx.throw_error("Table::create only supports 'overwrite' and 'create' modes")
         }
     };
-    let params = WriteParams {
-        mode,
-        ..WriteParams::default()
-    };

     let rt = runtime(&mut cx)?;
     let channel = cx.channel();

@@ -69,6 +66,17 @@ impl JsTable {
     let (deferred, promise) = cx.promise();
     let database = db.database.clone();

+    let aws_creds = get_aws_credential_provider(&mut cx, 3)?;
+    let aws_region = get_aws_region(&mut cx, 6)?;
+
+    let params = WriteParams {
+        store_params: Some(ObjectStoreParams::with_aws_credentials(
+            aws_creds, aws_region,
+        )),
+        mode,
+        ..WriteParams::default()
+    };
+
     rt.spawn(async move {
         let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema);
         let table_rst = database

@@ -104,8 +112,13 @@ impl JsTable {
             "overwrite" => WriteMode::Overwrite,
             s => return cx.throw_error(format!("invalid write mode {}", s)),
         };
+        let aws_creds = get_aws_credential_provider(&mut cx, 2)?;
+        let aws_region = get_aws_region(&mut cx, 5)?;
+
         let params = WriteParams {
+            store_params: Some(ObjectStoreParams::with_aws_credentials(
+                aws_creds, aws_region,
+            )),
             mode: write_mode,
             ..WriteParams::default()
         };
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.4.17"
+version = "0.4.15"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true

@@ -46,13 +46,7 @@ tempfile = "3.5.0"
 rand = { version = "0.8.3", features = ["small_rng"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 walkdir = "2"
-# For s3 integration tests (dev deps aren't allowed to be optional atm)
-aws-sdk-s3 = { version = "1.0" }
-aws-sdk-kms = { version = "1.0" }
-aws-config = { version = "1.0" }

 [features]
 default = ["remote"]
 remote = ["dep:reqwest"]
-fp16kernels = ["lance-linalg/fp16kernels"]
-s3-test = []
@@ -14,7 +14,6 @@

 //! LanceDB Database

-use std::collections::HashMap;
 use std::fs::create_dir_all;
 use std::path::Path;
 use std::sync::Arc;

@@ -23,7 +22,9 @@ use arrow_array::{RecordBatchIterator, RecordBatchReader};
 use arrow_schema::SchemaRef;
 use lance::dataset::{ReadParams, WriteMode};
 use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore};
-use object_store::{aws::AwsCredential, local::LocalFileSystem};
+use object_store::{
+    aws::AwsCredential, local::LocalFileSystem, CredentialProvider, StaticCredentialProvider,
+};
 use snafu::prelude::*;

 use crate::arrow::IntoArrow;

@@ -100,8 +101,8 @@ impl TableNamesBuilder {
     ///
     /// This can be combined with limit to implement pagination by setting this to
     /// the last table name from the previous page.
-    pub fn start_after(mut self, start_after: impl Into<String>) -> Self {
-        self.start_after = Some(start_after.into());
+    pub fn start_after(mut self, start_after: String) -> Self {
+        self.start_after = Some(start_after);
         self
     }

@@ -207,50 +208,6 @@ impl<const HAS_DATA: bool, T: IntoArrow> CreateTableBuilder<HAS_DATA, T> {
         self.mode = mode;
         self
     }
-
-    /// Set an option for the storage layer.
-    ///
-    /// Options already set on the connection will be inherited by the table,
-    /// but can be overridden here.
-    ///
-    /// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
-    pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
-        let store_options = self
-            .write_options
-            .lance_write_params
-            .get_or_insert(Default::default())
-            .store_params
-            .get_or_insert(Default::default())
-            .storage_options
-            .get_or_insert(Default::default());
-        store_options.insert(key.into(), value.into());
-        self
-    }
-
-    /// Set multiple options for the storage layer.
-    ///
-    /// Options already set on the connection will be inherited by the table,
-    /// but can be overridden here.
-    ///
-    /// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
-    pub fn storage_options(
-        mut self,
-        pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
-    ) -> Self {
-        let store_options = self
-            .write_options
-            .lance_write_params
-            .get_or_insert(Default::default())
-            .store_params
-            .get_or_insert(Default::default())
-            .storage_options
-            .get_or_insert(Default::default());
-
-        for (key, value) in pairs {
-            store_options.insert(key.into(), value.into());
-        }
-        self
-    }
 }

 #[derive(Clone, Debug)]

@@ -295,48 +252,6 @@ impl OpenTableBuilder {
         self
     }
-
-    /// Set an option for the storage layer.
-    ///
-    /// Options already set on the connection will be inherited by the table,
-    /// but can be overridden here.
-    ///
-    /// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
-    pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
-        let storage_options = self
-            .lance_read_params
-            .get_or_insert(Default::default())
-            .store_options
-            .get_or_insert(Default::default())
-            .storage_options
-            .get_or_insert(Default::default());
-        storage_options.insert(key.into(), value.into());
-        self
-    }
-
-    /// Set multiple options for the storage layer.
-    ///
-    /// Options already set on the connection will be inherited by the table,
-    /// but can be overridden here.
-    ///
-    /// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
-    pub fn storage_options(
-        mut self,
-        pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
-    ) -> Self {
-        let storage_options = self
-            .lance_read_params
-            .get_or_insert(Default::default())
-            .store_options
-            .get_or_insert(Default::default())
-            .storage_options
-            .get_or_insert(Default::default());
-
-        for (key, value) in pairs {
-            storage_options.insert(key.into(), value.into());
-        }
-        self
-    }

     /// Open the table
     pub async fn execute(self) -> Result<Table> {
         self.parent.clone().do_open_table(self).await

@@ -470,7 +385,8 @@ pub struct ConnectBuilder {
     /// LanceDB Cloud host override, only required if using an on-premises Lance Cloud instance
     host_override: Option<String>,

-    storage_options: HashMap<String, String>,
+    /// User provided AWS credentials
+    aws_creds: Option<AwsCredential>,

     /// The interval at which to check for updates from other processes.
     ///

@@ -493,8 +409,8 @@ impl ConnectBuilder {
             api_key: None,
             region: None,
             host_override: None,
+            aws_creds: None,
             read_consistency_interval: None,
-            storage_options: HashMap::new(),
         }
     }

@@ -514,37 +430,8 @@ impl ConnectBuilder {
     }

     /// [`AwsCredential`] to use when connecting to S3.
-    #[deprecated(note = "Pass through storage_options instead")]
     pub fn aws_creds(mut self, aws_creds: AwsCredential) -> Self {
-        self.storage_options
-            .insert("aws_access_key_id".into(), aws_creds.key_id.clone());
-        self.storage_options
-            .insert("aws_secret_access_key".into(), aws_creds.secret_key.clone());
-        if let Some(token) = &aws_creds.token {
-            self.storage_options
-                .insert("aws_session_token".into(), token.clone());
-        }
-        self
-    }
-
-    /// Set an option for the storage layer.
-    ///
-    /// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
-    pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
-        self.storage_options.insert(key.into(), value.into());
-        self
-    }
-
-    /// Set multiple options for the storage layer.
-    ///
-    /// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
-    pub fn storage_options(
-        mut self,
-        pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
-    ) -> Self {
-        for (key, value) in pairs {
-            self.storage_options.insert(key.into(), value.into());
-        }
+        self.aws_creds = Some(aws_creds);
         self
     }

@@ -635,9 +522,6 @@ struct Database {
     pub(crate) store_wrapper: Option<Arc<dyn WrappingObjectStore>>,

     read_consistency_interval: Option<std::time::Duration>,
-
-    // Storage options to be inherited by tables created from this connection
-    storage_options: HashMap<String, String>,
 }

 impl std::fmt::Display for Database {

@@ -720,11 +604,20 @@ impl Database {
         };

         let plain_uri = url.to_string();
-        let storage_options = options.storage_options.clone();
-        let os_params = ObjectStoreParams {
-            storage_options: Some(storage_options.clone()),
-            ..Default::default()
-        };
+        let os_params: ObjectStoreParams = if let Some(aws_creds) = &options.aws_creds {
+            let credential_provider: Arc<
+                dyn CredentialProvider<Credential = AwsCredential>,
+            > = Arc::new(StaticCredentialProvider::new(AwsCredential {
+                key_id: aws_creds.key_id.clone(),
+                secret_key: aws_creds.secret_key.clone(),
+                token: aws_creds.token.clone(),
+            }));
+            ObjectStoreParams::with_aws_credentials(
+                Some(credential_provider),
+                options.region.clone(),
+            )
+        } else {
+            ObjectStoreParams::default()
+        };
         let (object_store, base_path) =
             ObjectStore::from_uri_and_params(&plain_uri, &os_params).await?;

@@ -748,7 +641,6 @@ impl Database {
             object_store,
             store_wrapper: write_store_wrapper,
             read_consistency_interval: options.read_consistency_interval,
-            storage_options,
         })
     }
     Err(_) => Self::open_path(uri, options.read_consistency_interval).await,

@@ -770,7 +662,6 @@ impl Database {
             object_store,
             store_wrapper: None,
             read_consistency_interval,
-            storage_options: HashMap::new(),
         })
     }

@@ -843,26 +734,11 @@ impl ConnectionInternal for Database {

     async fn do_create_table(
         &self,
-        mut options: CreateTableBuilder<false, NoData>,
+        options: CreateTableBuilder<false, NoData>,
         data: Box<dyn RecordBatchReader + Send>,
     ) -> Result<Table> {
         let table_uri = self.table_uri(&options.name)?;

-        // Inherit storage options from the connection
-        let storage_options = options
-            .write_options
-            .lance_write_params
-            .get_or_insert_with(Default::default)
-            .store_params
-            .get_or_insert_with(Default::default)
-            .storage_options
-            .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
-        }
-
         let mut write_params = options.write_options.lance_write_params.unwrap_or_default();
         if matches!(&options.mode, CreateTableMode::Overwrite) {
             write_params.mode = WriteMode::Overwrite;

@@ -892,23 +768,8 @@ impl ConnectionInternal for Database {
         }
     }

-    async fn do_open_table(&self, mut options: OpenTableBuilder) -> Result<Table> {
+    async fn do_open_table(&self, options: OpenTableBuilder) -> Result<Table> {
         let table_uri = self.table_uri(&options.name)?;

-        // Inherit storage options from the connection
-        let storage_options = options
-            .lance_read_params
-            .get_or_insert_with(Default::default)
-            .store_options
-            .get_or_insert_with(Default::default)
-            .storage_options
-            .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
-        }
-
         let native_table = Arc::new(
             NativeTable::open_with_params(
                 &table_uri,

@@ -940,10 +801,7 @@ impl ConnectionInternal for Database {
     }

     async fn drop_db(&self) -> Result<()> {
-        self.object_store
-            .remove_dir_all(self.base_path.clone())
-            .await?;
-        Ok(())
+        todo!()
     }
 }

@@ -1004,7 +862,7 @@ mod tests {

         let tables = db
             .table_names()
-            .start_after(&names[30])
+            .start_after(names[30].clone())
             .execute()
             .await
             .unwrap();

@@ -1013,7 +871,7 @@ mod tests {

         let tables = db
             .table_names()
-            .start_after(&names[30])
+            .start_after(names[30].clone())
             .limit(7)
             .execute()
             .await
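The removed do_create_table/do_open_table blocks above implement one rule: connection-level storage options fill in only the keys that table-level options leave unset. The same merge, restated in Python (a hypothetical helper with the same semantics as the removed loops, not a LanceDB API):

    def merge_storage_options(connection_opts: dict, table_opts: dict) -> dict:
        merged = dict(table_opts)
        for key, value in connection_opts.items():
            merged.setdefault(key, value)  # table-level values win
        return merged

    assert merge_storage_options(
        {"aws_region": "us-east-1", "allow_http": "true"},
        {"aws_region": "eu-west-1"},
    ) == {"aws_region": "eu-west-1", "allow_http": "true"}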
@@ -14,7 +14,6 @@
|
|||||||
|
|
||||||
//! LanceDB Table APIs
|
//! LanceDB Table APIs
|
||||||
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
@@ -758,8 +757,6 @@ pub struct NativeTable {
|
|||||||
// the object store wrapper to use on write path
|
// the object store wrapper to use on write path
|
||||||
store_wrapper: Option<Arc<dyn WrappingObjectStore>>,
|
store_wrapper: Option<Arc<dyn WrappingObjectStore>>,
|
||||||
|
|
||||||
storage_options: HashMap<String, String>,
|
|
||||||
|
|
||||||
// This comes from the connection options. We store here so we can pass down
|
// This comes from the connection options. We store here so we can pass down
|
||||||
// to the dataset when we recreate it (for example, in checkout_latest).
|
// to the dataset when we recreate it (for example, in checkout_latest).
|
||||||
read_consistency_interval: Option<std::time::Duration>,
|
read_consistency_interval: Option<std::time::Duration>,
|
||||||
@@ -825,13 +822,6 @@ impl NativeTable {
|
|||||||
None => params,
|
None => params,
|
||||||
};
|
};
|
||||||
|
|
||||||
let storage_options = params
|
|
||||||
.store_options
|
|
||||||
.clone()
|
|
||||||
.unwrap_or_default()
|
|
||||||
.storage_options
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
let dataset = DatasetBuilder::from_uri(uri)
|
let dataset = DatasetBuilder::from_uri(uri)
|
||||||
.with_read_params(params)
|
.with_read_params(params)
|
||||||
.load()
|
.load()
|
||||||
@@ -850,7 +840,6 @@ impl NativeTable {
|
|||||||
uri: uri.to_string(),
|
uri: uri.to_string(),
|
||||||
dataset,
|
dataset,
|
||||||
store_wrapper: write_store_wrapper,
|
store_wrapper: write_store_wrapper,
|
||||||
storage_options,
|
|
||||||
read_consistency_interval,
|
read_consistency_interval,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -919,13 +908,6 @@ impl NativeTable {
|
|||||||
None => params,
|
None => params,
|
||||||
};
|
};
|
||||||
|
|
||||||
let storage_options = params
|
|
||||||
.store_params
|
|
||||||
.clone()
|
|
||||||
.unwrap_or_default()
|
|
||||||
.storage_options
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
let dataset = Dataset::write(batches, uri, Some(params))
|
let dataset = Dataset::write(batches, uri, Some(params))
|
||||||
.await
|
.await
|
||||||
.map_err(|e| match e {
|
.map_err(|e| match e {
|
||||||
@@ -939,7 +921,6 @@ impl NativeTable {
|
|||||||
uri: uri.to_string(),
|
uri: uri.to_string(),
|
||||||
dataset: DatasetConsistencyWrapper::new_latest(dataset, read_consistency_interval),
|
dataset: DatasetConsistencyWrapper::new_latest(dataset, read_consistency_interval),
|
||||||
store_wrapper: write_store_wrapper,
|
store_wrapper: write_store_wrapper,
|
||||||
storage_options,
|
|
||||||
read_consistency_interval,
|
read_consistency_interval,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -1323,7 +1304,14 @@ impl TableInternal for NativeTable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn count_rows(&self, filter: Option<String>) -> Result<usize> {
|
async fn count_rows(&self, filter: Option<String>) -> Result<usize> {
|
||||||
Ok(self.dataset.get().await?.count_rows(filter).await?)
|
let dataset = self.dataset.get().await?;
|
||||||
|
if let Some(filter) = filter {
|
||||||
|
let mut scanner = dataset.scan();
|
||||||
|
scanner.filter(&filter)?;
|
||||||
|
Ok(scanner.count_rows().await? as usize)
|
||||||
|
} else {
|
||||||
|
Ok(dataset.count_rows().await?)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn add(
|
async fn add(
|
||||||
@@ -1331,7 +1319,7 @@ impl TableInternal for NativeTable {
|
|||||||
add: AddDataBuilder<NoData>,
|
add: AddDataBuilder<NoData>,
|
||||||
data: Box<dyn RecordBatchReader + Send>,
|
data: Box<dyn RecordBatchReader + Send>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut lance_params = add.write_options.lance_write_params.unwrap_or(WriteParams {
|
let lance_params = add.write_options.lance_write_params.unwrap_or(WriteParams {
|
||||||
mode: match add.mode {
|
mode: match add.mode {
|
||||||
AddDataMode::Append => WriteMode::Append,
|
AddDataMode::Append => WriteMode::Append,
|
||||||
AddDataMode::Overwrite => WriteMode::Overwrite,
|
AddDataMode::Overwrite => WriteMode::Overwrite,
|
||||||
@@ -1339,18 +1327,6 @@ impl TableInternal for NativeTable {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
});
|
});
|
||||||
|
|
||||||
// Bring storage options from table
|
|
||||||
let storage_options = lance_params
|
|
||||||
.store_params
|
|
||||||
.get_or_insert(Default::default())
|
|
||||||
.storage_options
|
|
||||||
.get_or_insert(Default::default());
|
|
||||||
for (key, value) in self.storage_options.iter() {
|
|
||||||
if !storage_options.contains_key(key) {
|
|
||||||
storage_options.insert(key.clone(), value.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// patch the params if we have a write store wrapper
|
// patch the params if we have a write store wrapper
|
||||||
let lance_params = match self.store_wrapper.clone() {
|
let lance_params = match self.store_wrapper.clone() {
|
||||||
Some(wrapper) => lance_params.patch_with_store_wrapper(wrapper)?,
|
Some(wrapper) => lance_params.patch_with_store_wrapper(wrapper)?,
|
||||||
|
|||||||
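The count_rows change above answers filtered counts with a scan and unfiltered counts from dataset metadata. From Python the two paths look like this (a sketch; table name and path are illustrative, and the filter string uses the SQL-like syntax LanceDB accepts elsewhere):

    import lancedb

    db = lancedb.connect("/tmp/lancedb-demo")
    table = db.open_table("words")

    total = table.count_rows()                       # metadata path, no scan
    matching = table.count_rows("text IS NOT NULL")  # filtered path, scans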
@@ -1,290 +0,0 @@
-// Copyright 2023 LanceDB Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#![cfg(feature = "s3-test")]
-use std::sync::Arc;
-
-use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator, StringArray};
-use arrow_schema::{DataType, Field, Schema};
-
-use aws_config::{BehaviorVersion, ConfigLoader, Region, SdkConfig};
-use aws_sdk_s3::{config::Credentials, types::ServerSideEncryption, Client as S3Client};
-use lancedb::Result;
-
-const CONFIG: &[(&str, &str)] = &[
-    ("access_key_id", "ACCESS_KEY"),
-    ("secret_access_key", "SECRET_KEY"),
-    ("endpoint", "http://127.0.0.1:4566"),
-    ("allow_http", "true"),
-];
-
-async fn aws_config() -> SdkConfig {
-    let credentials = Credentials::new(CONFIG[0].1, CONFIG[1].1, None, None, "static");
-    ConfigLoader::default()
-        .credentials_provider(credentials)
-        .endpoint_url(CONFIG[2].1)
-        .behavior_version(BehaviorVersion::latest())
-        .region(Region::new("us-east-1"))
-        .load()
-        .await
-}
-
-struct S3Bucket(String);
-
-impl S3Bucket {
-    async fn new(bucket: &str) -> Self {
-        let config = aws_config().await;
-        let client = S3Client::new(&config);
-
-        // In case it wasn't deleted earlier
-        Self::delete_bucket(client.clone(), bucket).await;
-
-        client.create_bucket().bucket(bucket).send().await.unwrap();
-
-        Self(bucket.to_string())
-    }
-
-    async fn delete_bucket(client: S3Client, bucket: &str) {
-        // Before we delete the bucket, we need to delete all objects in it
-        let res = client
-            .list_objects_v2()
-            .bucket(bucket)
-            .send()
-            .await
-            .map_err(|err| err.into_service_error());
-        match res {
-            Err(e) if e.is_no_such_bucket() => return,
-            Err(e) => panic!("Failed to list objects in bucket: {}", e),
-            _ => {}
-        }
-        let objects = res.unwrap().contents.unwrap_or_default();
-        for object in objects {
-            client
-                .delete_object()
-                .bucket(bucket)
-                .key(object.key.unwrap())
-                .send()
-                .await
-                .unwrap();
-        }
-        client.delete_bucket().bucket(bucket).send().await.unwrap();
-    }
-}
-
-impl Drop for S3Bucket {
-    fn drop(&mut self) {
-        let bucket_name = self.0.clone();
-        tokio::task::spawn(async move {
-            let config = aws_config().await;
-            let client = S3Client::new(&config);
-            Self::delete_bucket(client, &bucket_name).await;
-        });
-    }
-}
-
-fn test_data() -> RecordBatch {
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("a", DataType::Int32, false),
-        Field::new("b", DataType::Utf8, false),
-    ]));
-    RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(Int32Array::from(vec![1, 2, 3])),
-            Arc::new(StringArray::from(vec!["a", "b", "c"])),
-        ],
-    )
-    .unwrap()
-}
-
-#[tokio::test]
-async fn test_minio_lifecycle() -> Result<()> {
-    // test create, update, drop, list on localstack minio
-    let bucket = S3Bucket::new("test-bucket").await;
-    let uri = format!("s3://{}", bucket.0);
-
-    let db = lancedb::connect(&uri)
-        .storage_options(CONFIG.iter().cloned())
-        .execute()
-        .await?;
-
-    let data = test_data();
-    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
-
-    let table = db.create_table("test_table", data).execute().await?;
-
-    let row_count = table.count_rows(None).await?;
-    assert_eq!(row_count, 3);
-
-    let table_names = db.table_names().execute().await?;
-    assert_eq!(table_names, vec!["test_table"]);
-
-    // Re-open the table
-    let table = db.open_table("test_table").execute().await?;
-    let row_count = table.count_rows(None).await?;
-    assert_eq!(row_count, 3);
-
-    let data = test_data();
-    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
-    table.add(data).execute().await?;
-
-    db.drop_table("test_table").await?;
-
-    Ok(())
-}
-
-struct KMSKey(String);
-
-impl KMSKey {
-    async fn new() -> Self {
-        let config = aws_config().await;
-        let client = aws_sdk_kms::Client::new(&config);
-        let key = client
-            .create_key()
-            .description("test key")
-            .send()
-            .await
-            .unwrap()
-            .key_metadata
-            .unwrap()
-            .key_id;
-        Self(key)
-    }
-}
-
-impl Drop for KMSKey {
-    fn drop(&mut self) {
-        let key_id = self.0.clone();
-        tokio::task::spawn(async move {
-            let config = aws_config().await;
-            let client = aws_sdk_kms::Client::new(&config);
-            client
-                .schedule_key_deletion()
-                .key_id(&key_id)
-                .send()
-                .await
-                .unwrap();
-        });
-    }
-}
-
-async fn validate_objects_encrypted(bucket: &str, path: &str, kms_key_id: &str) {
-    // Get S3 client
-    let config = aws_config().await;
-    let client = S3Client::new(&config);
-
-    // list the objects at the path
-    let objects = client
-        .list_objects_v2()
-        .bucket(bucket)
-        .prefix(path)
-        .send()
-        .await
-        .unwrap()
-        .contents
-        .unwrap();
-
-    let mut errors = vec![];
-    let mut correctly_encrypted = vec![];
-
-    // For each object, call head
-    for object in &objects {
-        let head = client
-            .head_object()
-            .bucket(bucket)
-            .key(object.key().unwrap())
-            .send()
-            .await
-            .unwrap();
-
-        // Verify the object is encrypted
-        if head.server_side_encryption() != Some(&ServerSideEncryption::AwsKms) {
-            errors.push(format!("Object {} is not encrypted", object.key().unwrap()));
-            continue;
-        }
-        if !(head
-            .ssekms_key_id()
-            .map(|arn| arn.ends_with(kms_key_id))
-            .unwrap_or(false))
-        {
-            errors.push(format!(
-                "Object {} has wrong key id: {:?}, vs expected: {}",
-                object.key().unwrap(),
-                head.ssekms_key_id(),
-                kms_key_id
-            ));
-            continue;
-        }
-        correctly_encrypted.push(object.key().unwrap().to_string());
-    }
-
-    if !errors.is_empty() {
-        panic!(
-            "{} of {} correctly encrypted: {:?}\n{} of {} not correct: {:?}",
-            correctly_encrypted.len(),
-            objects.len(),
-            correctly_encrypted,
-            errors.len(),
-            objects.len(),
-            errors
-        );
-    }
-}
-
-#[tokio::test]
-async fn test_encryption() -> Result<()> {
-    // test encryption on localstack minio
-    let bucket = S3Bucket::new("test-encryption").await;
-    let key = KMSKey::new().await;
-
-    let uri = format!("s3://{}", bucket.0);
-    let db = lancedb::connect(&uri)
-        .storage_options(CONFIG.iter().cloned())
-        .execute()
-        .await?;
-
-    // Create a table with encryption
-    let data = test_data();
-    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
-
-    let mut builder = db.create_table("test_table", data);
-    for (key, value) in CONFIG {
-        builder = builder.storage_option(*key, *value);
-    }
-    let table = builder
-        .storage_option("aws_server_side_encryption", "aws:kms")
-        .storage_option("aws_sse_kms_key_id", &key.0)
-        .execute()
-        .await?;
-    validate_objects_encrypted(&bucket.0, "test_table", &key.0).await;
-
-    table.delete("a = 1").await?;
-    validate_objects_encrypted(&bucket.0, "test_table", &key.0).await;
-
-    // Test we can set encryption at the connection level.
-    let db = lancedb::connect(&uri)
-        .storage_options(CONFIG.iter().cloned())
-        .storage_option("aws_server_side_encryption", "aws:kms")
-        .storage_option("aws_sse_kms_key_id", &key.0)
-        .execute()
-        .await?;
-
-    let table = db.open_table("test_table").execute().await?;
-
-    let data = test_data();
-    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
-    table.add(data).execute().await?;
-    validate_objects_encrypted(&bucket.0, "test_table", &key.0).await;
-
-    Ok(())
-}