mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 15:12:53 +00:00
Compare commits
41 Commits
python-v0.
...
aidangomar
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2acd9d0f0e | ||
|
|
49b6e77e4b | ||
|
|
5372843281 | ||
|
|
54677b8f0b | ||
|
|
ebcf9bf6ae | ||
|
|
797514bcbf | ||
|
|
1c872ce501 | ||
|
|
479f471c14 | ||
|
|
ae0d2f2599 | ||
|
|
1e8678f11a | ||
|
|
662968559d | ||
|
|
9d895801f2 | ||
|
|
80613a40fd | ||
|
|
d43ef7f11e | ||
|
|
554e068917 | ||
|
|
567734dd6e | ||
|
|
1589499f89 | ||
|
|
682e95fa83 | ||
|
|
1ad5e7f2f0 | ||
|
|
ddb3ef4ce5 | ||
|
|
ef20b2a138 | ||
|
|
2e0f251bfd | ||
|
|
2cb91e818d | ||
|
|
2835c76336 | ||
|
|
8068a2bbc3 | ||
|
|
24111d543a | ||
|
|
7eec2b8f9a | ||
|
|
b2b70ea399 | ||
|
|
e50a3c1783 | ||
|
|
b517134309 | ||
|
|
6fb539b5bf | ||
|
|
f37fe120fd | ||
|
|
2e115acb9a | ||
|
|
27a638362d | ||
|
|
22a6695d7a | ||
|
|
57eff82ee7 | ||
|
|
7732f7d41c | ||
|
|
5ca98c326f | ||
|
|
b55db397eb | ||
|
|
c04d72ac8a | ||
|
|
28b02fb72a |
@@ -1,5 +1,5 @@
|
|||||||
[bumpversion]
|
[bumpversion]
|
||||||
current_version = 0.3.3
|
current_version = 0.3.7
|
||||||
commit = True
|
commit = True
|
||||||
message = Bump version: {current_version} → {new_version}
|
message = Bump version: {current_version} → {new_version}
|
||||||
tag = True
|
tag = True
|
||||||
|
|||||||
4
.github/workflows/node.yml
vendored
4
.github/workflows/node.yml
vendored
@@ -11,6 +11,10 @@ on:
|
|||||||
- .github/workflows/node.yml
|
- .github/workflows/node.yml
|
||||||
- docker-compose.yml
|
- docker-compose.yml
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
env:
|
env:
|
||||||
# Disable full debug symbol generation to speed up CI build and keep memory down
|
# Disable full debug symbol generation to speed up CI build and keep memory down
|
||||||
# "1" means line tables only, which is useful for panic tracebacks.
|
# "1" means line tables only, which is useful for panic tracebacks.
|
||||||
|
|||||||
2
.github/workflows/npm-publish.yml
vendored
2
.github/workflows/npm-publish.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
|||||||
node/vectordb-*.tgz
|
node/vectordb-*.tgz
|
||||||
|
|
||||||
node-macos:
|
node-macos:
|
||||||
runs-on: macos-12
|
runs-on: macos-13
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
strategy:
|
strategy:
|
||||||
|
|||||||
22
.github/workflows/python.yml
vendored
22
.github/workflows/python.yml
vendored
@@ -8,6 +8,11 @@ on:
|
|||||||
paths:
|
paths:
|
||||||
- python/**
|
- python/**
|
||||||
- .github/workflows/python.yml
|
- .github/workflows/python.yml
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
linux:
|
linux:
|
||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
@@ -32,18 +37,19 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
pip install -e .[tests]
|
pip install -e .[tests]
|
||||||
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
||||||
pip install pytest pytest-mock black isort
|
pip install pytest pytest-mock ruff
|
||||||
- name: Black
|
- name: Lint
|
||||||
run: black --check --diff --no-color --quiet .
|
run: ruff format --check .
|
||||||
- name: isort
|
|
||||||
run: isort --check --diff --quiet .
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: pytest -m "not slow" -x -v --durations=30 tests
|
run: pytest -m "not slow" -x -v --durations=30 tests
|
||||||
- name: doctest
|
- name: doctest
|
||||||
run: pytest --doctest-modules lancedb
|
run: pytest --doctest-modules lancedb
|
||||||
mac:
|
mac:
|
||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
runs-on: "macos-12"
|
strategy:
|
||||||
|
matrix:
|
||||||
|
mac-runner: [ "macos-13", "macos-13-xlarge" ]
|
||||||
|
runs-on: "${{ matrix.mac-runner }}"
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
shell: bash
|
shell: bash
|
||||||
@@ -62,8 +68,6 @@ jobs:
|
|||||||
pip install -e .[tests]
|
pip install -e .[tests]
|
||||||
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
||||||
pip install pytest pytest-mock black
|
pip install pytest pytest-mock black
|
||||||
- name: Black
|
|
||||||
run: black --check --diff --no-color --quiet .
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: pytest -m "not slow" -x -v --durations=30 tests
|
run: pytest -m "not slow" -x -v --durations=30 tests
|
||||||
pydantic1x:
|
pydantic1x:
|
||||||
@@ -95,4 +99,4 @@ jobs:
|
|||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: pytest -m "not slow" -x -v --durations=30 tests
|
run: pytest -m "not slow" -x -v --durations=30 tests
|
||||||
- name: doctest
|
- name: doctest
|
||||||
run: pytest --doctest-modules lancedb
|
run: pytest --doctest-modules lancedb
|
||||||
|
|||||||
9
.github/workflows/rust.yml
vendored
9
.github/workflows/rust.yml
vendored
@@ -10,6 +10,10 @@ on:
|
|||||||
- rust/**
|
- rust/**
|
||||||
- .github/workflows/rust.yml
|
- .github/workflows/rust.yml
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
env:
|
env:
|
||||||
# This env var is used by Swatinem/rust-cache@v2 for the cache
|
# This env var is used by Swatinem/rust-cache@v2 for the cache
|
||||||
# key, so we set it to make sure it is always consistent.
|
# key, so we set it to make sure it is always consistent.
|
||||||
@@ -44,8 +48,11 @@ jobs:
|
|||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: cargo test --all-features
|
run: cargo test --all-features
|
||||||
macos:
|
macos:
|
||||||
runs-on: macos-12
|
|
||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
mac-runner: [ "macos-13", "macos-13-xlarge" ]
|
||||||
|
runs-on: "${{ matrix.mac-runner }}"
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|||||||
@@ -5,9 +5,9 @@ exclude = ["python"]
|
|||||||
resolver = "2"
|
resolver = "2"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.8.7", "features" = ["dynamodb"] }
|
lance = { "version" = "=0.8.14", "features" = ["dynamodb"] }
|
||||||
lance-linalg = { "version" = "=0.8.7" }
|
lance-linalg = { "version" = "=0.8.14" }
|
||||||
lance-testing = { "version" = "=0.8.7" }
|
lance-testing = { "version" = "=0.8.14" }
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "47.0.0", optional = false }
|
arrow = { version = "47.0.0", optional = false }
|
||||||
arrow-array = "47.0"
|
arrow-array = "47.0"
|
||||||
@@ -19,7 +19,7 @@ arrow-arith = "47.0"
|
|||||||
arrow-cast = "47.0"
|
arrow-cast = "47.0"
|
||||||
chrono = "0.4.23"
|
chrono = "0.4.23"
|
||||||
half = { "version" = "=2.3.1", default-features = false, features = [
|
half = { "version" = "=2.3.1", default-features = false, features = [
|
||||||
"num-traits"
|
"num-traits",
|
||||||
] }
|
] }
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
object_store = "0.7.1"
|
object_store = "0.7.1"
|
||||||
|
|||||||
@@ -150,8 +150,6 @@ nav:
|
|||||||
|
|
||||||
extra_css:
|
extra_css:
|
||||||
- styles/global.css
|
- styles/global.css
|
||||||
extra_javascript:
|
|
||||||
- scripts/posthog.js
|
|
||||||
|
|
||||||
extra:
|
extra:
|
||||||
analytics:
|
analytics:
|
||||||
|
|||||||
@@ -71,9 +71,41 @@ a single PQ code.
|
|||||||
### Use GPU to build vector index
|
### Use GPU to build vector index
|
||||||
|
|
||||||
Lance Python SDK has experimental GPU support for creating IVF index.
|
Lance Python SDK has experimental GPU support for creating IVF index.
|
||||||
|
Using a GPU for index creation requires [PyTorch>2.0](https://pytorch.org/) to be installed.
|
||||||
|
|
||||||
You can specify the GPU device to train IVF partitions via
|
You can specify the GPU device to train IVF partitions via
|
||||||
|
|
||||||
- **accelerator**: Specify to `"cuda"`` to enable GPU training.
|
- **accelerator**: Set to ``cuda`` or ``mps`` (on Apple Silicon) to enable GPU training.
|
||||||
|
|
||||||
|
=== "Linux"
|
||||||
|
|
||||||
|
<!-- skip-test -->
|
||||||
|
``` { .python .copy }
|
||||||
|
# Create index using CUDA on Nvidia GPUs.
|
||||||
|
tbl.create_index(
|
||||||
|
num_partitions=256,
|
||||||
|
num_sub_vectors=96,
|
||||||
|
accelerator="cuda"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "macOS"
|
||||||
|
|
||||||
|
<!-- skip-test -->
|
||||||
|
```python
|
||||||
|
# Create index using MPS on Apple Silicon.
|
||||||
|
tbl.create_index(
|
||||||
|
num_partitions=256,
|
||||||
|
num_sub_vectors=96,
|
||||||
|
accelerator="mps"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Troubleshooting:
|
||||||
|
|
||||||
|
If you see ``AssertionError: Torch not compiled with CUDA enabled``, you need to [install
|
||||||
|
PyTorch with CUDA support](https://pytorch.org/get-started/locally/).
|
||||||
|
|
||||||
|
|
||||||
## Querying an ANN Index
|
## Querying an ANN Index
|
||||||
|
|
||||||
|
|||||||
@@ -22,8 +22,6 @@ pip install lancedb
|
|||||||
|
|
||||||
::: lancedb.query.LanceQueryBuilder
|
::: lancedb.query.LanceQueryBuilder
|
||||||
|
|
||||||
::: lancedb.query.LanceFtsQueryBuilder
|
|
||||||
|
|
||||||
## Embeddings
|
## Embeddings
|
||||||
|
|
||||||
::: lancedb.embeddings.registry.EmbeddingFunctionRegistry
|
::: lancedb.embeddings.registry.EmbeddingFunctionRegistry
|
||||||
@@ -56,7 +54,7 @@ pip install lancedb
|
|||||||
|
|
||||||
## Utilities
|
## Utilities
|
||||||
|
|
||||||
::: lancedb.vector
|
::: lancedb.schema.vector
|
||||||
|
|
||||||
## Integrations
|
## Integrations
|
||||||
|
|
||||||
|
|||||||
@@ -18,29 +18,45 @@ python_file = ".py"
|
|||||||
python_folder = "python"
|
python_folder = "python"
|
||||||
|
|
||||||
files = glob.glob(glob_string, recursive=True)
|
files = glob.glob(glob_string, recursive=True)
|
||||||
excluded_files = [f for excluded_glob in excluded_globs for f in glob.glob(excluded_glob, recursive=True)]
|
excluded_files = [
|
||||||
|
f
|
||||||
|
for excluded_glob in excluded_globs
|
||||||
|
for f in glob.glob(excluded_glob, recursive=True)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def yield_lines(lines: Iterator[str], prefix: str, suffix: str):
|
def yield_lines(lines: Iterator[str], prefix: str, suffix: str):
|
||||||
in_code_block = False
|
in_code_block = False
|
||||||
# Python code has strict indentation
|
# Python code has strict indentation
|
||||||
strip_length = 0
|
strip_length = 0
|
||||||
|
skip_test = False
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
if "skip-test" in line:
|
||||||
|
skip_test = True
|
||||||
if line.strip().startswith(prefix + python_prefix):
|
if line.strip().startswith(prefix + python_prefix):
|
||||||
in_code_block = True
|
in_code_block = True
|
||||||
strip_length = len(line) - len(line.lstrip())
|
strip_length = len(line) - len(line.lstrip())
|
||||||
elif in_code_block and line.strip().startswith(suffix):
|
elif in_code_block and line.strip().startswith(suffix):
|
||||||
in_code_block = False
|
in_code_block = False
|
||||||
yield "\n"
|
if not skip_test:
|
||||||
|
yield "\n"
|
||||||
|
skip_test = False
|
||||||
elif in_code_block:
|
elif in_code_block:
|
||||||
yield line[strip_length:]
|
if not skip_test:
|
||||||
|
yield line[strip_length:]
|
||||||
|
|
||||||
for file in filter(lambda file: file not in excluded_files, files):
|
for file in filter(lambda file: file not in excluded_files, files):
|
||||||
with open(file, "r") as f:
|
with open(file, "r") as f:
|
||||||
lines = list(yield_lines(iter(f), "```", "```"))
|
lines = list(yield_lines(iter(f), "```", "```"))
|
||||||
|
|
||||||
if len(lines) > 0:
|
if len(lines) > 0:
|
||||||
out_path = Path(python_folder) / Path(file).name.strip(".md") / (Path(file).name.strip(".md") + python_file)
|
print(lines)
|
||||||
|
out_path = (
|
||||||
|
Path(python_folder)
|
||||||
|
/ Path(file).name.strip(".md")
|
||||||
|
/ (Path(file).name.strip(".md") + python_file)
|
||||||
|
)
|
||||||
print(out_path)
|
print(out_path)
|
||||||
out_path.parent.mkdir(exist_ok=True, parents=True)
|
out_path.parent.mkdir(exist_ok=True, parents=True)
|
||||||
with open(out_path, "w") as out:
|
with open(out_path, "w") as out:
|
||||||
out.writelines(lines)
|
out.writelines(lines)
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ npm install vectordb
|
|||||||
|
|
||||||
This will download the appropriate native library for your platform. We currently
|
This will download the appropriate native library for your platform. We currently
|
||||||
support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
|
support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
|
||||||
yet support Windows or musl-based Linux (such as Alpine Linux).
|
yet support musl-based Linux (such as Alpine Linux).
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|||||||
74
node/package-lock.json
generated
74
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"lockfileVersion": 2,
|
"lockfileVersion": 2,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
@@ -53,11 +53,11 @@
|
|||||||
"uuid": "^9.0.0"
|
"uuid": "^9.0.0"
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.3.3",
|
"@lancedb/vectordb-darwin-arm64": "0.3.7",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.3.3",
|
"@lancedb/vectordb-darwin-x64": "0.3.7",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.3.3",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.3.7",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.3.3",
|
"@lancedb/vectordb-linux-x64-gnu": "0.3.7",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.3.3"
|
"@lancedb/vectordb-win32-x64-msvc": "0.3.7"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@apache-arrow/ts": {
|
"node_modules/@apache-arrow/ts": {
|
||||||
@@ -317,9 +317,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.7.tgz",
|
||||||
"integrity": "sha512-nvyj7xNX2/wb/PH5TjyhLR/NQ1jVuoBw2B5UaSg7qf8Tnm5SSXWQ7F25RVKcKwh72fz1qB+CWW24ftZnRzbT/Q==",
|
"integrity": "sha512-QsDxcbhrumJg+Cyflpnj8EY+bZojbco5K7VSeKvguqeXUGb62ksyOZuUTCn2sqJaCgy1KZ1qC5U8jBqfgZHc2w==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"arm64"
|
"arm64"
|
||||||
],
|
],
|
||||||
@@ -329,9 +329,9 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.7.tgz",
|
||||||
"integrity": "sha512-7CW+nILyPHp6cua0Rl0xaTDWw/vajEn/jCsEjFYgDmE+rtf5Z5Fum41FxR9C2TtIAvUK+nWb5mkYeOLqU6vRvg==",
|
"integrity": "sha512-fgv10kI04UycgpmhJLUcCswgvSdgsGuj65o+W5usmVdxYZiWpoXBBXRkWYMjUX5RNe3mY1Ff6QPBbToR0WkSUA==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64"
|
"x64"
|
||||||
],
|
],
|
||||||
@@ -341,9 +341,9 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.7.tgz",
|
||||||
"integrity": "sha512-MmhwbacKxZPkLwwOqysVY8mUb8lFoyFIPlYhSLV4xS1C8X4HWALljIul1qMl1RYudp9Uc3PsOzRexl+OvCGfUw==",
|
"integrity": "sha512-pvw+31+VKEH3YmS/GLKzEGt/Y2+c/IaE6JL6tIjXi2KY+ZcWuyyXpYnYiHHDw2EP7ubKj6+fKIG1P9tlxMcGMQ==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"arm64"
|
"arm64"
|
||||||
],
|
],
|
||||||
@@ -353,9 +353,9 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.7.tgz",
|
||||||
"integrity": "sha512-OrNlsKi/QPw59Po040oRKn8IuqFEk4upc/4FaFKqVkcmQjjZrMg5Kgy9ZfWIhHdAnWXXggZZIPArpt0X1B0ceA==",
|
"integrity": "sha512-kHFURhfhJRqw4k1auseqQgOzAHB4oYpyzLCX3TCR3uTxqRQ7gFxxlO0TnIcwNRqLcGb9GmWxWWoR8k1CdCXrMw==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64"
|
"x64"
|
||||||
],
|
],
|
||||||
@@ -365,9 +365,9 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.7.tgz",
|
||||||
"integrity": "sha512-lIT0A7a6eqX51IfGyhECtpXXgsr//kgbd+HZbcCdPy2GMmNezSch/7V22zExDSpF32hX8WfgcTLYCVWVilggDQ==",
|
"integrity": "sha512-zWfZ557v2Y+93dVrmqqnbiLeTOb0ptunAG0zGjyE+3oyi8j/4+bL56Fdv94k+dfNF4KrcqcULEcZhKik3/FQ9w==",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64"
|
"x64"
|
||||||
],
|
],
|
||||||
@@ -4869,33 +4869,33 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"@lancedb/vectordb-darwin-arm64": {
|
"@lancedb/vectordb-darwin-arm64": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.7.tgz",
|
||||||
"integrity": "sha512-nvyj7xNX2/wb/PH5TjyhLR/NQ1jVuoBw2B5UaSg7qf8Tnm5SSXWQ7F25RVKcKwh72fz1qB+CWW24ftZnRzbT/Q==",
|
"integrity": "sha512-QsDxcbhrumJg+Cyflpnj8EY+bZojbco5K7VSeKvguqeXUGb62ksyOZuUTCn2sqJaCgy1KZ1qC5U8jBqfgZHc2w==",
|
||||||
"optional": true
|
"optional": true
|
||||||
},
|
},
|
||||||
"@lancedb/vectordb-darwin-x64": {
|
"@lancedb/vectordb-darwin-x64": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.7.tgz",
|
||||||
"integrity": "sha512-7CW+nILyPHp6cua0Rl0xaTDWw/vajEn/jCsEjFYgDmE+rtf5Z5Fum41FxR9C2TtIAvUK+nWb5mkYeOLqU6vRvg==",
|
"integrity": "sha512-fgv10kI04UycgpmhJLUcCswgvSdgsGuj65o+W5usmVdxYZiWpoXBBXRkWYMjUX5RNe3mY1Ff6QPBbToR0WkSUA==",
|
||||||
"optional": true
|
"optional": true
|
||||||
},
|
},
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": {
|
"@lancedb/vectordb-linux-arm64-gnu": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.7.tgz",
|
||||||
"integrity": "sha512-MmhwbacKxZPkLwwOqysVY8mUb8lFoyFIPlYhSLV4xS1C8X4HWALljIul1qMl1RYudp9Uc3PsOzRexl+OvCGfUw==",
|
"integrity": "sha512-pvw+31+VKEH3YmS/GLKzEGt/Y2+c/IaE6JL6tIjXi2KY+ZcWuyyXpYnYiHHDw2EP7ubKj6+fKIG1P9tlxMcGMQ==",
|
||||||
"optional": true
|
"optional": true
|
||||||
},
|
},
|
||||||
"@lancedb/vectordb-linux-x64-gnu": {
|
"@lancedb/vectordb-linux-x64-gnu": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.7.tgz",
|
||||||
"integrity": "sha512-OrNlsKi/QPw59Po040oRKn8IuqFEk4upc/4FaFKqVkcmQjjZrMg5Kgy9ZfWIhHdAnWXXggZZIPArpt0X1B0ceA==",
|
"integrity": "sha512-kHFURhfhJRqw4k1auseqQgOzAHB4oYpyzLCX3TCR3uTxqRQ7gFxxlO0TnIcwNRqLcGb9GmWxWWoR8k1CdCXrMw==",
|
||||||
"optional": true
|
"optional": true
|
||||||
},
|
},
|
||||||
"@lancedb/vectordb-win32-x64-msvc": {
|
"@lancedb/vectordb-win32-x64-msvc": {
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.7.tgz",
|
||||||
"integrity": "sha512-lIT0A7a6eqX51IfGyhECtpXXgsr//kgbd+HZbcCdPy2GMmNezSch/7V22zExDSpF32hX8WfgcTLYCVWVilggDQ==",
|
"integrity": "sha512-zWfZ557v2Y+93dVrmqqnbiLeTOb0ptunAG0zGjyE+3oyi8j/4+bL56Fdv94k+dfNF4KrcqcULEcZhKik3/FQ9w==",
|
||||||
"optional": true
|
"optional": true
|
||||||
},
|
},
|
||||||
"@neon-rs/cli": {
|
"@neon-rs/cli": {
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.3.3",
|
"version": "0.3.7",
|
||||||
"description": " Serverless, low-latency vector database for AI applications",
|
"description": " Serverless, low-latency vector database for AI applications",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
@@ -81,10 +81,10 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.3.3",
|
"@lancedb/vectordb-darwin-arm64": "0.3.7",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.3.3",
|
"@lancedb/vectordb-darwin-x64": "0.3.7",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.3.3",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.3.7",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.3.3",
|
"@lancedb/vectordb-linux-x64-gnu": "0.3.7",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.3.3"
|
"@lancedb/vectordb-win32-x64-msvc": "0.3.7"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ import { Query } from './query'
|
|||||||
import { isEmbeddingFunction } from './embedding/embedding_function'
|
import { isEmbeddingFunction } from './embedding/embedding_function'
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
||||||
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles } = require('../native.js')
|
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js')
|
||||||
|
|
||||||
export { Query }
|
export { Query }
|
||||||
export type { EmbeddingFunction }
|
export type { EmbeddingFunction }
|
||||||
@@ -260,6 +260,27 @@ export interface Table<T = number[]> {
|
|||||||
* ```
|
* ```
|
||||||
*/
|
*/
|
||||||
delete: (filter: string) => Promise<void>
|
delete: (filter: string) => Promise<void>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List the indices on this table.
|
||||||
|
*/
|
||||||
|
listIndices: () => Promise<VectorIndex[]>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get statistics about an index.
|
||||||
|
*/
|
||||||
|
indexStats: (indexUuid: string) => Promise<IndexStats>
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface VectorIndex {
|
||||||
|
columns: string[]
|
||||||
|
name: string
|
||||||
|
uuid: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface IndexStats {
|
||||||
|
numIndexedRows: number | null
|
||||||
|
numUnindexedRows: number | null
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -502,6 +523,14 @@ export class LocalTable<T = number[]> implements Table<T> {
|
|||||||
return res.metrics
|
return res.metrics
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async listIndices (): Promise<VectorIndex[]> {
|
||||||
|
return tableListIndices.call(this._tbl)
|
||||||
|
}
|
||||||
|
|
||||||
|
async indexStats (indexUuid: string): Promise<IndexStats> {
|
||||||
|
return tableIndexStats.call(this._tbl, indexUuid)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface CleanupStats {
|
export interface CleanupStats {
|
||||||
|
|||||||
@@ -63,6 +63,9 @@ export class HttpLancedbClient {
|
|||||||
}
|
}
|
||||||
).catch((err) => {
|
).catch((err) => {
|
||||||
console.error('error: ', err)
|
console.error('error: ', err)
|
||||||
|
if (err.response === undefined) {
|
||||||
|
throw new Error(`Network Error: ${err.message as string}`)
|
||||||
|
}
|
||||||
return err.response
|
return err.response
|
||||||
})
|
})
|
||||||
if (response.status !== 200) {
|
if (response.status !== 200) {
|
||||||
@@ -86,13 +89,17 @@ export class HttpLancedbClient {
|
|||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'x-api-key': this._apiKey()
|
'x-api-key': this._apiKey(),
|
||||||
|
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
|
||||||
},
|
},
|
||||||
params,
|
params,
|
||||||
timeout: 10000
|
timeout: 10000
|
||||||
}
|
}
|
||||||
).catch((err) => {
|
).catch((err) => {
|
||||||
console.error('error: ', err)
|
console.error('error: ', err)
|
||||||
|
if (err.response === undefined) {
|
||||||
|
throw new Error(`Network Error: ${err.message as string}`)
|
||||||
|
}
|
||||||
return err.response
|
return err.response
|
||||||
})
|
})
|
||||||
if (response.status !== 200) {
|
if (response.status !== 200) {
|
||||||
@@ -128,6 +135,9 @@ export class HttpLancedbClient {
|
|||||||
}
|
}
|
||||||
).catch((err) => {
|
).catch((err) => {
|
||||||
console.error('error: ', err)
|
console.error('error: ', err)
|
||||||
|
if (err.response === undefined) {
|
||||||
|
throw new Error(`Network Error: ${err.message as string}`)
|
||||||
|
}
|
||||||
return err.response
|
return err.response
|
||||||
})
|
})
|
||||||
if (response.status !== 200) {
|
if (response.status !== 200) {
|
||||||
|
|||||||
@@ -14,7 +14,9 @@
|
|||||||
|
|
||||||
import {
|
import {
|
||||||
type EmbeddingFunction, type Table, type VectorIndexParams, type Connection,
|
type EmbeddingFunction, type Table, type VectorIndexParams, type Connection,
|
||||||
type ConnectionOptions, type CreateTableOptions, type WriteOptions
|
type ConnectionOptions, type CreateTableOptions, type VectorIndex,
|
||||||
|
type WriteOptions,
|
||||||
|
type IndexStats
|
||||||
} from '../index'
|
} from '../index'
|
||||||
import { Query } from '../query'
|
import { Query } from '../query'
|
||||||
|
|
||||||
@@ -235,10 +237,28 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async countRows (): Promise<number> {
|
async countRows (): Promise<number> {
|
||||||
throw new Error('Not implemented')
|
const result = await this._client.post(`/v1/table/${this._name}/describe/`)
|
||||||
|
return result.data?.stats?.num_rows
|
||||||
}
|
}
|
||||||
|
|
||||||
async delete (filter: string): Promise<void> {
|
async delete (filter: string): Promise<void> {
|
||||||
await this._client.post(`/v1/table/${this._name}/delete/`, { predicate: filter })
|
await this._client.post(`/v1/table/${this._name}/delete/`, { predicate: filter })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async listIndices (): Promise<VectorIndex[]> {
|
||||||
|
const results = await this._client.post(`/v1/table/${this._name}/index/list/`)
|
||||||
|
return results.data.indexes?.map((index: any) => ({
|
||||||
|
columns: index.columns,
|
||||||
|
name: index.index_name,
|
||||||
|
uuid: index.index_uuid
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async indexStats (indexUuid: string): Promise<IndexStats> {
|
||||||
|
const results = await this._client.post(`/v1/table/${this._name}/index/${indexUuid}/stats/`)
|
||||||
|
return {
|
||||||
|
numIndexedRows: results.data.num_indexed_rows,
|
||||||
|
numUnindexedRows: results.data.num_unindexed_rows
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -328,6 +328,24 @@ describe('LanceDB client', function () {
|
|||||||
const createIndex = table.createIndex({ type: 'ivf_pq', column: 'name', num_partitions: -1, max_iters: 2, num_sub_vectors: 2 })
|
const createIndex = table.createIndex({ type: 'ivf_pq', column: 'name', num_partitions: -1, max_iters: 2, num_sub_vectors: 2 })
|
||||||
await expect(createIndex).to.be.rejectedWith('num_partitions: must be > 0')
|
await expect(createIndex).to.be.rejectedWith('num_partitions: must be > 0')
|
||||||
})
|
})
|
||||||
|
|
||||||
|
it('should be able to list index and stats', async function () {
|
||||||
|
const uri = await createTestDB(32, 300)
|
||||||
|
const con = await lancedb.connect(uri)
|
||||||
|
const table = await con.openTable('vectors')
|
||||||
|
await table.createIndex({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2, num_sub_vectors: 2 })
|
||||||
|
|
||||||
|
const indices = await table.listIndices()
|
||||||
|
expect(indices).to.have.lengthOf(1)
|
||||||
|
expect(indices[0].name).to.equal('vector_idx')
|
||||||
|
expect(indices[0].uuid).to.not.be.equal(undefined)
|
||||||
|
expect(indices[0].columns).to.have.lengthOf(1)
|
||||||
|
expect(indices[0].columns[0]).to.equal('vector')
|
||||||
|
|
||||||
|
const stats = await table.indexStats(indices[0].uuid)
|
||||||
|
expect(stats.numIndexedRows).to.equal(300)
|
||||||
|
expect(stats.numUnindexedRows).to.equal(0)
|
||||||
|
}).timeout(50_000)
|
||||||
})
|
})
|
||||||
|
|
||||||
describe('when using a custom embedding function', function () {
|
describe('when using a custom embedding function', function () {
|
||||||
@@ -378,6 +396,40 @@ describe('LanceDB client', function () {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
describe('Remote LanceDB client', function () {
|
||||||
|
describe('when the server is not reachable', function () {
|
||||||
|
it('produces a network error', async function () {
|
||||||
|
const con = await lancedb.connect({
|
||||||
|
uri: 'db://test-1234',
|
||||||
|
region: 'asdfasfasfdf',
|
||||||
|
apiKey: 'some-api-key'
|
||||||
|
})
|
||||||
|
|
||||||
|
// GET
|
||||||
|
try {
|
||||||
|
await con.tableNames()
|
||||||
|
} catch (err) {
|
||||||
|
expect(err).to.have.property('message', 'Network Error: getaddrinfo ENOTFOUND test-1234.asdfasfasfdf.api.lancedb.com')
|
||||||
|
}
|
||||||
|
|
||||||
|
// POST
|
||||||
|
try {
|
||||||
|
await con.createTable({ name: 'vectors', schema: new Schema([]) })
|
||||||
|
} catch (err) {
|
||||||
|
expect(err).to.have.property('message', 'Network Error: getaddrinfo ENOTFOUND test-1234.asdfasfasfdf.api.lancedb.com')
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search
|
||||||
|
const table = await con.openTable('vectors')
|
||||||
|
try {
|
||||||
|
await table.search([0.1, 0.3]).execute()
|
||||||
|
} catch (err) {
|
||||||
|
expect(err).to.have.property('message', 'Network Error: getaddrinfo ENOTFOUND test-1234.asdfasfasfdf.api.lancedb.com')
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
describe('Query object', function () {
|
describe('Query object', function () {
|
||||||
it('sets custom parameters', async function () {
|
it('sets custom parameters', async function () {
|
||||||
const query = new Query([0.1, 0.3])
|
const query = new Query([0.1, 0.3])
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[bumpversion]
|
[bumpversion]
|
||||||
current_version = 0.3.2
|
current_version = 0.3.3
|
||||||
commit = True
|
commit = True
|
||||||
message = [python] Bump version: {current_version} → {new_version}
|
message = [python] Bump version: {current_version} → {new_version}
|
||||||
tag = True
|
tag = True
|
||||||
|
|||||||
@@ -16,10 +16,13 @@ from typing import Optional
|
|||||||
|
|
||||||
__version__ = importlib.metadata.version("lancedb")
|
__version__ = importlib.metadata.version("lancedb")
|
||||||
|
|
||||||
from .db import URI, DBConnection, LanceDBConnection
|
from .common import URI
|
||||||
|
from .db import DBConnection, LanceDBConnection
|
||||||
from .remote.db import RemoteDBConnection
|
from .remote.db import RemoteDBConnection
|
||||||
from .schema import vector
|
from .schema import vector # noqa: F401
|
||||||
from .utils import sentry_log
|
from .utils import sentry_log # noqa: F401
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
def connect(
|
def connect(
|
||||||
@@ -69,3 +72,26 @@ def connect(
|
|||||||
raise ValueError(f"api_key is required to connected LanceDB cloud: {uri}")
|
raise ValueError(f"api_key is required to connected LanceDB cloud: {uri}")
|
||||||
return RemoteDBConnection(uri, api_key, region, host_override)
|
return RemoteDBConnection(uri, api_key, region, host_override)
|
||||||
return LanceDBConnection(uri)
|
return LanceDBConnection(uri)
|
||||||
|
|
||||||
|
def drop_database(uri: URI, api_key: str, region: str = "us-west-2"):
|
||||||
|
"""Drop a LanceDB database.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
uri: str or Path
|
||||||
|
The uri of the database.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
>>> lancedb.drop_database(uri="db://", api_key="sk_...", region="...")
|
||||||
|
|
||||||
|
"""
|
||||||
|
if isinstance(uri, str) and uri.startswith("db://"):
|
||||||
|
control_plane_url = f"control-plane.{region}.api.lancedb.com"
|
||||||
|
requests.delete(
|
||||||
|
f"https://{control_plane_url}/api/v1/auth/token/delete",
|
||||||
|
json={"api_key": api_key}
|
||||||
|
)
|
||||||
|
return LanceDBConnection(uri).drop_database()
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
@@ -38,3 +40,26 @@ class MockTextEmbeddingFunction(TextEmbeddingFunction):
|
|||||||
|
|
||||||
def ndims(self):
|
def ndims(self):
|
||||||
return 10
|
return 10
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimitedAPI:
|
||||||
|
rate_limit = 0.1 # 1 request per 0.1 second
|
||||||
|
last_request_time = 0
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_request():
|
||||||
|
current_time = time.time()
|
||||||
|
|
||||||
|
if current_time - RateLimitedAPI.last_request_time < RateLimitedAPI.rate_limit:
|
||||||
|
raise Exception("Rate limit exceeded. Please try again later.")
|
||||||
|
|
||||||
|
# Simulate a successful request
|
||||||
|
RateLimitedAPI.last_request_time = current_time
|
||||||
|
return "Request successful"
|
||||||
|
|
||||||
|
|
||||||
|
@registry.register("test-rate-limited")
|
||||||
|
class MockRateLimitedEmbeddingFunction(MockTextEmbeddingFunction):
|
||||||
|
def generate_embeddings(self, texts):
|
||||||
|
RateLimitedAPI.make_request()
|
||||||
|
return [self._compute_one_embedding(row) for row in texts]
|
||||||
|
|||||||
@@ -84,7 +84,9 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
|
|||||||
context windows that don't cross document boundaries. In this case, we can
|
context windows that don't cross document boundaries. In this case, we can
|
||||||
pass ``document_id`` as the group by.
|
pass ``document_id`` as the group by.
|
||||||
|
|
||||||
>>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_pandas()
|
>>> (contextualize(data)
|
||||||
|
... .window(4).stride(2).text_col('token').groupby('document_id')
|
||||||
|
... .to_pandas())
|
||||||
token document_id
|
token document_id
|
||||||
0 The quick brown fox 1
|
0 The quick brown fox 1
|
||||||
2 brown fox jumped over 1
|
2 brown fox jumped over 1
|
||||||
@@ -92,18 +94,24 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
|
|||||||
6 the lazy dog 1
|
6 the lazy dog 1
|
||||||
9 I love sandwiches 2
|
9 I love sandwiches 2
|
||||||
|
|
||||||
``min_window_size`` determines the minimum size of the context windows that are generated
|
``min_window_size`` determines the minimum size of the context windows
|
||||||
This can be used to trim the last few context windows which have size less than
|
that are generated. This can be used to trim the last few context windows
|
||||||
``min_window_size``. By default context windows of size 1 are skipped.
|
which have size less than ``min_window_size``.
|
||||||
|
By default context windows of size 1 are skipped.
|
||||||
|
|
||||||
>>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_pandas()
|
>>> (contextualize(data)
|
||||||
|
... .window(6).stride(3).text_col('token').groupby('document_id')
|
||||||
|
... .to_pandas())
|
||||||
token document_id
|
token document_id
|
||||||
0 The quick brown fox jumped over 1
|
0 The quick brown fox jumped over 1
|
||||||
3 fox jumped over the lazy dog 1
|
3 fox jumped over the lazy dog 1
|
||||||
6 the lazy dog 1
|
6 the lazy dog 1
|
||||||
9 I love sandwiches 2
|
9 I love sandwiches 2
|
||||||
|
|
||||||
>>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_pandas()
|
>>> (contextualize(data)
|
||||||
|
... .window(6).stride(3).min_window_size(4).text_col('token')
|
||||||
|
... .groupby('document_id')
|
||||||
|
... .to_pandas())
|
||||||
token document_id
|
token document_id
|
||||||
0 The quick brown fox jumped over 1
|
0 The quick brown fox jumped over 1
|
||||||
3 fox jumped over the lazy dog 1
|
3 fox jumped over the lazy dog 1
|
||||||
@@ -113,7 +121,9 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
|
|||||||
|
|
||||||
|
|
||||||
class Contextualizer:
|
class Contextualizer:
|
||||||
"""Create context windows from a DataFrame. See [lancedb.context.contextualize][]."""
|
"""Create context windows from a DataFrame.
|
||||||
|
See [lancedb.context.contextualize][].
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, raw_df):
|
def __init__(self, raw_df):
|
||||||
self._text_col = None
|
self._text_col = None
|
||||||
@@ -183,7 +193,7 @@ class Contextualizer:
|
|||||||
deprecated_in="0.3.1",
|
deprecated_in="0.3.1",
|
||||||
removed_in="0.4.0",
|
removed_in="0.4.0",
|
||||||
current_version=__version__,
|
current_version=__version__,
|
||||||
details="Use the bar function instead",
|
details="Use to_pandas() instead",
|
||||||
)
|
)
|
||||||
def to_df(self) -> "pd.DataFrame":
|
def to_df(self) -> "pd.DataFrame":
|
||||||
return self.to_pandas()
|
return self.to_pandas()
|
||||||
|
|||||||
@@ -14,26 +14,39 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from abc import ABC, abstractmethod
|
from abc import abstractmethod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Union
|
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||||
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
from overrides import EnforceOverrides, override
|
||||||
from pyarrow import fs
|
from pyarrow import fs
|
||||||
|
|
||||||
from .common import DATA, URI
|
|
||||||
from .embeddings import EmbeddingFunctionConfig
|
|
||||||
from .pydantic import LanceModel
|
|
||||||
from .table import LanceTable, Table
|
from .table import LanceTable, Table
|
||||||
from .util import fs_from_uri, get_uri_location, get_uri_scheme
|
from .util import fs_from_uri, get_uri_location, get_uri_scheme
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .common import DATA, URI
|
||||||
|
from .embeddings import EmbeddingFunctionConfig
|
||||||
|
from .pydantic import LanceModel
|
||||||
|
|
||||||
class DBConnection(ABC):
|
|
||||||
|
class DBConnection(EnforceOverrides):
|
||||||
"""An active LanceDB connection interface."""
|
"""An active LanceDB connection interface."""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def table_names(self) -> list[str]:
|
def table_names(
|
||||||
"""List all table names in the database."""
|
self, page_token: Optional[str] = None, limit: int = 10
|
||||||
|
) -> Iterable[str]:
|
||||||
|
"""List all tables in this database
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
page_token: str, optional
|
||||||
|
The token to use for pagination. If not present, start from the beginning.
|
||||||
|
limit: int, default 10
|
||||||
|
The size of the page to return.
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@@ -45,6 +58,7 @@ class DBConnection(ABC):
|
|||||||
mode: str = "create",
|
mode: str = "create",
|
||||||
on_bad_vectors: str = "error",
|
on_bad_vectors: str = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
|
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||||
) -> Table:
|
) -> Table:
|
||||||
"""Create a [Table][lancedb.table.Table] in the database.
|
"""Create a [Table][lancedb.table.Table] in the database.
|
||||||
|
|
||||||
@@ -52,12 +66,24 @@ class DBConnection(ABC):
|
|||||||
----------
|
----------
|
||||||
name: str
|
name: str
|
||||||
The name of the table.
|
The name of the table.
|
||||||
data: list, tuple, dict, pd.DataFrame; optional
|
data: The data to initialize the table, *optional*
|
||||||
The data to initialize the table. User must provide at least one of `data` or `schema`.
|
User must provide at least one of `data` or `schema`.
|
||||||
schema: pyarrow.Schema or LanceModel; optional
|
Acceptable types are:
|
||||||
The schema of the table.
|
|
||||||
|
- dict or list-of-dict
|
||||||
|
|
||||||
|
- pandas.DataFrame
|
||||||
|
|
||||||
|
- pyarrow.Table or pyarrow.RecordBatch
|
||||||
|
schema: The schema of the table, *optional*
|
||||||
|
Acceptable types are:
|
||||||
|
|
||||||
|
- pyarrow.Schema
|
||||||
|
|
||||||
|
- [LanceModel][lancedb.pydantic.LanceModel]
|
||||||
mode: str; default "create"
|
mode: str; default "create"
|
||||||
The mode to use when creating the table. Can be either "create" or "overwrite".
|
The mode to use when creating the table.
|
||||||
|
Can be either "create" or "overwrite".
|
||||||
By default, if the table already exists, an exception is raised.
|
By default, if the table already exists, an exception is raised.
|
||||||
If you want to overwrite the table, use mode="overwrite".
|
If you want to overwrite the table, use mode="overwrite".
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
@@ -150,7 +176,8 @@ class DBConnection(ABC):
|
|||||||
... for i in range(5):
|
... for i in range(5):
|
||||||
... yield pa.RecordBatch.from_arrays(
|
... yield pa.RecordBatch.from_arrays(
|
||||||
... [
|
... [
|
||||||
... pa.array([[3.1, 4.1], [5.9, 26.5]], pa.list_(pa.float32(), 2)),
|
... pa.array([[3.1, 4.1], [5.9, 26.5]],
|
||||||
|
... pa.list_(pa.float32(), 2)),
|
||||||
... pa.array(["foo", "bar"]),
|
... pa.array(["foo", "bar"]),
|
||||||
... pa.array([10.0, 20.0]),
|
... pa.array([10.0, 20.0]),
|
||||||
... ],
|
... ],
|
||||||
@@ -249,12 +276,15 @@ class LanceDBConnection(DBConnection):
|
|||||||
def uri(self) -> str:
|
def uri(self) -> str:
|
||||||
return self._uri
|
return self._uri
|
||||||
|
|
||||||
def table_names(self) -> list[str]:
|
@override
|
||||||
"""Get the names of all tables in the database.
|
def table_names(
|
||||||
|
self, page_token: Optional[str] = None, limit: int = 10
|
||||||
|
) -> Iterable[str]:
|
||||||
|
"""Get the names of all tables in the database. The names are sorted.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
list of str
|
Iterator of str.
|
||||||
A list of table names.
|
A list of table names.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
@@ -274,6 +304,7 @@ class LanceDBConnection(DBConnection):
|
|||||||
for file_info in paths
|
for file_info in paths
|
||||||
if file_info.extension == "lance"
|
if file_info.extension == "lance"
|
||||||
]
|
]
|
||||||
|
tables.sort()
|
||||||
return tables
|
return tables
|
||||||
|
|
||||||
def __len__(self) -> int:
|
def __len__(self) -> int:
|
||||||
@@ -282,6 +313,7 @@ class LanceDBConnection(DBConnection):
|
|||||||
def __contains__(self, name: str) -> bool:
|
def __contains__(self, name: str) -> bool:
|
||||||
return name in self.table_names()
|
return name in self.table_names()
|
||||||
|
|
||||||
|
@override
|
||||||
def create_table(
|
def create_table(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
@@ -313,6 +345,7 @@ class LanceDBConnection(DBConnection):
|
|||||||
)
|
)
|
||||||
return tbl
|
return tbl
|
||||||
|
|
||||||
|
@override
|
||||||
def open_table(self, name: str) -> LanceTable:
|
def open_table(self, name: str) -> LanceTable:
|
||||||
"""Open a table in the database.
|
"""Open a table in the database.
|
||||||
|
|
||||||
@@ -327,6 +360,7 @@ class LanceDBConnection(DBConnection):
|
|||||||
"""
|
"""
|
||||||
return LanceTable.open(self, name)
|
return LanceTable.open(self, name)
|
||||||
|
|
||||||
|
@override
|
||||||
def drop_table(self, name: str, ignore_missing: bool = False):
|
def drop_table(self, name: str, ignore_missing: bool = False):
|
||||||
"""Drop a table from the database.
|
"""Drop a table from the database.
|
||||||
|
|
||||||
@@ -345,6 +379,7 @@ class LanceDBConnection(DBConnection):
|
|||||||
if not ignore_missing:
|
if not ignore_missing:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
@override
|
||||||
def drop_database(self):
|
def drop_database(self):
|
||||||
filesystem, path = fs_from_uri(self.uri)
|
filesystem, path = fs_from_uri(self.uri)
|
||||||
filesystem.delete_dir(path)
|
filesystem.delete_dir(path)
|
||||||
|
|||||||
@@ -11,8 +11,10 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
# ruff: noqa: F401
|
||||||
from .base import EmbeddingFunction, EmbeddingFunctionConfig, TextEmbeddingFunction
|
from .base import EmbeddingFunction, EmbeddingFunctionConfig, TextEmbeddingFunction
|
||||||
from .cohere import CohereEmbeddingFunction
|
from .cohere import CohereEmbeddingFunction
|
||||||
|
from .instructor import InstructorEmbeddingFunction
|
||||||
from .open_clip import OpenClipEmbeddings
|
from .open_clip import OpenClipEmbeddings
|
||||||
from .openai import OpenAIEmbeddings
|
from .openai import OpenAIEmbeddings
|
||||||
from .registry import EmbeddingFunctionRegistry, get_registry
|
from .registry import EmbeddingFunctionRegistry, get_registry
|
||||||
|
|||||||
@@ -1,3 +1,15 @@
|
|||||||
|
# Copyright (c) 2023. LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
import importlib
|
import importlib
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
@@ -6,7 +18,7 @@ import numpy as np
|
|||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
from pydantic import BaseModel, Field, PrivateAttr
|
from pydantic import BaseModel, Field, PrivateAttr
|
||||||
|
|
||||||
from .utils import TEXT
|
from .utils import TEXT, retry_with_exponential_backoff
|
||||||
|
|
||||||
|
|
||||||
class EmbeddingFunction(BaseModel, ABC):
|
class EmbeddingFunction(BaseModel, ABC):
|
||||||
@@ -21,6 +33,10 @@ class EmbeddingFunction(BaseModel, ABC):
|
|||||||
3. ndims method which returns the number of dimensions of the vector column
|
3. ndims method which returns the number of dimensions of the vector column
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
__slots__ = ("__weakref__",) # pydantic 1.x compatibility
|
||||||
|
max_retries: int = (
|
||||||
|
7 # Setting 0 disables retries. Maybe this should not be enabled by default.
|
||||||
|
)
|
||||||
_ndims: int = PrivateAttr()
|
_ndims: int = PrivateAttr()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -44,6 +60,25 @@ class EmbeddingFunction(BaseModel, ABC):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def compute_query_embeddings_with_retry(self, *args, **kwargs) -> List[np.array]:
|
||||||
|
"""
|
||||||
|
Compute the embeddings for a given user query with retries
|
||||||
|
"""
|
||||||
|
return retry_with_exponential_backoff(
|
||||||
|
self.compute_query_embeddings, max_retries=self.max_retries
|
||||||
|
)(
|
||||||
|
*args,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
def compute_source_embeddings_with_retry(self, *args, **kwargs) -> List[np.array]:
|
||||||
|
"""
|
||||||
|
Compute the embeddings for the source column in the database with retries
|
||||||
|
"""
|
||||||
|
return retry_with_exponential_backoff(
|
||||||
|
self.compute_source_embeddings, max_retries=self.max_retries
|
||||||
|
)(*args, **kwargs)
|
||||||
|
|
||||||
def sanitize_input(self, texts: TEXT) -> Union[List[str], np.ndarray]:
|
def sanitize_input(self, texts: TEXT) -> Union[List[str], np.ndarray]:
|
||||||
"""
|
"""
|
||||||
Sanitize the input to the embedding function.
|
Sanitize the input to the embedding function.
|
||||||
@@ -103,6 +138,14 @@ class EmbeddingFunction(BaseModel, ABC):
|
|||||||
"""
|
"""
|
||||||
return Field(json_schema_extra={"vector_column_for": self}, **kwargs)
|
return Field(json_schema_extra={"vector_column_for": self}, **kwargs)
|
||||||
|
|
||||||
|
def __eq__(self, __value: object) -> bool:
|
||||||
|
if not hasattr(__value, "__dict__"):
|
||||||
|
return False
|
||||||
|
return vars(self) == vars(__value)
|
||||||
|
|
||||||
|
def __hash__(self) -> int:
|
||||||
|
return hash(frozenset(vars(self).items()))
|
||||||
|
|
||||||
|
|
||||||
class EmbeddingFunctionConfig(BaseModel):
|
class EmbeddingFunctionConfig(BaseModel):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ class CohereEmbeddingFunction(TextEmbeddingFunction):
|
|||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
name: str, default "embed-multilingual-v2.0"
|
name: str, default "embed-multilingual-v2.0"
|
||||||
The name of the model to use. See the Cohere documentation for a list of available models.
|
The name of the model to use. See the Cohere documentation for
|
||||||
|
a list of available models.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
@@ -39,7 +40,10 @@ class CohereEmbeddingFunction(TextEmbeddingFunction):
|
|||||||
from lancedb.pydantic import LanceModel, Vector
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
from lancedb.embeddings import EmbeddingFunctionRegistry
|
from lancedb.embeddings import EmbeddingFunctionRegistry
|
||||||
|
|
||||||
cohere = EmbeddingFunctionRegistry.get_instance().get("cohere").create(name="embed-multilingual-v2.0")
|
cohere = EmbeddingFunctionRegistry
|
||||||
|
.get_instance()
|
||||||
|
.get("cohere")
|
||||||
|
.create(name="embed-multilingual-v2.0")
|
||||||
|
|
||||||
class TextModel(LanceModel):
|
class TextModel(LanceModel):
|
||||||
text: str = cohere.SourceField()
|
text: str = cohere.SourceField()
|
||||||
|
|||||||
137
python/lancedb/embeddings/instructor.py
Normal file
137
python/lancedb/embeddings/instructor.py
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
# Copyright (c) 2023. LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from .base import TextEmbeddingFunction
|
||||||
|
from .registry import register
|
||||||
|
from .utils import TEXT, weak_lru
|
||||||
|
|
||||||
|
|
||||||
|
@register("instructor")
|
||||||
|
class InstructorEmbeddingFunction(TextEmbeddingFunction):
|
||||||
|
"""
|
||||||
|
An embedding function that uses the InstructorEmbedding library. Instructor models support multi-task learning, and can be used for a
|
||||||
|
variety of tasks, including text classification, sentence similarity, and document retrieval.
|
||||||
|
If you want to calculate customized embeddings for specific sentences, you may follow the unified template to write instructions:
|
||||||
|
"Represent the `domain` `text_type` for `task_objective`":
|
||||||
|
|
||||||
|
* domain is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
|
||||||
|
* text_type is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
|
||||||
|
* task_objective is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc.
|
||||||
|
|
||||||
|
For example, if you want to calculate embeddings for a document, you may write the instruction as follows:
|
||||||
|
"Represent the document for retrieval"
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
name: str
|
||||||
|
The name of the model to use. Available models are listed at https://github.com/xlang-ai/instructor-embedding#model-list;
|
||||||
|
The default model is hkunlp/instructor-base
|
||||||
|
batch_size: int, default 32
|
||||||
|
The batch size to use when generating embeddings
|
||||||
|
device: str, default "cpu"
|
||||||
|
The device to use when generating embeddings
|
||||||
|
show_progress_bar: bool, default True
|
||||||
|
Whether to show a progress bar when generating embeddings
|
||||||
|
normalize_embeddings: bool, default True
|
||||||
|
Whether to normalize the embeddings
|
||||||
|
quantize: bool, default False
|
||||||
|
Whether to quantize the model
|
||||||
|
source_instruction: str, default "represent the document for retrieval"
|
||||||
|
The instruction for the source column
|
||||||
|
query_instruction: str, default "represent the document for retrieving the most similar documents"
|
||||||
|
The instruction for the query
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
import lancedb
|
||||||
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
|
from lancedb.embeddings import get_registry, InstructorEmbeddingFunction
|
||||||
|
|
||||||
|
instructor = get_registry().get("instructor").create(
|
||||||
|
source_instruction="represent the document for retrieval",
|
||||||
|
query_instruction="represent the document for retrieving the most similar documents"
|
||||||
|
)
|
||||||
|
|
||||||
|
class Schema(LanceModel):
|
||||||
|
vector: Vector(instructor.ndims()) = instructor.VectorField()
|
||||||
|
text: str = instructor.SourceField()
|
||||||
|
|
||||||
|
db = lancedb.connect("~/.lancedb")
|
||||||
|
tbl = db.create_table("test", schema=Schema, mode="overwrite")
|
||||||
|
|
||||||
|
texts = [{"text": "Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] that..."},
|
||||||
|
{"text": "The disparate impact theory is especially controversial under the Fair Housing Act because the Act..."},
|
||||||
|
{"text": "Disparate impact in United States labor law refers to practices in employment, housing, and other areas that.."}]
|
||||||
|
|
||||||
|
tbl.add(texts)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: str = "hkunlp/instructor-base"
|
||||||
|
batch_size: int = 32
|
||||||
|
device: str = "cpu"
|
||||||
|
show_progress_bar: bool = True
|
||||||
|
normalize_embeddings: bool = True
|
||||||
|
quantize: bool = False
|
||||||
|
# convert_to_numpy: bool = True # Hardcoding this as numpy can be ingested directly
|
||||||
|
|
||||||
|
source_instruction: str = "represent the document for retrieval"
|
||||||
|
query_instruction: str = (
|
||||||
|
"represent the document for retrieving the most similar documents"
|
||||||
|
)
|
||||||
|
|
||||||
|
@weak_lru(maxsize=1)
|
||||||
|
def ndims(self):
|
||||||
|
model = self.get_model()
|
||||||
|
return model.encode("foo").shape[0]
|
||||||
|
|
||||||
|
def compute_query_embeddings(self, query: str, *args, **kwargs) -> List[np.array]:
|
||||||
|
return self.generate_embeddings([[self.query_instruction, query]])
|
||||||
|
|
||||||
|
def compute_source_embeddings(self, texts: TEXT, *args, **kwargs) -> List[np.array]:
|
||||||
|
texts = self.sanitize_input(texts)
|
||||||
|
texts_formatted = []
|
||||||
|
for text in texts:
|
||||||
|
texts_formatted.append([self.source_instruction, text])
|
||||||
|
return self.generate_embeddings(texts_formatted)
|
||||||
|
|
||||||
|
def generate_embeddings(self, texts: List) -> List:
|
||||||
|
model = self.get_model()
|
||||||
|
res = model.encode(
|
||||||
|
texts,
|
||||||
|
batch_size=self.batch_size,
|
||||||
|
show_progress_bar=self.show_progress_bar,
|
||||||
|
normalize_embeddings=self.normalize_embeddings,
|
||||||
|
).tolist()
|
||||||
|
return res
|
||||||
|
|
||||||
|
@weak_lru(maxsize=1)
|
||||||
|
def get_model(self):
|
||||||
|
instructor_embedding = self.safe_import(
|
||||||
|
"InstructorEmbedding", "InstructorEmbedding"
|
||||||
|
)
|
||||||
|
torch = self.safe_import("torch", "torch")
|
||||||
|
|
||||||
|
model = instructor_embedding.INSTRUCTOR(self.name)
|
||||||
|
if self.quantize:
|
||||||
|
if (
|
||||||
|
"qnnpack" in torch.backends.quantized.supported_engines
|
||||||
|
): # fix for https://github.com/pytorch/pytorch/issues/29327
|
||||||
|
torch.backends.quantized.engine = "qnnpack"
|
||||||
|
model = torch.quantization.quantize_dynamic(
|
||||||
|
model, {torch.nn.Linear}, dtype=torch.qint8
|
||||||
|
)
|
||||||
|
return model
|
||||||
@@ -1,3 +1,15 @@
|
|||||||
|
# Copyright (c) 2023. LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
|||||||
@@ -1,3 +1,15 @@
|
|||||||
|
# Copyright (c) 2023. LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@@ -1,3 +1,15 @@
|
|||||||
|
# Copyright (c) 2023. LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -5,6 +17,7 @@ from cachetools import cached
|
|||||||
|
|
||||||
from .base import TextEmbeddingFunction
|
from .base import TextEmbeddingFunction
|
||||||
from .registry import register
|
from .registry import register
|
||||||
|
from .utils import weak_lru
|
||||||
|
|
||||||
|
|
||||||
@register("sentence-transformers")
|
@register("sentence-transformers")
|
||||||
@@ -30,7 +43,7 @@ class SentenceTransformerEmbeddings(TextEmbeddingFunction):
|
|||||||
name and device. This is cached so that the model is only loaded
|
name and device. This is cached so that the model is only loaded
|
||||||
once per process.
|
once per process.
|
||||||
"""
|
"""
|
||||||
return self.__class__.get_embedding_model(self.name, self.device)
|
return self.get_embedding_model()
|
||||||
|
|
||||||
def ndims(self):
|
def ndims(self):
|
||||||
if self._ndims is None:
|
if self._ndims is None:
|
||||||
@@ -54,9 +67,8 @@ class SentenceTransformerEmbeddings(TextEmbeddingFunction):
|
|||||||
normalize_embeddings=self.normalize,
|
normalize_embeddings=self.normalize,
|
||||||
).tolist()
|
).tolist()
|
||||||
|
|
||||||
@classmethod
|
@weak_lru(maxsize=1)
|
||||||
@cached(cache={})
|
def get_embedding_model(self):
|
||||||
def get_embedding_model(cls, name, device):
|
|
||||||
"""
|
"""
|
||||||
Get the sentence-transformers embedding model specified by the
|
Get the sentence-transformers embedding model specified by the
|
||||||
name and device. This is cached so that the model is only loaded
|
name and device. This is cached so that the model is only loaded
|
||||||
@@ -71,7 +83,7 @@ class SentenceTransformerEmbeddings(TextEmbeddingFunction):
|
|||||||
|
|
||||||
TODO: use lru_cache instead with a reasonable/configurable maxsize
|
TODO: use lru_cache instead with a reasonable/configurable maxsize
|
||||||
"""
|
"""
|
||||||
sentence_transformers = cls.safe_import(
|
sentence_transformers = self.safe_import(
|
||||||
"sentence_transformers", "sentence-transformers"
|
"sentence_transformers", "sentence-transformers"
|
||||||
)
|
)
|
||||||
return sentence_transformers.SentenceTransformer(name, device=device)
|
return sentence_transformers.SentenceTransformer(self.name, device=self.device)
|
||||||
|
|||||||
@@ -11,10 +11,14 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import functools
|
||||||
import math
|
import math
|
||||||
|
import random
|
||||||
import socket
|
import socket
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import urllib.error
|
import urllib.error
|
||||||
|
import weakref
|
||||||
from typing import Callable, List, Union
|
from typing import Callable, List, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -162,6 +166,99 @@ class FunctionWrapper:
|
|||||||
yield from _chunker(arr)
|
yield from _chunker(arr)
|
||||||
|
|
||||||
|
|
||||||
|
def weak_lru(maxsize=128):
|
||||||
|
"""
|
||||||
|
LRU cache that keeps weak references to the objects it caches. Only caches the latest instance of the objects to make sure memory usage
|
||||||
|
is bounded.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
maxsize : int, default 128
|
||||||
|
The maximum number of objects to cache.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Callable
|
||||||
|
A decorator that can be applied to a method.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> class Foo:
|
||||||
|
... @weak_lru()
|
||||||
|
... def bar(self, x):
|
||||||
|
... return x
|
||||||
|
>>> foo = Foo()
|
||||||
|
>>> foo.bar(1)
|
||||||
|
1
|
||||||
|
>>> foo.bar(2)
|
||||||
|
2
|
||||||
|
>>> foo.bar(1)
|
||||||
|
1
|
||||||
|
"""
|
||||||
|
|
||||||
|
def wrapper(func):
|
||||||
|
@functools.lru_cache(maxsize)
|
||||||
|
def _func(_self, *args, **kwargs):
|
||||||
|
return func(_self(), *args, **kwargs)
|
||||||
|
|
||||||
|
@functools.wraps(func)
|
||||||
|
def inner(self, *args, **kwargs):
|
||||||
|
return _func(weakref.ref(self), *args, **kwargs)
|
||||||
|
|
||||||
|
return inner
|
||||||
|
|
||||||
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
def retry_with_exponential_backoff(
|
||||||
|
func,
|
||||||
|
initial_delay: float = 1,
|
||||||
|
exponential_base: float = 2,
|
||||||
|
jitter: bool = True,
|
||||||
|
max_retries: int = 7,
|
||||||
|
# errors: tuple = (),
|
||||||
|
):
|
||||||
|
"""Retry a function with exponential backoff.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
func (function): The function to be retried.
|
||||||
|
initial_delay (float): Initial delay in seconds (default is 1).
|
||||||
|
exponential_base (float): The base for exponential backoff (default is 2).
|
||||||
|
jitter (bool): Whether to add jitter to the delay (default is True).
|
||||||
|
max_retries (int): Maximum number of retries (default is 7).
|
||||||
|
errors (tuple): Tuple of specific exceptions to retry on (default is (openai.error.RateLimitError,)).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
function: The decorated function.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def wrapper(*args, **kwargs):
|
||||||
|
num_retries = 0
|
||||||
|
delay = initial_delay
|
||||||
|
|
||||||
|
# Loop until a successful response or max_retries is hit or an exception is raised
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
return func(*args, **kwargs)
|
||||||
|
|
||||||
|
# Currently retrying on all exceptions as there is no way to know the format of the error msgs used by different APIs
|
||||||
|
# We'll log the error and say that it is assumed that if this portion errors out, it's due to rate limit but the user
|
||||||
|
# should check the error message to be sure
|
||||||
|
except Exception as e:
|
||||||
|
num_retries += 1
|
||||||
|
|
||||||
|
if num_retries > max_retries:
|
||||||
|
raise Exception(
|
||||||
|
f"Maximum number of retries ({max_retries}) exceeded."
|
||||||
|
)
|
||||||
|
|
||||||
|
delay *= exponential_base * (1 + jitter * random.random())
|
||||||
|
LOGGER.info(f"Retrying in {delay:.2f} seconds due to {e}")
|
||||||
|
time.sleep(delay)
|
||||||
|
|
||||||
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
def url_retrieve(url: str):
|
def url_retrieve(url: str):
|
||||||
"""
|
"""
|
||||||
Parameters
|
Parameters
|
||||||
|
|||||||
@@ -14,7 +14,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import List, Literal, Optional, Type, Union
|
from typing import TYPE_CHECKING, List, Literal, Optional, Type, Union
|
||||||
|
|
||||||
import deprecation
|
import deprecation
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -23,14 +23,49 @@ import pydantic
|
|||||||
|
|
||||||
from . import __version__
|
from . import __version__
|
||||||
from .common import VECTOR_COLUMN_NAME
|
from .common import VECTOR_COLUMN_NAME
|
||||||
from .pydantic import LanceModel
|
|
||||||
from .util import safe_import_pandas
|
from .util import safe_import_pandas
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .pydantic import LanceModel
|
||||||
|
|
||||||
pd = safe_import_pandas()
|
pd = safe_import_pandas()
|
||||||
|
|
||||||
|
|
||||||
class Query(pydantic.BaseModel):
|
class Query(pydantic.BaseModel):
|
||||||
"""A Query"""
|
"""The LanceDB Query
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
vector : List[float]
|
||||||
|
the vector to search for
|
||||||
|
filter : Optional[str]
|
||||||
|
sql filter to refine the query with, optional
|
||||||
|
prefilter : bool
|
||||||
|
if True then apply the filter before vector search
|
||||||
|
k : int
|
||||||
|
top k results to return
|
||||||
|
metric : str
|
||||||
|
the distance metric between a pair of vectors,
|
||||||
|
|
||||||
|
can support L2 (default), Cosine and Dot.
|
||||||
|
[metric definitions][search]
|
||||||
|
columns : Optional[List[str]]
|
||||||
|
which columns to return in the results
|
||||||
|
nprobes : int
|
||||||
|
The number of probes used - optional
|
||||||
|
|
||||||
|
- A higher number makes search more accurate but also slower.
|
||||||
|
|
||||||
|
- See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
||||||
|
tuning advice.
|
||||||
|
refine_factor : Optional[int]
|
||||||
|
Refine the results by reading extra elements and re-ranking them in memory - optional
|
||||||
|
|
||||||
|
- A higher number makes search more accurate but also slower.
|
||||||
|
|
||||||
|
- See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
||||||
|
tuning advice.
|
||||||
|
"""
|
||||||
|
|
||||||
vector_column: str = VECTOR_COLUMN_NAME
|
vector_column: str = VECTOR_COLUMN_NAME
|
||||||
|
|
||||||
@@ -61,6 +96,10 @@ class Query(pydantic.BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class LanceQueryBuilder(ABC):
|
class LanceQueryBuilder(ABC):
|
||||||
|
"""Build LanceDB query based on specific query type:
|
||||||
|
vector or full text search.
|
||||||
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create(
|
def create(
|
||||||
cls,
|
cls,
|
||||||
@@ -103,7 +142,7 @@ class LanceQueryBuilder(ABC):
|
|||||||
if not isinstance(query, (list, np.ndarray)):
|
if not isinstance(query, (list, np.ndarray)):
|
||||||
conf = table.embedding_functions.get(vector_column_name)
|
conf = table.embedding_functions.get(vector_column_name)
|
||||||
if conf is not None:
|
if conf is not None:
|
||||||
query = conf.function.compute_query_embeddings(query)[0]
|
query = conf.function.compute_query_embeddings_with_retry(query)[0]
|
||||||
else:
|
else:
|
||||||
msg = f"No embedding function for {vector_column_name}"
|
msg = f"No embedding function for {vector_column_name}"
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
@@ -114,7 +153,7 @@ class LanceQueryBuilder(ABC):
|
|||||||
else:
|
else:
|
||||||
conf = table.embedding_functions.get(vector_column_name)
|
conf = table.embedding_functions.get(vector_column_name)
|
||||||
if conf is not None:
|
if conf is not None:
|
||||||
query = conf.function.compute_query_embeddings(query)[0]
|
query = conf.function.compute_query_embeddings_with_retry(query)[0]
|
||||||
return query, "vector"
|
return query, "vector"
|
||||||
else:
|
else:
|
||||||
return query, "fts"
|
return query, "fts"
|
||||||
@@ -133,11 +172,11 @@ class LanceQueryBuilder(ABC):
|
|||||||
deprecated_in="0.3.1",
|
deprecated_in="0.3.1",
|
||||||
removed_in="0.4.0",
|
removed_in="0.4.0",
|
||||||
current_version=__version__,
|
current_version=__version__,
|
||||||
details="Use the bar function instead",
|
details="Use to_pandas() instead",
|
||||||
)
|
)
|
||||||
def to_df(self) -> "pd.DataFrame":
|
def to_df(self) -> "pd.DataFrame":
|
||||||
"""
|
"""
|
||||||
Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.
|
*Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.*
|
||||||
|
|
||||||
Execute the query and return the results as a pandas DataFrame.
|
Execute the query and return the results as a pandas DataFrame.
|
||||||
In addition to the selected columns, LanceDB also returns a vector
|
In addition to the selected columns, LanceDB also returns a vector
|
||||||
@@ -226,13 +265,20 @@ class LanceQueryBuilder(ABC):
|
|||||||
self._columns = columns
|
self._columns = columns
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def where(self, where) -> LanceQueryBuilder:
|
def where(self, where: str, prefilter: bool = False) -> LanceQueryBuilder:
|
||||||
"""Set the where clause.
|
"""Set the where clause.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
where: str
|
where: str
|
||||||
The where clause.
|
The where clause which is a valid SQL where clause. See
|
||||||
|
`Lance filter pushdown <https://lancedb.github.io/lance/read_and_write.html#filter-push-down>`_
|
||||||
|
for valid SQL expressions.
|
||||||
|
prefilter: bool, default False
|
||||||
|
If True, apply the filter before vector search, otherwise the
|
||||||
|
filter is applied on the result of vector search.
|
||||||
|
This feature is **EXPERIMENTAL** and may be removed and modified
|
||||||
|
without warning in the future.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -240,13 +286,12 @@ class LanceQueryBuilder(ABC):
|
|||||||
The LanceQueryBuilder object.
|
The LanceQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
self._where = where
|
self._where = where
|
||||||
|
self._prefilter = prefilter
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
class LanceVectorQueryBuilder(LanceQueryBuilder):
|
class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||||
"""
|
"""
|
||||||
A builder for nearest neighbor queries for LanceDB.
|
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> import lancedb
|
>>> import lancedb
|
||||||
@@ -302,7 +347,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
Higher values will yield better recall (more likely to find vectors if
|
Higher values will yield better recall (more likely to find vectors if
|
||||||
they exist) at the expense of latency.
|
they exist) at the expense of latency.
|
||||||
|
|
||||||
See discussion in [Querying an ANN Index][../querying-an-ann-index] for
|
See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
||||||
tuning advice.
|
tuning advice.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
@@ -369,14 +414,14 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
where: str
|
where: str
|
||||||
The where clause.
|
The where clause which is a valid SQL where clause. See
|
||||||
|
`Lance filter pushdown <https://lancedb.github.io/lance/read_and_write.html#filter-push-down>`_
|
||||||
|
for valid SQL expressions.
|
||||||
prefilter: bool, default False
|
prefilter: bool, default False
|
||||||
If True, apply the filter before vector search, otherwise the
|
If True, apply the filter before vector search, otherwise the
|
||||||
filter is applied on the result of vector search.
|
filter is applied on the result of vector search.
|
||||||
This feature is **EXPERIMENTAL** and may be removed and modified
|
This feature is **EXPERIMENTAL** and may be removed and modified
|
||||||
without warning in the future. Currently this is only supported
|
without warning in the future.
|
||||||
in OSS and can only be used with a table that does not have an ANN
|
|
||||||
index.
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -389,6 +434,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
|
|
||||||
|
|
||||||
class LanceFtsQueryBuilder(LanceQueryBuilder):
|
class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||||
|
"""A builder for full text search for LanceDB."""
|
||||||
|
|
||||||
def __init__(self, table: "lancedb.table.Table", query: str):
|
def __init__(self, table: "lancedb.table.Table", query: str):
|
||||||
super().__init__(table)
|
super().__init__(table)
|
||||||
self._query = query
|
self._query = query
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
from typing import Any, Callable, Dict, Optional, Union
|
from typing import Any, Callable, Dict, Iterable, Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import attrs
|
import attrs
|
||||||
@@ -151,15 +151,14 @@ class RestfulLanceDBClient:
|
|||||||
return await deserialize(resp)
|
return await deserialize(resp)
|
||||||
|
|
||||||
@_check_not_closed
|
@_check_not_closed
|
||||||
async def list_tables(self, limit: int, page_token: str):
|
async def list_tables(
|
||||||
|
self, limit: int, page_token: Optional[str] = None
|
||||||
|
) -> Iterable[str]:
|
||||||
"""List all tables in the database."""
|
"""List all tables in the database."""
|
||||||
try:
|
if page_token is None:
|
||||||
json = await self.get(
|
page_token = ""
|
||||||
"/v1/table/", {"limit": limit, "page_token": page_token}
|
json = await self.get("/v1/table/", {"limit": limit, "page_token": page_token})
|
||||||
)
|
return json["tables"]
|
||||||
return json["tables"]
|
|
||||||
except StopAsyncIteration:
|
|
||||||
return []
|
|
||||||
|
|
||||||
@_check_not_closed
|
@_check_not_closed
|
||||||
async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
|
async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
|
||||||
|
|||||||
@@ -12,14 +12,19 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import inspect
|
||||||
|
import logging
|
||||||
import uuid
|
import uuid
|
||||||
from typing import Iterator, Optional
|
from typing import Iterable, List, Optional, Union
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
from overrides import override
|
||||||
|
|
||||||
from ..common import DATA
|
from ..common import DATA
|
||||||
from ..db import DBConnection
|
from ..db import DBConnection
|
||||||
|
from ..embeddings import EmbeddingFunctionConfig
|
||||||
|
from ..pydantic import LanceModel
|
||||||
from ..table import Table, _sanitize_data
|
from ..table import Table, _sanitize_data
|
||||||
from .arrow import to_ipc_binary
|
from .arrow import to_ipc_binary
|
||||||
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
|
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
|
||||||
@@ -52,11 +57,13 @@ class RemoteDBConnection(DBConnection):
|
|||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f"RemoveConnect(name={self.db_name})"
|
return f"RemoveConnect(name={self.db_name})"
|
||||||
|
|
||||||
def table_names(self, last_token: str, limit=10) -> Iterator[str]:
|
@override
|
||||||
|
def table_names(self, page_token: Optional[str] = None, limit=10) -> Iterable[str]:
|
||||||
"""List the names of all tables in the database.
|
"""List the names of all tables in the database.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
last_token: str
|
page_token: str
|
||||||
The last token to start the new page.
|
The last token to start the new page.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -65,15 +72,16 @@ class RemoteDBConnection(DBConnection):
|
|||||||
"""
|
"""
|
||||||
while True:
|
while True:
|
||||||
result = self._loop.run_until_complete(
|
result = self._loop.run_until_complete(
|
||||||
self._client.list_tables(limit, last_token)
|
self._client.list_tables(limit, page_token)
|
||||||
)
|
)
|
||||||
if len(result) > 0:
|
if len(result) > 0:
|
||||||
last_token = result[len(result) - 1]
|
page_token = result[len(result) - 1]
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
for item in result:
|
for item in result:
|
||||||
yield result
|
yield item
|
||||||
|
|
||||||
|
@override
|
||||||
def open_table(self, name: str) -> Table:
|
def open_table(self, name: str) -> Table:
|
||||||
"""Open a Lance Table in the database.
|
"""Open a Lance Table in the database.
|
||||||
|
|
||||||
@@ -88,23 +96,50 @@ class RemoteDBConnection(DBConnection):
|
|||||||
"""
|
"""
|
||||||
from .table import RemoteTable
|
from .table import RemoteTable
|
||||||
|
|
||||||
# TODO: check if table exists
|
# check if table exists
|
||||||
|
try:
|
||||||
|
self._loop.run_until_complete(
|
||||||
|
self._client.post(f"/v1/table/{name}/describe/")
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logging.error(
|
||||||
|
"Table {name} does not exist."
|
||||||
|
"Please first call db.create_table({name}, data)"
|
||||||
|
)
|
||||||
return RemoteTable(self, name)
|
return RemoteTable(self, name)
|
||||||
|
|
||||||
|
@override
|
||||||
def create_table(
|
def create_table(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
data: DATA = None,
|
data: DATA = None,
|
||||||
schema: pa.Schema = None,
|
schema: Optional[Union[pa.Schema, LanceModel]] = None,
|
||||||
on_bad_vectors: str = "error",
|
on_bad_vectors: str = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
|
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||||
) -> Table:
|
) -> Table:
|
||||||
if data is None and schema is None:
|
if data is None and schema is None:
|
||||||
raise ValueError("Either data or schema must be provided.")
|
raise ValueError("Either data or schema must be provided.")
|
||||||
|
if embedding_functions is not None:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"embedding_functions is not supported for remote databases."
|
||||||
|
"Please vote https://github.com/lancedb/lancedb/issues/626 "
|
||||||
|
"for this feature."
|
||||||
|
)
|
||||||
|
|
||||||
|
if inspect.isclass(schema) and issubclass(schema, LanceModel):
|
||||||
|
# convert LanceModel to pyarrow schema
|
||||||
|
# note that it's possible this contains
|
||||||
|
# embedding function metadata already
|
||||||
|
schema = schema.to_arrow_schema()
|
||||||
|
|
||||||
if data is not None:
|
if data is not None:
|
||||||
data = _sanitize_data(
|
data = _sanitize_data(
|
||||||
data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
data,
|
||||||
|
schema,
|
||||||
|
metadata=None,
|
||||||
|
on_bad_vectors=on_bad_vectors,
|
||||||
|
fill_value=fill_value,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if schema is None:
|
if schema is None:
|
||||||
@@ -126,6 +161,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
)
|
)
|
||||||
return RemoteTable(self, name)
|
return RemoteTable(self, name)
|
||||||
|
|
||||||
|
@override
|
||||||
def drop_table(self, name: str):
|
def drop_table(self, name: str):
|
||||||
"""Drop a table from the database.
|
"""Drop a table from the database.
|
||||||
|
|
||||||
|
|||||||
@@ -44,6 +44,14 @@ class RemoteTable(Table):
|
|||||||
schema = json_to_schema(resp["schema"])
|
schema = json_to_schema(resp["schema"])
|
||||||
return schema
|
return schema
|
||||||
|
|
||||||
|
@property
|
||||||
|
def version(self) -> int:
|
||||||
|
"""Get the current version of the table"""
|
||||||
|
resp = self._conn._loop.run_until_complete(
|
||||||
|
self._conn._client.post(f"/v1/table/{self._name}/describe/")
|
||||||
|
)
|
||||||
|
return resp["version"]
|
||||||
|
|
||||||
def to_arrow(self) -> pa.Table:
|
def to_arrow(self) -> pa.Table:
|
||||||
"""Return the table as an Arrow table."""
|
"""Return the table as an Arrow table."""
|
||||||
raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")
|
raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")
|
||||||
@@ -99,8 +107,6 @@ class RemoteTable(Table):
|
|||||||
return LanceVectorQueryBuilder(self, query, vector_column_name)
|
return LanceVectorQueryBuilder(self, query, vector_column_name)
|
||||||
|
|
||||||
def _execute_query(self, query: Query) -> pa.Table:
|
def _execute_query(self, query: Query) -> pa.Table:
|
||||||
if query.prefilter:
|
|
||||||
raise NotImplementedError("Cloud support for prefiltering is coming soon")
|
|
||||||
result = self._conn._client.query(self._name, query)
|
result = self._conn._client.query(self._name, query)
|
||||||
return self._conn._loop.run_until_complete(result).to_arrow()
|
return self._conn._loop.run_until_complete(result).to_arrow()
|
||||||
|
|
||||||
|
|||||||
@@ -16,16 +16,14 @@ from __future__ import annotations
|
|||||||
import inspect
|
import inspect
|
||||||
import os
|
import os
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from datetime import timedelta
|
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import Any, Iterable, List, Optional, Union
|
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Union
|
||||||
|
|
||||||
import lance
|
import lance
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pyarrow.compute as pc
|
import pyarrow.compute as pc
|
||||||
from lance import LanceDataset
|
from lance import LanceDataset
|
||||||
from lance.dataset import CleanupStats, ReaderLike
|
|
||||||
from lance.vector import vec_to_table
|
from lance.vector import vec_to_table
|
||||||
|
|
||||||
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||||
@@ -35,6 +33,12 @@ from .query import LanceQueryBuilder, Query
|
|||||||
from .util import fs_from_uri, safe_import_pandas
|
from .util import fs_from_uri, safe_import_pandas
|
||||||
from .utils.events import register_event
|
from .utils.events import register_event
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
from lance.dataset import CleanupStats, ReaderLike
|
||||||
|
|
||||||
|
|
||||||
pd = safe_import_pandas()
|
pd = safe_import_pandas()
|
||||||
|
|
||||||
|
|
||||||
@@ -86,7 +90,9 @@ def _append_vector_col(data: pa.Table, metadata: dict, schema: Optional[pa.Schem
|
|||||||
for vector_column, conf in functions.items():
|
for vector_column, conf in functions.items():
|
||||||
func = conf.function
|
func = conf.function
|
||||||
if vector_column not in data.column_names:
|
if vector_column not in data.column_names:
|
||||||
col_data = func.compute_source_embeddings(data[conf.source_column])
|
col_data = func.compute_source_embeddings_with_retry(
|
||||||
|
data[conf.source_column]
|
||||||
|
)
|
||||||
if schema is not None:
|
if schema is not None:
|
||||||
dtype = schema.field(vector_column).type
|
dtype = schema.field(vector_column).type
|
||||||
else:
|
else:
|
||||||
@@ -149,13 +155,13 @@ class Table(ABC):
|
|||||||
@property
|
@property
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def schema(self) -> pa.Schema:
|
def schema(self) -> pa.Schema:
|
||||||
"""The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#) of
|
"""The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
|
||||||
this Table
|
of this Table
|
||||||
|
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def to_pandas(self):
|
def to_pandas(self) -> "pd.DataFrame":
|
||||||
"""Return the table as a pandas DataFrame.
|
"""Return the table as a pandas DataFrame.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -191,17 +197,18 @@ class Table(ABC):
|
|||||||
The distance metric to use when creating the index.
|
The distance metric to use when creating the index.
|
||||||
Valid values are "L2", "cosine", or "dot".
|
Valid values are "L2", "cosine", or "dot".
|
||||||
L2 is euclidean distance.
|
L2 is euclidean distance.
|
||||||
num_partitions: int
|
num_partitions: int, default 256
|
||||||
The number of IVF partitions to use when creating the index.
|
The number of IVF partitions to use when creating the index.
|
||||||
Default is 256.
|
Default is 256.
|
||||||
num_sub_vectors: int
|
num_sub_vectors: int, default 96
|
||||||
The number of PQ sub-vectors to use when creating the index.
|
The number of PQ sub-vectors to use when creating the index.
|
||||||
Default is 96.
|
Default is 96.
|
||||||
vector_column_name: str, default "vector"
|
vector_column_name: str, default "vector"
|
||||||
The vector column name to create the index.
|
The vector column name to create the index.
|
||||||
replace: bool, default True
|
replace: bool, default True
|
||||||
If True, replace the existing index if it exists.
|
- If True, replace the existing index if it exists.
|
||||||
If False, raise an error if duplicate index exists.
|
|
||||||
|
- If False, raise an error if duplicate index exists.
|
||||||
accelerator: str, default None
|
accelerator: str, default None
|
||||||
If set, use the given accelerator to create the index.
|
If set, use the given accelerator to create the index.
|
||||||
Only support "cuda" for now.
|
Only support "cuda" for now.
|
||||||
@@ -220,8 +227,14 @@ class Table(ABC):
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
data: list-of-dict, dict, pd.DataFrame
|
data: DATA
|
||||||
The data to insert into the table.
|
The data to insert into the table. Acceptable types are:
|
||||||
|
|
||||||
|
- dict or list-of-dict
|
||||||
|
|
||||||
|
- pandas.DataFrame
|
||||||
|
|
||||||
|
- pyarrow.Table or pyarrow.RecordBatch
|
||||||
mode: str
|
mode: str
|
||||||
The mode to use when writing the data. Valid values are
|
The mode to use when writing the data. Valid values are
|
||||||
"append" and "overwrite".
|
"append" and "overwrite".
|
||||||
@@ -242,31 +255,70 @@ class Table(ABC):
|
|||||||
query_type: str = "auto",
|
query_type: str = "auto",
|
||||||
) -> LanceQueryBuilder:
|
) -> LanceQueryBuilder:
|
||||||
"""Create a search query to find the nearest neighbors
|
"""Create a search query to find the nearest neighbors
|
||||||
of the given query vector.
|
of the given query vector. We currently support [vector search][search]
|
||||||
|
and [full-text search][experimental-full-text-search].
|
||||||
|
|
||||||
|
All query options are defined in [Query][lancedb.query.Query].
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> import lancedb
|
||||||
|
>>> db = lancedb.connect("./.lancedb")
|
||||||
|
>>> data = [
|
||||||
|
... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
|
||||||
|
... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]},
|
||||||
|
... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
|
||||||
|
... ]
|
||||||
|
>>> table = db.create_table("my_table", data)
|
||||||
|
>>> query = [0.4, 1.4, 2.4]
|
||||||
|
>>> (table.search(query, vector_column_name="vector")
|
||||||
|
... .where("original_width > 1000", prefilter=True)
|
||||||
|
... .select(["caption", "original_width"])
|
||||||
|
... .limit(2)
|
||||||
|
... .to_pandas())
|
||||||
|
caption original_width vector _distance
|
||||||
|
0 foo 2000 [0.5, 3.4, 1.3] 5.220000
|
||||||
|
1 test 3000 [0.3, 6.2, 2.6] 23.089996
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
query: str, list, np.ndarray, PIL.Image.Image, default None
|
query: list/np.ndarray/str/PIL.Image.Image, default None
|
||||||
The query to search for. If None then
|
The targeted vector to search for.
|
||||||
the select/where/limit clauses are applied to filter
|
|
||||||
|
- *default None*.
|
||||||
|
Acceptable types are: list, np.ndarray, PIL.Image.Image
|
||||||
|
|
||||||
|
- If None then the select/where/limit clauses are applied to filter
|
||||||
the table
|
the table
|
||||||
vector_column_name: str, default "vector"
|
vector_column_name: str
|
||||||
The name of the vector column to search.
|
The name of the vector column to search.
|
||||||
query_type: str, default "auto"
|
*default "vector"*
|
||||||
"vector", "fts", or "auto"
|
query_type: str
|
||||||
If "auto" then the query type is inferred from the query;
|
*default "auto"*.
|
||||||
If `query` is a list/np.ndarray then the query type is "vector";
|
Acceptable types are: "vector", "fts", or "auto"
|
||||||
If `query` is a PIL.Image.Image then either do vector search
|
|
||||||
or raise an error if no corresponding embedding function is found.
|
- If "auto" then the query type is inferred from the query;
|
||||||
If `query` is a string, then the query type is "vector" if the
|
|
||||||
|
- If `query` is a list/np.ndarray then the query type is
|
||||||
|
"vector";
|
||||||
|
|
||||||
|
- If `query` is a PIL.Image.Image then either do vector search,
|
||||||
|
or raise an error if no corresponding embedding function is found.
|
||||||
|
|
||||||
|
- If `query` is a string, then the query type is "vector" if the
|
||||||
table has embedding functions else the query type is "fts"
|
table has embedding functions else the query type is "fts"
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
LanceQueryBuilder
|
LanceQueryBuilder
|
||||||
A query builder object representing the query.
|
A query builder object representing the query.
|
||||||
Once executed, the query returns selected columns, the vector,
|
Once executed, the query returns
|
||||||
and also the "_distance" column which is the distance between the query
|
|
||||||
|
- selected columns
|
||||||
|
|
||||||
|
- the vector
|
||||||
|
|
||||||
|
- and also the "_distance" column which is the distance between the query
|
||||||
vector and the returned vector.
|
vector and the returned vector.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
@@ -285,14 +337,19 @@ class Table(ABC):
|
|||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
where: str
|
where: str
|
||||||
The SQL where clause to use when deleting rows. For example, 'x = 2'
|
The SQL where clause to use when deleting rows.
|
||||||
or 'x IN (1, 2, 3)'. The filter must not be empty, or it will error.
|
|
||||||
|
- For example, 'x = 2' or 'x IN (1, 2, 3)'.
|
||||||
|
|
||||||
|
The filter must not be empty, or it will error.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> import lancedb
|
>>> import lancedb
|
||||||
>>> data = [
|
>>> data = [
|
||||||
... {"x": 1, "vector": [1, 2]}, {"x": 2, "vector": [3, 4]}, {"x": 3, "vector": [5, 6]}
|
... {"x": 1, "vector": [1, 2]},
|
||||||
|
... {"x": 2, "vector": [3, 4]},
|
||||||
|
... {"x": 3, "vector": [5, 6]}
|
||||||
... ]
|
... ]
|
||||||
>>> db = lancedb.connect("./.lancedb")
|
>>> db = lancedb.connect("./.lancedb")
|
||||||
>>> table = db.create_table("my_table", data)
|
>>> table = db.create_table("my_table", data)
|
||||||
@@ -377,7 +434,8 @@ class LanceTable(Table):
|
|||||||
--------
|
--------
|
||||||
>>> import lancedb
|
>>> import lancedb
|
||||||
>>> db = lancedb.connect("./.lancedb")
|
>>> db = lancedb.connect("./.lancedb")
|
||||||
>>> table = db.create_table("my_table", [{"vector": [1.1, 0.9], "type": "vector"}])
|
>>> table = db.create_table("my_table",
|
||||||
|
... [{"vector": [1.1, 0.9], "type": "vector"}])
|
||||||
>>> table.version
|
>>> table.version
|
||||||
2
|
2
|
||||||
>>> table.to_pandas()
|
>>> table.to_pandas()
|
||||||
@@ -424,7 +482,8 @@ class LanceTable(Table):
|
|||||||
--------
|
--------
|
||||||
>>> import lancedb
|
>>> import lancedb
|
||||||
>>> db = lancedb.connect("./.lancedb")
|
>>> db = lancedb.connect("./.lancedb")
|
||||||
>>> table = db.create_table("my_table", [{"vector": [1.1, 0.9], "type": "vector"}])
|
>>> table = db.create_table("my_table", [
|
||||||
|
... {"vector": [1.1, 0.9], "type": "vector"}])
|
||||||
>>> table.version
|
>>> table.version
|
||||||
2
|
2
|
||||||
>>> table.to_pandas()
|
>>> table.to_pandas()
|
||||||
@@ -669,14 +728,39 @@ class LanceTable(Table):
|
|||||||
query_type: str = "auto",
|
query_type: str = "auto",
|
||||||
) -> LanceQueryBuilder:
|
) -> LanceQueryBuilder:
|
||||||
"""Create a search query to find the nearest neighbors
|
"""Create a search query to find the nearest neighbors
|
||||||
of the given query vector.
|
of the given query vector. We currently support [vector search][search]
|
||||||
|
and [full-text search][search].
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> import lancedb
|
||||||
|
>>> db = lancedb.connect("./.lancedb")
|
||||||
|
>>> data = [
|
||||||
|
... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
|
||||||
|
... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]},
|
||||||
|
... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
|
||||||
|
... ]
|
||||||
|
>>> table = db.create_table("my_table", data)
|
||||||
|
>>> query = [0.4, 1.4, 2.4]
|
||||||
|
>>> (table.search(query, vector_column_name="vector")
|
||||||
|
... .where("original_width > 1000", prefilter=True)
|
||||||
|
... .select(["caption", "original_width"])
|
||||||
|
... .limit(2)
|
||||||
|
... .to_pandas())
|
||||||
|
caption original_width vector _distance
|
||||||
|
0 foo 2000 [0.5, 3.4, 1.3] 5.220000
|
||||||
|
1 test 3000 [0.3, 6.2, 2.6] 23.089996
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
query: str, list, np.ndarray, a PIL Image or None
|
query: list/np.ndarray/str/PIL.Image.Image, default None
|
||||||
The query to search for. If None then
|
The targetted vector to search for.
|
||||||
the select/where/limit clauses are applied to filter
|
|
||||||
the table
|
- *default None*.
|
||||||
|
Acceptable types are: list, np.ndarray, PIL.Image.Image
|
||||||
|
|
||||||
|
- If None then the select/[where][sql]/limit clauses are applied
|
||||||
|
to filter the table
|
||||||
vector_column_name: str, default "vector"
|
vector_column_name: str, default "vector"
|
||||||
The name of the vector column to search.
|
The name of the vector column to search.
|
||||||
query_type: str, default "auto"
|
query_type: str, default "auto"
|
||||||
@@ -685,7 +769,7 @@ class LanceTable(Table):
|
|||||||
If `query` is a list/np.ndarray then the query type is "vector";
|
If `query` is a list/np.ndarray then the query type is "vector";
|
||||||
If `query` is a PIL.Image.Image then either do vector search
|
If `query` is a PIL.Image.Image then either do vector search
|
||||||
or raise an error if no corresponding embedding function is found.
|
or raise an error if no corresponding embedding function is found.
|
||||||
If the query is a string, then the query type is "vector" if the
|
If the `query` is a string, then the query type is "vector" if the
|
||||||
table has embedding functions, else the query type is "fts"
|
table has embedding functions, else the query type is "fts"
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -720,7 +804,9 @@ class LanceTable(Table):
|
|||||||
--------
|
--------
|
||||||
>>> import lancedb
|
>>> import lancedb
|
||||||
>>> data = [
|
>>> data = [
|
||||||
... {"x": 1, "vector": [1, 2]}, {"x": 2, "vector": [3, 4]}, {"x": 3, "vector": [5, 6]}
|
... {"x": 1, "vector": [1, 2]},
|
||||||
|
... {"x": 2, "vector": [3, 4]},
|
||||||
|
... {"x": 3, "vector": [5, 6]}
|
||||||
... ]
|
... ]
|
||||||
>>> db = lancedb.connect("./.lancedb")
|
>>> db = lancedb.connect("./.lancedb")
|
||||||
>>> table = db.create_table("my_table", data)
|
>>> table = db.create_table("my_table", data)
|
||||||
@@ -740,7 +826,8 @@ class LanceTable(Table):
|
|||||||
The data to insert into the table.
|
The data to insert into the table.
|
||||||
At least one of `data` or `schema` must be provided.
|
At least one of `data` or `schema` must be provided.
|
||||||
schema: pa.Schema or LanceModel, optional
|
schema: pa.Schema or LanceModel, optional
|
||||||
The schema of the table. If not provided, the schema is inferred from the data.
|
The schema of the table. If not provided,
|
||||||
|
the schema is inferred from the data.
|
||||||
At least one of `data` or `schema` must be provided.
|
At least one of `data` or `schema` must be provided.
|
||||||
mode: str, default "create"
|
mode: str, default "create"
|
||||||
The mode to use when writing the data. Valid values are
|
The mode to use when writing the data. Valid values are
|
||||||
@@ -811,7 +898,8 @@ class LanceTable(Table):
|
|||||||
file_info = fs.get_file_info(path)
|
file_info = fs.get_file_info(path)
|
||||||
if file_info.type != pa.fs.FileType.Directory:
|
if file_info.type != pa.fs.FileType.Directory:
|
||||||
raise FileNotFoundError(
|
raise FileNotFoundError(
|
||||||
f"Table {name} does not exist. Please first call db.create_table({name}, data)"
|
f"Table {name} does not exist."
|
||||||
|
f"Please first call db.create_table({name}, data)"
|
||||||
)
|
)
|
||||||
return tbl
|
return tbl
|
||||||
|
|
||||||
@@ -838,7 +926,9 @@ class LanceTable(Table):
|
|||||||
--------
|
--------
|
||||||
>>> import lancedb
|
>>> import lancedb
|
||||||
>>> data = [
|
>>> data = [
|
||||||
... {"x": 1, "vector": [1, 2]}, {"x": 2, "vector": [3, 4]}, {"x": 3, "vector": [5, 6]}
|
... {"x": 1, "vector": [1, 2]},
|
||||||
|
... {"x": 2, "vector": [3, 4]},
|
||||||
|
... {"x": 3, "vector": [5, 6]}
|
||||||
... ]
|
... ]
|
||||||
>>> db = lancedb.connect("./.lancedb")
|
>>> db = lancedb.connect("./.lancedb")
|
||||||
>>> table = db.create_table("my_table", data)
|
>>> table = db.create_table("my_table", data)
|
||||||
@@ -872,12 +962,6 @@ class LanceTable(Table):
|
|||||||
|
|
||||||
def _execute_query(self, query: Query) -> pa.Table:
|
def _execute_query(self, query: Query) -> pa.Table:
|
||||||
ds = self.to_lance()
|
ds = self.to_lance()
|
||||||
if query.prefilter:
|
|
||||||
for idx in ds.list_indices():
|
|
||||||
if query.vector_column in idx["fields"]:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Prefiltering for indexed vector column is coming soon."
|
|
||||||
)
|
|
||||||
return ds.to_table(
|
return ds.to_table(
|
||||||
columns=query.columns,
|
columns=query.columns,
|
||||||
filter=query.filter,
|
filter=query.filter,
|
||||||
@@ -1019,7 +1103,8 @@ def _sanitize_vector_column(
|
|||||||
# ChunkedArray is annoying to work with, so we combine chunks here
|
# ChunkedArray is annoying to work with, so we combine chunks here
|
||||||
vec_arr = data[vector_column_name].combine_chunks()
|
vec_arr = data[vector_column_name].combine_chunks()
|
||||||
if pa.types.is_list(data[vector_column_name].type):
|
if pa.types.is_list(data[vector_column_name].type):
|
||||||
# if it's a variable size list array we make sure the dimensions are all the same
|
# if it's a variable size list array,
|
||||||
|
# we make sure the dimensions are all the same
|
||||||
has_jagged_ndims = len(vec_arr.values) % len(data) != 0
|
has_jagged_ndims = len(vec_arr.values) % len(data) != 0
|
||||||
if has_jagged_ndims:
|
if has_jagged_ndims:
|
||||||
data = _sanitize_jagged(
|
data = _sanitize_jagged(
|
||||||
|
|||||||
@@ -63,7 +63,8 @@ def set_sentry():
|
|||||||
"""
|
"""
|
||||||
if "exc_info" in hint:
|
if "exc_info" in hint:
|
||||||
exc_type, exc_value, tb = hint["exc_info"]
|
exc_type, exc_value, tb = hint["exc_info"]
|
||||||
if "out of memory" in str(exc_value).lower():
|
ignored_errors = ["out of memory", "no space left on device", "testing"]
|
||||||
|
if any(error in str(exc_value).lower() for error in ignored_errors):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if is_git_dir():
|
if is_git_dir():
|
||||||
@@ -97,7 +98,7 @@ def set_sentry():
|
|||||||
dsn="https://c63ef8c64e05d1aa1a96513361f3ca2f@o4505950840946688.ingest.sentry.io/4505950933614592",
|
dsn="https://c63ef8c64e05d1aa1a96513361f3ca2f@o4505950840946688.ingest.sentry.io/4505950933614592",
|
||||||
debug=False,
|
debug=False,
|
||||||
include_local_variables=False,
|
include_local_variables=False,
|
||||||
traces_sample_rate=1.0,
|
traces_sample_rate=0.5,
|
||||||
environment="production", # 'dev' or 'production'
|
environment="production", # 'dev' or 'production'
|
||||||
before_send=before_send,
|
before_send=before_send,
|
||||||
ignore_errors=[KeyboardInterrupt, FileNotFoundError, bdb.BdbQuit],
|
ignore_errors=[KeyboardInterrupt, FileNotFoundError, bdb.BdbQuit],
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.3.2"
|
version = "0.3.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"deprecation",
|
"deprecation",
|
||||||
"pylance==0.8.7",
|
"pylance==0.8.10",
|
||||||
"ratelimiter~=1.0",
|
"ratelimiter~=1.0",
|
||||||
"retry>=0.9.2",
|
"retry>=0.9.2",
|
||||||
"tqdm>=4.1.0",
|
"tqdm>=4.1.0",
|
||||||
@@ -14,7 +14,8 @@ dependencies = [
|
|||||||
"cachetools",
|
"cachetools",
|
||||||
"pyyaml>=6.0",
|
"pyyaml>=6.0",
|
||||||
"click>=8.1.7",
|
"click>=8.1.7",
|
||||||
"requests>=2.31.0"
|
"requests>=2.31.0",
|
||||||
|
"overrides>=0.7"
|
||||||
]
|
]
|
||||||
description = "lancedb"
|
description = "lancedb"
|
||||||
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
||||||
@@ -52,7 +53,7 @@ tests = ["pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "requests"]
|
|||||||
dev = ["ruff", "pre-commit", "black"]
|
dev = ["ruff", "pre-commit", "black"]
|
||||||
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
|
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
|
||||||
clip = ["torch", "pillow", "open-clip"]
|
clip = ["torch", "pillow", "open-clip"]
|
||||||
embeddings = ["openai", "sentence-transformers", "torch", "pillow", "open-clip-torch", "cohere"]
|
embeddings = ["openai", "sentence-transformers", "torch", "pillow", "open-clip-torch", "cohere", "InstructorEmbedding"]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
lancedb = "lancedb.cli.cli:cli"
|
lancedb = "lancedb.cli.cli:cli"
|
||||||
@@ -64,6 +65,9 @@ build-backend = "setuptools.build_meta"
|
|||||||
[tool.isort]
|
[tool.isort]
|
||||||
profile = "black"
|
profile = "black"
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
select = ["F", "E", "W", "I", "G", "TCH", "PERF"]
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
addopts = "--strict-markers"
|
addopts = "--strict-markers"
|
||||||
markers = [
|
markers = [
|
||||||
|
|||||||
@@ -129,7 +129,7 @@ def test_ingest_iterator(tmp_path):
|
|||||||
[
|
[
|
||||||
PydanticSchema(vector=[3.1, 4.1], item="foo", price=10.0),
|
PydanticSchema(vector=[3.1, 4.1], item="foo", price=10.0),
|
||||||
PydanticSchema(vector=[5.9, 26.5], item="bar", price=20.0),
|
PydanticSchema(vector=[5.9, 26.5], item="bar", price=20.0),
|
||||||
]
|
],
|
||||||
# TODO: test pydict separately. it is unique column number and names contraint
|
# TODO: test pydict separately. it is unique column number and names contraint
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -150,6 +150,21 @@ def test_ingest_iterator(tmp_path):
|
|||||||
run_tests(PydanticSchema)
|
run_tests(PydanticSchema)
|
||||||
|
|
||||||
|
|
||||||
|
def test_table_names(tmp_path):
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
data = pd.DataFrame(
|
||||||
|
{
|
||||||
|
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||||
|
"item": ["foo", "bar"],
|
||||||
|
"price": [10.0, 20.0],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
db.create_table("test2", data=data)
|
||||||
|
db.create_table("test1", data=data)
|
||||||
|
db.create_table("test3", data=data)
|
||||||
|
assert db.table_names() == ["test1", "test2", "test3"]
|
||||||
|
|
||||||
|
|
||||||
def test_create_mode(tmp_path):
|
def test_create_mode(tmp_path):
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
data = pd.DataFrame(
|
data = pd.DataFrame(
|
||||||
@@ -287,3 +302,27 @@ def test_replace_index(tmp_path):
|
|||||||
num_sub_vectors=4,
|
num_sub_vectors=4,
|
||||||
replace=True,
|
replace=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_prefilter_with_index(tmp_path):
|
||||||
|
db = lancedb.connect(uri=tmp_path)
|
||||||
|
data = [
|
||||||
|
{"vector": np.random.rand(128), "item": "foo", "price": float(i)}
|
||||||
|
for i in range(1000)
|
||||||
|
]
|
||||||
|
sample_key = data[100]["vector"]
|
||||||
|
table = db.create_table(
|
||||||
|
"test",
|
||||||
|
data,
|
||||||
|
)
|
||||||
|
table.create_index(
|
||||||
|
num_partitions=2,
|
||||||
|
num_sub_vectors=4,
|
||||||
|
)
|
||||||
|
table = (
|
||||||
|
table.search(sample_key)
|
||||||
|
.where("price == 500", prefilter=True)
|
||||||
|
.limit(5)
|
||||||
|
.to_arrow()
|
||||||
|
)
|
||||||
|
assert table.num_rows == 1
|
||||||
|
|||||||
@@ -15,13 +15,16 @@ import sys
|
|||||||
import lance
|
import lance
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
import pytest
|
||||||
|
|
||||||
from lancedb.conftest import MockTextEmbeddingFunction
|
import lancedb
|
||||||
|
from lancedb.conftest import MockRateLimitedEmbeddingFunction, MockTextEmbeddingFunction
|
||||||
from lancedb.embeddings import (
|
from lancedb.embeddings import (
|
||||||
EmbeddingFunctionConfig,
|
EmbeddingFunctionConfig,
|
||||||
EmbeddingFunctionRegistry,
|
EmbeddingFunctionRegistry,
|
||||||
with_embeddings,
|
with_embeddings,
|
||||||
)
|
)
|
||||||
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
|
|
||||||
|
|
||||||
def mock_embed_func(input_data):
|
def mock_embed_func(input_data):
|
||||||
@@ -83,3 +86,29 @@ def test_embedding_function(tmp_path):
|
|||||||
expected = func.compute_query_embeddings("hello world")
|
expected = func.compute_query_embeddings("hello world")
|
||||||
|
|
||||||
assert np.allclose(actual, expected)
|
assert np.allclose(actual, expected)
|
||||||
|
|
||||||
|
|
||||||
|
def test_embedding_function_rate_limit(tmp_path):
|
||||||
|
def _get_schema_from_model(model):
|
||||||
|
class Schema(LanceModel):
|
||||||
|
text: str = model.SourceField()
|
||||||
|
vector: Vector(model.ndims()) = model.VectorField()
|
||||||
|
|
||||||
|
return Schema
|
||||||
|
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
model = registry.get("test-rate-limited").create(max_retries=0)
|
||||||
|
schema = _get_schema_from_model(model)
|
||||||
|
table = db.create_table("test", schema=schema, mode="overwrite")
|
||||||
|
table.add([{"text": "hello world"}])
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
table.add([{"text": "hello world"}])
|
||||||
|
assert len(table) == 1
|
||||||
|
|
||||||
|
model = registry.get("test-rate-limited").create()
|
||||||
|
schema = _get_schema_from_model(model)
|
||||||
|
table = db.create_table("test", schema=schema, mode="overwrite")
|
||||||
|
table.add([{"text": "hello world"}])
|
||||||
|
table.add([{"text": "hello world"}])
|
||||||
|
assert len(table) == 2
|
||||||
|
|||||||
@@ -32,8 +32,8 @@ from lancedb.pydantic import LanceModel, Vector
|
|||||||
def test_sentence_transformer(alias, tmp_path):
|
def test_sentence_transformer(alias, tmp_path):
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
registry = get_registry()
|
registry = get_registry()
|
||||||
func = registry.get(alias).create()
|
func = registry.get(alias).create(max_retries=0)
|
||||||
func2 = registry.get(alias).create()
|
func2 = registry.get(alias).create(max_retries=0)
|
||||||
|
|
||||||
class Words(LanceModel):
|
class Words(LanceModel):
|
||||||
text: str = func.SourceField()
|
text: str = func.SourceField()
|
||||||
@@ -150,7 +150,11 @@ def test_openclip(tmp_path):
|
|||||||
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
|
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
|
||||||
) # also skip if cohere not installed
|
) # also skip if cohere not installed
|
||||||
def test_cohere_embedding_function():
|
def test_cohere_embedding_function():
|
||||||
cohere = get_registry().get("cohere").create(name="embed-multilingual-v2.0")
|
cohere = (
|
||||||
|
get_registry()
|
||||||
|
.get("cohere")
|
||||||
|
.create(name="embed-multilingual-v2.0", max_retries=0)
|
||||||
|
)
|
||||||
|
|
||||||
class TextModel(LanceModel):
|
class TextModel(LanceModel):
|
||||||
text: str = cohere.SourceField()
|
text: str = cohere.SourceField()
|
||||||
@@ -162,3 +166,19 @@ def test_cohere_embedding_function():
|
|||||||
|
|
||||||
tbl.add(df)
|
tbl.add(df)
|
||||||
assert len(tbl.to_pandas()["vector"][0]) == cohere.ndims()
|
assert len(tbl.to_pandas()["vector"][0]) == cohere.ndims()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_instructor_embedding(tmp_path):
|
||||||
|
model = get_registry().get("instructor").create()
|
||||||
|
|
||||||
|
class TextModel(LanceModel):
|
||||||
|
text: str = model.SourceField()
|
||||||
|
vector: Vector(model.ndims()) = model.VectorField()
|
||||||
|
|
||||||
|
df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||||
|
|
||||||
|
tbl.add(df)
|
||||||
|
assert len(tbl.to_pandas()["vector"][0]) == model.ndims()
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "vectordb-node"
|
name = "vectordb-node"
|
||||||
version = "0.3.3"
|
version = "0.3.7"
|
||||||
description = "Serverless, low-latency vector database for AI applications"
|
description = "Serverless, low-latency vector database for AI applications"
|
||||||
license = "Apache-2.0"
|
license = "Apache-2.0"
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|||||||
@@ -70,7 +70,6 @@ fn get_index_params_builder(
|
|||||||
.map(|mt| {
|
.map(|mt| {
|
||||||
let metric_type = mt.unwrap();
|
let metric_type = mt.unwrap();
|
||||||
index_builder.metric_type(metric_type);
|
index_builder.metric_type(metric_type);
|
||||||
pq_params.metric_type = metric_type;
|
|
||||||
});
|
});
|
||||||
|
|
||||||
let num_partitions = obj.get_opt_usize(cx, "num_partitions")?;
|
let num_partitions = obj.get_opt_usize(cx, "num_partitions")?;
|
||||||
|
|||||||
@@ -239,6 +239,8 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
|||||||
cx.export_function("tableDelete", JsTable::js_delete)?;
|
cx.export_function("tableDelete", JsTable::js_delete)?;
|
||||||
cx.export_function("tableCleanupOldVersions", JsTable::js_cleanup)?;
|
cx.export_function("tableCleanupOldVersions", JsTable::js_cleanup)?;
|
||||||
cx.export_function("tableCompactFiles", JsTable::js_compact)?;
|
cx.export_function("tableCompactFiles", JsTable::js_compact)?;
|
||||||
|
cx.export_function("tableListIndices", JsTable::js_list_indices)?;
|
||||||
|
cx.export_function("tableIndexStats", JsTable::js_index_stats)?;
|
||||||
cx.export_function(
|
cx.export_function(
|
||||||
"tableCreateVectorIndex",
|
"tableCreateVectorIndex",
|
||||||
index::vector::table_create_vector_index,
|
index::vector::table_create_vector_index,
|
||||||
|
|||||||
@@ -247,7 +247,7 @@ impl JsTable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
rt.spawn(async move {
|
rt.spawn(async move {
|
||||||
let stats = table.compact_files(options).await;
|
let stats = table.compact_files(options, None).await;
|
||||||
|
|
||||||
deferred.settle_with(&channel, move |mut cx| {
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
let stats = stats.or_throw(&mut cx)?;
|
let stats = stats.or_throw(&mut cx)?;
|
||||||
@@ -276,4 +276,91 @@ impl JsTable {
|
|||||||
});
|
});
|
||||||
Ok(promise)
|
Ok(promise)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn js_list_indices(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
|
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||||
|
let rt = runtime(&mut cx)?;
|
||||||
|
let (deferred, promise) = cx.promise();
|
||||||
|
// let predicate = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||||
|
let channel = cx.channel();
|
||||||
|
let table = js_table.table.clone();
|
||||||
|
|
||||||
|
rt.spawn(async move {
|
||||||
|
let indices = table.load_indices().await;
|
||||||
|
|
||||||
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
|
let indices = indices.or_throw(&mut cx)?;
|
||||||
|
|
||||||
|
let output = JsArray::new(&mut cx, indices.len() as u32);
|
||||||
|
for (i, index) in indices.iter().enumerate() {
|
||||||
|
let js_index = JsObject::new(&mut cx);
|
||||||
|
let index_name = cx.string(index.index_name.clone());
|
||||||
|
js_index.set(&mut cx, "name", index_name)?;
|
||||||
|
|
||||||
|
let index_uuid = cx.string(index.index_uuid.clone());
|
||||||
|
js_index.set(&mut cx, "uuid", index_uuid)?;
|
||||||
|
|
||||||
|
let js_index_columns = JsArray::new(&mut cx, index.columns.len() as u32);
|
||||||
|
for (j, column) in index.columns.iter().enumerate() {
|
||||||
|
let js_column = cx.string(column.clone());
|
||||||
|
js_index_columns.set(&mut cx, j as u32, js_column)?;
|
||||||
|
}
|
||||||
|
js_index.set(&mut cx, "columns", js_index_columns)?;
|
||||||
|
|
||||||
|
output.set(&mut cx, i as u32, js_index)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(output)
|
||||||
|
})
|
||||||
|
});
|
||||||
|
Ok(promise)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
|
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||||
|
let rt = runtime(&mut cx)?;
|
||||||
|
let (deferred, promise) = cx.promise();
|
||||||
|
let index_uuid = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||||
|
let channel = cx.channel();
|
||||||
|
let table = js_table.table.clone();
|
||||||
|
|
||||||
|
rt.spawn(async move {
|
||||||
|
let load_stats = futures::try_join!(
|
||||||
|
table.count_indexed_rows(&index_uuid),
|
||||||
|
table.count_unindexed_rows(&index_uuid)
|
||||||
|
);
|
||||||
|
|
||||||
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
|
let (indexed_rows, unindexed_rows) = load_stats.or_throw(&mut cx)?;
|
||||||
|
|
||||||
|
let output = JsObject::new(&mut cx);
|
||||||
|
|
||||||
|
match indexed_rows {
|
||||||
|
Some(x) => {
|
||||||
|
let i = cx.number(x as f64);
|
||||||
|
output.set(&mut cx, "numIndexedRows", i)?;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let null = cx.null();
|
||||||
|
output.set(&mut cx, "numIndexedRows", null)?;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match unindexed_rows {
|
||||||
|
Some(x) => {
|
||||||
|
let i = cx.number(x as f64);
|
||||||
|
output.set(&mut cx, "numUnindexedRows", i)?;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let null = cx.null();
|
||||||
|
output.set(&mut cx, "numUnindexedRows", null)?;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(output)
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(promise)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "vectordb"
|
name = "vectordb"
|
||||||
version = "0.3.3"
|
version = "0.3.7"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license = "Apache-2.0"
|
license = "Apache-2.0"
|
||||||
|
|||||||
@@ -161,7 +161,7 @@ impl Database {
|
|||||||
///
|
///
|
||||||
/// * A [Vec<String>] with all table names.
|
/// * A [Vec<String>] with all table names.
|
||||||
pub async fn table_names(&self) -> Result<Vec<String>> {
|
pub async fn table_names(&self) -> Result<Vec<String>> {
|
||||||
let f = self
|
let mut f = self
|
||||||
.object_store
|
.object_store
|
||||||
.read_dir(self.base_path.clone())
|
.read_dir(self.base_path.clone())
|
||||||
.await?
|
.await?
|
||||||
@@ -175,7 +175,8 @@ impl Database {
|
|||||||
is_lance.unwrap_or(false)
|
is_lance.unwrap_or(false)
|
||||||
})
|
})
|
||||||
.filter_map(|p| p.file_stem().and_then(|s| s.to_str().map(String::from)))
|
.filter_map(|p| p.file_stem().and_then(|s| s.to_str().map(String::from)))
|
||||||
.collect();
|
.collect::<Vec<String>>();
|
||||||
|
f.sort();
|
||||||
Ok(f)
|
Ok(f)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -312,8 +313,8 @@ mod tests {
|
|||||||
let db = Database::connect(uri).await.unwrap();
|
let db = Database::connect(uri).await.unwrap();
|
||||||
let tables = db.table_names().await.unwrap();
|
let tables = db.table_names().await.unwrap();
|
||||||
assert_eq!(tables.len(), 2);
|
assert_eq!(tables.len(), 2);
|
||||||
assert!(tables.contains(&String::from("table1")));
|
assert!(tables[0].eq(&String::from("table1")));
|
||||||
assert!(tables.contains(&String::from("table2")));
|
assert!(tables[1].eq(&String::from("table2")));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
|
use lance::format::{Index, Manifest};
|
||||||
use lance::index::vector::ivf::IvfBuildParams;
|
use lance::index::vector::ivf::IvfBuildParams;
|
||||||
use lance::index::vector::pq::PQBuildParams;
|
use lance::index::vector::pq::PQBuildParams;
|
||||||
use lance::index::vector::VectorIndexParams;
|
use lance::index::vector::VectorIndexParams;
|
||||||
@@ -98,7 +99,11 @@ impl VectorIndexBuilder for IvfPQIndexBuilder {
|
|||||||
let ivf_params = self.ivf_params.clone().unwrap_or_default();
|
let ivf_params = self.ivf_params.clone().unwrap_or_default();
|
||||||
let pq_params = self.pq_params.clone().unwrap_or_default();
|
let pq_params = self.pq_params.clone().unwrap_or_default();
|
||||||
|
|
||||||
VectorIndexParams::with_ivf_pq_params(pq_params.metric_type, ivf_params, pq_params)
|
VectorIndexParams::with_ivf_pq_params(
|
||||||
|
self.metric_type.unwrap_or(MetricType::L2),
|
||||||
|
ivf_params,
|
||||||
|
pq_params,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_replace(&self) -> bool {
|
fn get_replace(&self) -> bool {
|
||||||
@@ -106,6 +111,27 @@ impl VectorIndexBuilder for IvfPQIndexBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct VectorIndex {
|
||||||
|
pub columns: Vec<String>,
|
||||||
|
pub index_name: String,
|
||||||
|
pub index_uuid: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VectorIndex {
|
||||||
|
pub fn new_from_format(manifest: &Manifest, index: &Index) -> VectorIndex {
|
||||||
|
let fields = index
|
||||||
|
.fields
|
||||||
|
.iter()
|
||||||
|
.map(|i| manifest.schema.fields[*i as usize].name.clone())
|
||||||
|
.collect();
|
||||||
|
VectorIndex {
|
||||||
|
columns: fields,
|
||||||
|
index_name: index.name.clone(),
|
||||||
|
index_uuid: index.uuid.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -158,7 +184,6 @@ mod tests {
|
|||||||
pq_params.max_iters = 1;
|
pq_params.max_iters = 1;
|
||||||
pq_params.num_bits = 8;
|
pq_params.num_bits = 8;
|
||||||
pq_params.num_sub_vectors = 50;
|
pq_params.num_sub_vectors = 50;
|
||||||
pq_params.metric_type = MetricType::Cosine;
|
|
||||||
pq_params.max_opq_iters = 2;
|
pq_params.max_opq_iters = 2;
|
||||||
index_builder.ivf_params(ivf_params);
|
index_builder.ivf_params(ivf_params);
|
||||||
index_builder.pq_params(pq_params);
|
index_builder.pq_params(pq_params);
|
||||||
@@ -176,7 +201,6 @@ mod tests {
|
|||||||
assert_eq!(pq_params.max_iters, 1);
|
assert_eq!(pq_params.max_iters, 1);
|
||||||
assert_eq!(pq_params.num_bits, 8);
|
assert_eq!(pq_params.num_bits, 8);
|
||||||
assert_eq!(pq_params.num_sub_vectors, 50);
|
assert_eq!(pq_params.num_sub_vectors, 50);
|
||||||
assert_eq!(pq_params.metric_type, MetricType::Cosine);
|
|
||||||
assert_eq!(pq_params.max_opq_iters, 2);
|
assert_eq!(pq_params.max_opq_iters, 2);
|
||||||
} else {
|
} else {
|
||||||
assert!(false, "Expected second stage to be pq")
|
assert!(false, "Expected second stage to be pq")
|
||||||
|
|||||||
@@ -25,7 +25,8 @@ use bytes::Bytes;
|
|||||||
use futures::{stream::BoxStream, FutureExt, StreamExt};
|
use futures::{stream::BoxStream, FutureExt, StreamExt};
|
||||||
use lance::io::object_store::WrappingObjectStore;
|
use lance::io::object_store::WrappingObjectStore;
|
||||||
use object_store::{
|
use object_store::{
|
||||||
path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result,
|
path::Path, Error, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore,
|
||||||
|
Result,
|
||||||
};
|
};
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
@@ -120,7 +121,10 @@ impl ObjectStore for MirroringObjectStore {
|
|||||||
|
|
||||||
async fn delete(&self, location: &Path) -> Result<()> {
|
async fn delete(&self, location: &Path) -> Result<()> {
|
||||||
if !location.primary_only() {
|
if !location.primary_only() {
|
||||||
self.secondary.delete(location).await?;
|
match self.secondary.delete(location).await {
|
||||||
|
Err(Error::NotFound { .. }) | Ok(_) => {}
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
self.primary.delete(location).await
|
self.primary.delete(location).await
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,14 +18,16 @@ use std::sync::Arc;
|
|||||||
use arrow_array::{Float32Array, RecordBatchReader};
|
use arrow_array::{Float32Array, RecordBatchReader};
|
||||||
use arrow_schema::SchemaRef;
|
use arrow_schema::SchemaRef;
|
||||||
use lance::dataset::cleanup::RemovalStats;
|
use lance::dataset::cleanup::RemovalStats;
|
||||||
use lance::dataset::optimize::{compact_files, CompactionMetrics, CompactionOptions};
|
use lance::dataset::optimize::{
|
||||||
|
compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
|
||||||
|
};
|
||||||
use lance::dataset::{Dataset, WriteParams};
|
use lance::dataset::{Dataset, WriteParams};
|
||||||
use lance::index::IndexType;
|
use lance::index::{DatasetIndexExt, IndexType};
|
||||||
use lance::io::object_store::WrappingObjectStore;
|
use lance::io::object_store::WrappingObjectStore;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use crate::error::{Error, Result};
|
use crate::error::{Error, Result};
|
||||||
use crate::index::vector::VectorIndexBuilder;
|
use crate::index::vector::{VectorIndex, VectorIndexBuilder};
|
||||||
use crate::query::Query;
|
use crate::query::Query;
|
||||||
use crate::utils::{PatchReadParam, PatchWriteParam};
|
use crate::utils::{PatchReadParam, PatchWriteParam};
|
||||||
use crate::WriteMode;
|
use crate::WriteMode;
|
||||||
@@ -238,8 +240,6 @@ impl Table {
|
|||||||
|
|
||||||
/// Create index on the table.
|
/// Create index on the table.
|
||||||
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
|
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
|
||||||
use lance::index::DatasetIndexExt;
|
|
||||||
|
|
||||||
let mut dataset = self.dataset.as_ref().clone();
|
let mut dataset = self.dataset.as_ref().clone();
|
||||||
dataset
|
dataset
|
||||||
.create_index(
|
.create_index(
|
||||||
@@ -257,6 +257,14 @@ impl Table {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn optimize_indices(&mut self) -> Result<()> {
|
||||||
|
let mut dataset = self.dataset.as_ref().clone();
|
||||||
|
|
||||||
|
dataset.optimize_indices().await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Insert records into this Table
|
/// Insert records into this Table
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
@@ -353,12 +361,45 @@ impl Table {
|
|||||||
/// for faster reads.
|
/// for faster reads.
|
||||||
///
|
///
|
||||||
/// This calls into [lance::dataset::optimize::compact_files].
|
/// This calls into [lance::dataset::optimize::compact_files].
|
||||||
pub async fn compact_files(&mut self, options: CompactionOptions) -> Result<CompactionMetrics> {
|
pub async fn compact_files(
|
||||||
|
&mut self,
|
||||||
|
options: CompactionOptions,
|
||||||
|
remap_options: Option<Arc<dyn IndexRemapperOptions>>,
|
||||||
|
) -> Result<CompactionMetrics> {
|
||||||
let mut dataset = self.dataset.as_ref().clone();
|
let mut dataset = self.dataset.as_ref().clone();
|
||||||
let metrics = compact_files(&mut dataset, options, None).await?;
|
let metrics = compact_files(&mut dataset, options, remap_options).await?;
|
||||||
self.dataset = Arc::new(dataset);
|
self.dataset = Arc::new(dataset);
|
||||||
Ok(metrics)
|
Ok(metrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn count_fragments(&self) -> usize {
|
||||||
|
self.dataset.count_fragments()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn count_deleted_rows(&self) -> Result<usize> {
|
||||||
|
Ok(self.dataset.count_deleted_rows().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn num_small_files(&self, max_rows_per_group: usize) -> usize {
|
||||||
|
self.dataset.num_small_files(max_rows_per_group).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
|
||||||
|
Ok(self.dataset.count_indexed_rows(index_uuid).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
|
||||||
|
Ok(self.dataset.count_unindexed_rows(index_uuid).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
|
||||||
|
let (indices, mf) =
|
||||||
|
futures::try_join!(self.dataset.load_indices(), self.dataset.latest_manifest())?;
|
||||||
|
Ok(indices
|
||||||
|
.iter()
|
||||||
|
.map(|i| VectorIndex::new_from_format(&mf, i))
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
Reference in New Issue
Block a user