Mirror of https://github.com/lancedb/lancedb.git, synced 2025-12-23 05:19:58 +00:00

Compare commits: myriel/doc ... python-v0. (38 commits)
Commits (SHA1 only; author and date cells were empty in this mirror):

f744b785f8, 2e3f745820, 683aaed716, 48f7b20daa, 4dd399ca29, e6f1da31dc,
a9ea785b15, cc38453391, 47747287b6, 0847e666a0, 981f8427e6, f6846004ca,
faf8973624, fabe37274f, 6839ac3509, b88422e515, 8d60685ede, 04285a4a4e,
d4a41b5663, adc3daa462, acbfa6c012, d602e9f98c, ad09234d59, 0c34ffb252,
d9f333d828, bb809abd4b, c87530f7a3, 1eb1beecd6, ce550e6c45, d3bae1f3a3,
dcf53c4506, 941eada703, ed640a76d9, 296205ef96, 16beaaa656, 4ff87b1f4a,
0532ef2358, dcf7334c1f
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.21.2"
+current_version = "0.22.0-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
.github/workflows/docs.yml (vendored, 11 changes)
@@ -56,22 +56,11 @@ jobs:
       with:
         node-version: 20
         cache: 'npm'
         cache-dependency-path: node/package-lock.json
     - name: Install node dependencies
       working-directory: node
       run: |
         sudo apt update
         sudo apt install -y protobuf-compiler libssl-dev
     - name: Build node
       working-directory: node
       run: |
         npm ci
         npm run build
         npm run tsc
     - name: Create markdown files
       working-directory: node
       run: |
         npx typedoc --plugin typedoc-plugin-markdown --out ../docs/src/javascript src/index.ts
     - name: Build docs
       working-directory: docs
       run: |
.github/workflows/docs_test.yml (vendored, 48 changes)
@@ -58,51 +58,3 @@ jobs:
       run: |
         cd docs/test/python
         for d in *; do cd "$d"; echo "$d".py; python "$d".py; cd ..; done
-  test-node:
-    name: Test doc nodejs code
-    runs-on: ubuntu-24.04
-    timeout-minutes: 60
-    strategy:
-      fail-fast: false
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - name: Print CPU capabilities
-        run: cat /proc/cpuinfo
-      - name: Set up Node
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-      - name: Install protobuf
-        run: |
-          sudo apt update
-          sudo apt install -y protobuf-compiler
-      - name: Install dependecies needed for ubuntu
-        run: |
-          sudo apt install -y libssl-dev
-          rustup update && rustup default
-      - name: Rust cache
-        uses: swatinem/rust-cache@v2
-      - name: Install node dependencies
-        run: |
-          sudo swapoff -a
-          sudo fallocate -l 8G /swapfile
-          sudo chmod 600 /swapfile
-          sudo mkswap /swapfile
-          sudo swapon /swapfile
-          sudo swapon --show
-          cd node
-          npm ci
-          npm run build-release
-          cd ../docs
-          npm install
-      - name: Test
-        env:
-          LANCEDB_URI: ${{ secrets.LANCEDB_URI }}
-          LANCEDB_DEV_API_KEY: ${{ secrets.LANCEDB_DEV_API_KEY }}
-        run: |
-          cd docs
-          npm t
.github/workflows/nodejs.yml (vendored, 4 changes)
@@ -79,7 +79,7 @@ jobs:
       with:
         node-version: ${{ matrix.node-version }}
         cache: 'npm'
-        cache-dependency-path: node/package-lock.json
+        cache-dependency-path: nodejs/package-lock.json
     - uses: Swatinem/rust-cache@v2
     - name: Install dependencies
       run: |
@@ -137,7 +137,7 @@ jobs:
       with:
         node-version: 20
         cache: 'npm'
-        cache-dependency-path: node/package-lock.json
+        cache-dependency-path: nodejs/package-lock.json
     - uses: Swatinem/rust-cache@v2
     - name: Install dependencies
       run: |
.gitignore (vendored, 3 changes)
@@ -31,9 +31,6 @@ python/dist
 *.node
 **/node_modules
 **/.DS_Store
-node/dist
-node/examples/**/package-lock.json
-node/examples/**/dist
 nodejs/lancedb/native*
 dist
CLAUDE.md (68 changes)
@@ -11,14 +11,70 @@ Project layout:

* `nodejs`: The Typescript bindings, using napi-rs
* `java`: The Java bindings

(`rust/ffi` and `node/` are for a deprecated package. You can ignore them.)

Common commands:

-* Check for compiler errors: `cargo check --features remote --tests --examples`
-* Run tests: `cargo test --features remote --tests`
-* Run specific test: `cargo test --features remote -p <package_name> --test <test_name>`
-* Lint: `cargo clippy --features remote --tests --examples`
+* Check for compiler errors: `cargo check --quiet --features remote --tests --examples`
+* Run tests: `cargo test --quiet --features remote --tests`
+* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
+* Lint: `cargo clippy --quiet --features remote --tests --examples`
* Format: `cargo fmt --all`

Before committing changes, run formatting.

## Coding tips

* When writing Rust doctests for things that require a connection or table reference,
  write them as a function instead of a fully executable test. This allows type checking
  to run but avoids needing a full test environment. For example:

  ```rust
  /// ```
  /// use lance_index::scalar::FullTextSearchQuery;
  /// use lancedb::query::{QueryBase, ExecutableQuery};
  ///
  /// # use lancedb::Table;
  /// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
  /// let results = table.query()
  ///     .full_text_search(FullTextSearchQuery::new("hello world".into()))
  ///     .execute()
  ///     .await?;
  /// # Ok(())
  /// # }
  /// ```
  ```

## Example plan: adding a new method on Table

Adding a new method involves first adding it to the Rust core, then exposing it
in the Python and TypeScript bindings. There are both local and remote tables.
Remote tables are implemented via an HTTP API and require the `remote` cargo
feature flag to be enabled. Python has both sync and async methods.

Rust core changes:

1. Add the method on the `Table` struct in `rust/lancedb/src/table.rs` (calls the `BaseTable` trait).
2. Add the method to the `BaseTable` trait in `rust/lancedb/src/table.rs`.
3. Implement the new trait method on `NativeTable` in `rust/lancedb/src/table.rs`.
   * Test with a unit test in `rust/lancedb/src/table.rs`.
4. Implement the new trait method on `RemoteTable` in `rust/lancedb/src/remote/table.rs`.
   * Test with a unit test in `rust/lancedb/src/remote/table.rs` against a mocked endpoint.

Python bindings changes:

1. Add the PyO3 method binding in `python/src/table.rs`. Run `make develop` to compile bindings.
2. Add types for the PyO3 method in `python/python/lancedb/_lancedb.pyi`.
3. Add the method to the `AsyncTable` class in `python/python/lancedb/table.py`.
4. Add an abstract method to the `Table` abstract base class in `python/python/lancedb/table.py`.
5. Add a concrete sync method to the `LanceTable` class in `python/python/lancedb/table.py`.
   * It should use `LOOP.run()` to call the corresponding `AsyncTable` method.
6. Add a concrete sync method to the `RemoteTable` class in `python/python/lancedb/remote/table.py`.
7. Add a unit test in `python/tests/test_table.py`.

TypeScript bindings changes (a sketch follows this list):

1. Add the napi-rs method binding on `Table` in `nodejs/src/table.rs`.
2. Run `npm run build` to generate TypeScript definitions.
3. Add the TypeScript method on the abstract class `Table` in `nodejs/src/table.ts`.
4. Add a concrete method on the `LocalTable` class in `nodejs/src/native_table.ts`.
   * Note: despite the name, this class is also used for remote tables.
5. Add a test in `nodejs/__test__/table.test.ts`.
6. Run `npm run docs` to generate the TypeScript documentation.
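To make the TypeScript steps concrete, here is a minimal sketch of steps 3 and 4. The `rowCount` method name and the `inner` native handle are hypothetical, invented for illustration; the real signatures come from the napi-generated definitions.

```ts
// nodejs/src/table.ts: declare the new method on the abstract class (sketch).
export abstract class Table {
  /** Hypothetical example method: count the rows in the table. */
  abstract rowCount(): Promise<number>;
}

// nodejs/src/native_table.ts: concrete implementation delegating to the napi
// binding. `inner` stands in for the generated native table handle.
export class LocalTable extends Table {
  constructor(private inner: { rowCount(): Promise<number> }) {
    super();
  }

  async rowCount(): Promise<number> {
    return this.inner.rowCount();
  }
}
```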
Cargo.lock (generated, 1654 changes). File diff suppressed because it is too large.
Cargo.toml (28 changes)
@@ -1,10 +1,5 @@
 [workspace]
-members = [
-    "rust/lancedb",
-    "nodejs",
-    "python",
-    "java/core/lancedb-jni",
-]
+members = ["rust/lancedb", "nodejs", "python", "java/core/lancedb-jni"]
 # Python package needs to be built by maturin.
 exclude = ["python"]
 resolver = "2"
@@ -20,16 +15,14 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"

 [workspace.dependencies]
-lance = { "version" = "=0.32.1", "features" = [
-    "dynamodb",
-], "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-io = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-index = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-linalg = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-table = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-testing = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-datafusion = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
-lance-encoding = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance = { "version" = "=0.35.0", default-features = false, "features" = ["dynamodb"] }
+lance-io = { "version" = "=0.35.0", default-features = false }
+lance-index = { "version" = "=0.35.0" }
+lance-linalg = { "version" = "=0.35.0" }
+lance-table = { "version" = "=0.35.0" }
+lance-testing = { "version" = "=0.35.0" }
+lance-datafusion = { "version" = "=0.35.0" }
+lance-encoding = { "version" = "=0.35.0" }
 # Note that this one does not include pyarrow
 arrow = { version = "55.1", optional = false }
 arrow-array = "55.1"
@@ -62,12 +55,11 @@ rand = "0.9"
 regex = "1.10"
 lazy_static = "1"
 semver = "1.0.25"
+crunchy = "0.2.4"
 # Temporary pins to work around downstream issues
 # https://github.com/apache/arrow-rs/commit/2fddf85afcd20110ce783ed5b4cdeb82293da30b
 chrono = "=0.4.41"
 # https://github.com/RustCrypto/formats/issues/1684
 base64ct = "=1.6.0"
-# Workaround for: https://github.com/eira-fransham/crunchy/issues/13
-crunchy = "=0.2.2"
 # Workaround for: https://github.com/Lokathor/bytemuck/issues/306
 bytemuck_derive = ">=1.8.1, <1.9.0"
@@ -54,6 +54,52 @@ def extract_features(line: str) -> list:
     return []


+def extract_default_features(line: str) -> bool:
+    """
+    Checks if default-features = false is present in a line in Cargo.toml.
+    Example: 'lance = { "version" = "=0.29.0", default-features = false, "features" = ["dynamodb"] }'
+    Returns: True if default-features = false is present, False otherwise
+    """
+    import re
+
+    match = re.search(r'default-features\s*=\s*false', line)
+    return match is not None
+
+
+def dict_to_toml_line(package_name: str, config: dict) -> str:
+    """
+    Converts a configuration dictionary to a TOML dependency line.
+    Dictionary insertion order is preserved (Python 3.7+), so the caller
+    controls the order of fields in the output.
+
+    Args:
+        package_name: The name of the package (e.g., "lance", "lance-io")
+        config: Dictionary with keys like "version", "path", "git", "tag", "features", "default-features".
+            The order of keys in this dict determines the order in the output.
+
+    Returns:
+        A properly formatted TOML line with a trailing newline
+    """
+    # If only version is specified, use simple format
+    if len(config) == 1 and "version" in config:
+        return f'{package_name} = "{config["version"]}"\n'
+
+    # Otherwise, use inline table format
+    parts = []
+    for key, value in config.items():
+        if key == "default-features" and not value:
+            parts.append("default-features = false")
+        elif key == "features":
+            parts.append(f'"features" = {json.dumps(value)}')
+        elif isinstance(value, str):
+            parts.append(f'"{key}" = "{value}"')
+        else:
+            # This shouldn't happen with our current usage
+            parts.append(f'"{key}" = {json.dumps(value)}')
+
+    return f'{package_name} = {{ {", ".join(parts)} }}\n'
+
+
 def update_cargo_toml(line_updater):
     """
     Updates the Cargo.toml file by applying the line_updater function to each line.
@@ -67,20 +113,27 @@ def update_cargo_toml(line_updater):
     is_parsing_lance_line = False
     for line in lines:
         if line.startswith("lance"):
             # Update the line using the provided function
-            if line.strip().endswith("}"):
+            # Check if this is a single-line or multi-line entry
+            # Single-line entries either:
+            #   1. End with } (complete inline table)
+            #   2. End with " (simple version string)
+            # Multi-line entries start with { but don't end with }
+            if line.strip().endswith("}") or line.strip().endswith('"'):
+                # Single-line entry - process immediately
                 new_lines.append(line_updater(line))
-            else:
+            elif "{" in line and not line.strip().endswith("}"):
+                # Multi-line entry - start accumulating
                 lance_line = line
                 is_parsing_lance_line = True
+            else:
+                # Single-line entry without quotes or braces (shouldn't happen but handle it)
+                new_lines.append(line_updater(line))
         elif is_parsing_lance_line:
             lance_line += line
             if line.strip().endswith("}"):
                 new_lines.append(line_updater(lance_line))
                 lance_line = ""
                 is_parsing_lance_line = False
             else:
                 print("doesn't end with }:", line)
         else:
             # Keep the line unchanged
             new_lines.append(line)
@@ -92,18 +145,25 @@ def update_cargo_toml(line_updater):
 def set_stable_version(version: str):
     """
     Sets lines to
-    lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }
-    lance-io = "=0.29.0"
+    lance = { "version" = "=0.29.0", default-features = false, "features" = ["dynamodb"] }
+    lance-io = { "version" = "=0.29.0", default-features = false }
     ...
     """

     def line_updater(line: str) -> str:
         package_name = line.split("=", maxsplit=1)[0].strip()

+        # Build config in desired order: version, default-features, features
+        config = {"version": f"={version}"}
+
+        if extract_default_features(line):
+            config["default-features"] = False
+
         features = extract_features(line)
         if features:
-            return f'{package_name} = {{ "version" = "={version}", "features" = {json.dumps(features)} }}\n'
-        else:
-            return f'{package_name} = "={version}"\n'
+            config["features"] = features
+
+        return dict_to_toml_line(package_name, config)

     update_cargo_toml(line_updater)

@@ -111,19 +171,29 @@ def set_stable_version(version: str):
 def set_preview_version(version: str):
     """
     Sets lines to
-    lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-    lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
+    lance = { "version" = "=0.29.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.29.0-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+    lance-io = { "version" = "=0.29.0", default-features = false, "tag" = "v0.29.0-beta.2", "git" = "https://github.com/lancedb/lance.git" }
     ...
     """

     def line_updater(line: str) -> str:
         package_name = line.split("=", maxsplit=1)[0].strip()
-        features = extract_features(line)
         base_version = version.split("-")[0]  # Get the base version without beta suffix

+        # Build config in desired order: version, default-features, features, tag, git
+        config = {"version": f"={base_version}"}
+
+        if extract_default_features(line):
+            config["default-features"] = False
+
+        features = extract_features(line)
         if features:
-            return f'{package_name} = {{ "version" = "={base_version}", "features" = {json.dumps(features)}, "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
-        else:
-            return f'{package_name} = {{ "version" = "={base_version}", "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
+            config["features"] = features
+
+        config["tag"] = f"v{version}"
+        config["git"] = "https://github.com/lancedb/lance.git"
+
+        return dict_to_toml_line(package_name, config)

     update_cargo_toml(line_updater)

@@ -131,18 +201,25 @@ def set_preview_version(version: str):
 def set_local_version():
     """
     Sets lines to
-    lance = { path = "../lance/rust/lance", features = ["dynamodb"] }
-    lance-io = { path = "../lance/rust/lance-io" }
+    lance = { "path" = "../lance/rust/lance", default-features = false, "features" = ["dynamodb"] }
+    lance-io = { "path" = "../lance/rust/lance-io", default-features = false }
     ...
     """

     def line_updater(line: str) -> str:
         package_name = line.split("=", maxsplit=1)[0].strip()

+        # Build config in desired order: path, default-features, features
+        config = {"path": f"../lance/rust/{package_name}"}
+
+        if extract_default_features(line):
+            config["default-features"] = False
+
         features = extract_features(line)
         if features:
-            return f'{package_name} = {{ "path" = "../lance/rust/{package_name}", "features" = {json.dumps(features)} }}\n'
-        else:
-            return f'{package_name} = {{ "path" = "../lance/rust/{package_name}" }}\n'
+            config["features"] = features
+
+        return dict_to_toml_line(package_name, config)

     update_cargo_toml(line_updater)
@@ -15,16 +15,13 @@ cargo metadata --quiet > /dev/null
 pushd nodejs || exit 1
 npm install --package-lock-only --silent
 popd
-pushd node || exit 1
-npm install --package-lock-only --silent
-popd

 if git diff --quiet --exit-code; then
     echo "No lockfile changes to commit; skipping amend."
 elif $AMEND; then
-    git add Cargo.lock nodejs/package-lock.json node/package-lock.json
+    git add Cargo.lock nodejs/package-lock.json
     git commit --amend --no-edit
 else
-    git add Cargo.lock nodejs/package-lock.json node/package-lock.json
+    git add Cargo.lock nodejs/package-lock.json
     git commit -m "Update lockfiles"
 fi
docs/mkdocs.yml (258 changes)
@@ -103,6 +103,264 @@ markdown_extensions:
    permalink: ""

nav:
- Home:
  - LanceDB: index.md
  - 🏃🏼‍♂️ Quick start: basic.md
  - 📚 Concepts:
    - Vector search: concepts/vector_search.md
    - Indexing:
      - IVFPQ: concepts/index_ivfpq.md
      - HNSW: concepts/index_hnsw.md
    - Storage: concepts/storage.md
    - Data management: concepts/data_management.md
  - 🔨 Guides:
    - Working with tables: guides/tables.md
    - Building a vector index: ann_indexes.md
    - Vector Search: search.md
    - Full-text search (native): fts.md
    - Full-text search (tantivy-based): fts_tantivy.md
    - Building a scalar index: guides/scalar_index.md
    - Hybrid search:
      - Overview: hybrid_search/hybrid_search.md
      - Comparing Rerankers: hybrid_search/eval.md
      - Airbnb financial data example: notebooks/hybrid_search.ipynb
    - Late interaction with MultiVector search:
      - Overview: guides/multi-vector.md
      - Example: notebooks/Multivector_on_LanceDB.ipynb
    - RAG:
      - Vanilla RAG: rag/vanilla_rag.md
      - Multi-head RAG: rag/multi_head_rag.md
      - Corrective RAG: rag/corrective_rag.md
      - Agentic RAG: rag/agentic_rag.md
      - Graph RAG: rag/graph_rag.md
      - Self RAG: rag/self_rag.md
      - Adaptive RAG: rag/adaptive_rag.md
      - SFR RAG: rag/sfr_rag.md
      - Advanced Techniques:
        - HyDE: rag/advanced_techniques/hyde.md
        - FLARE: rag/advanced_techniques/flare.md
    - Reranking:
      - Quickstart: reranking/index.md
      - Cohere Reranker: reranking/cohere.md
      - Linear Combination Reranker: reranking/linear_combination.md
      - Reciprocal Rank Fusion Reranker: reranking/rrf.md
      - Cross Encoder Reranker: reranking/cross_encoder.md
      - ColBERT Reranker: reranking/colbert.md
      - Jina Reranker: reranking/jina.md
      - OpenAI Reranker: reranking/openai.md
      - AnswerDotAi Rerankers: reranking/answerdotai.md
      - Voyage AI Rerankers: reranking/voyageai.md
      - Building Custom Rerankers: reranking/custom_reranker.md
      - Example: notebooks/lancedb_reranking.ipynb
    - Filtering: sql.md
    - Versioning & Reproducibility:
      - sync API: notebooks/reproducibility.ipynb
      - async API: notebooks/reproducibility_async.ipynb
    - Configuring Storage: guides/storage.md
    - Migration Guide: migration.md
    - Tuning retrieval performance:
      - Choosing right query type: guides/tuning_retrievers/1_query_types.md
      - Reranking: guides/tuning_retrievers/2_reranking.md
      - Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
  - 🧬 Managing embeddings:
    - Understand Embeddings: embeddings/understanding_embeddings.md
    - Get Started: embeddings/index.md
    - Embedding functions: embeddings/embedding_functions.md
    - Available models:
      - Overview: embeddings/default_embedding_functions.md
      - Text Embedding Functions:
        - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
        - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
        - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
        - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
        - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
        - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
        - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
        - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
        - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
        - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
        - Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
      - Multimodal Embedding Functions:
        - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
        - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
        - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
    - User-defined embedding functions: embeddings/custom_embedding_function.md
    - Variables and secrets: embeddings/variables_and_secrets.md
    - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
    - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
  - 🔌 Integrations:
    - Tools and data formats: integrations/index.md
    - Pandas and PyArrow: python/pandas_and_pyarrow.md
    - Polars: python/polars_arrow.md
    - DuckDB: python/duckdb.md
    - Datafusion: python/datafusion.md
    - LangChain:
      - LangChain 🔗: integrations/langchain.md
      - LangChain demo: notebooks/langchain_demo.ipynb
      - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
    - LlamaIndex 🦙:
      - LlamaIndex docs: integrations/llamaIndex.md
      - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb
    - Pydantic: python/pydantic.md
    - Voxel51: integrations/voxel51.md
    - PromptTools: integrations/prompttools.md
    - dlt: integrations/dlt.md
    - phidata: integrations/phidata.md
    - Genkit: integrations/genkit.md
  - 🎯 Examples:
    - Overview: examples/index.md
    - 🐍 Python:
      - Overview: examples/examples_python.md
      - Build From Scratch: examples/python_examples/build_from_scratch.md
      - Multimodal: examples/python_examples/multimodal.md
      - Rag: examples/python_examples/rag.md
      - Vector Search: examples/python_examples/vector_search.md
      - Chatbot: examples/python_examples/chatbot.md
      - Evaluation: examples/python_examples/evaluations.md
      - AI Agent: examples/python_examples/aiagent.md
      - Recommender System: examples/python_examples/recommendersystem.md
      - Miscellaneous:
        - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
        - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
    - 👾 JavaScript:
      - Overview: examples/examples_js.md
      - Serverless Website Chatbot: examples/serverless_website_chatbot.md
      - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
      - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
    - 🦀 Rust:
      - Overview: examples/examples_rust.md
  - 📓 Studies:
    - ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
  - 💭 FAQs: faq.md
  - 🔍 Troubleshooting: troubleshooting.md
  - ⚙️ API reference:
    - 🐍 Python: python/python.md
    - 👾 JavaScript (vectordb): javascript/modules.md
    - 👾 JavaScript (lancedb): js/globals.md
    - 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/

- Quick start: basic.md
- Concepts:
  - Vector search: concepts/vector_search.md
  - Indexing:
    - IVFPQ: concepts/index_ivfpq.md
    - HNSW: concepts/index_hnsw.md
  - Storage: concepts/storage.md
  - Data management: concepts/data_management.md
- Guides:
  - Working with tables: guides/tables.md
  - Working with SQL: guides/sql_querying.md
  - Building an ANN index: ann_indexes.md
  - Vector Search: search.md
  - Full-text search (native): fts.md
  - Full-text search (tantivy-based): fts_tantivy.md
  - Building a scalar index: guides/scalar_index.md
  - Hybrid search:
    - Overview: hybrid_search/hybrid_search.md
    - Comparing Rerankers: hybrid_search/eval.md
    - Airbnb financial data example: notebooks/hybrid_search.ipynb
  - Late interaction with MultiVector search:
    - Overview: guides/multi-vector.md
    - Document search Example: notebooks/Multivector_on_LanceDB.ipynb
  - RAG:
    - Vanilla RAG: rag/vanilla_rag.md
    - Multi-head RAG: rag/multi_head_rag.md
    - Corrective RAG: rag/corrective_rag.md
    - Agentic RAG: rag/agentic_rag.md
    - Graph RAG: rag/graph_rag.md
    - Self RAG: rag/self_rag.md
    - Adaptive RAG: rag/adaptive_rag.md
    - SFR RAG: rag/sfr_rag.md
    - Advanced Techniques:
      - HyDE: rag/advanced_techniques/hyde.md
      - FLARE: rag/advanced_techniques/flare.md
  - Reranking:
    - Quickstart: reranking/index.md
    - Cohere Reranker: reranking/cohere.md
    - Linear Combination Reranker: reranking/linear_combination.md
    - Reciprocal Rank Fusion Reranker: reranking/rrf.md
    - Cross Encoder Reranker: reranking/cross_encoder.md
    - ColBERT Reranker: reranking/colbert.md
    - Jina Reranker: reranking/jina.md
    - OpenAI Reranker: reranking/openai.md
    - AnswerDotAi Rerankers: reranking/answerdotai.md
    - Building Custom Rerankers: reranking/custom_reranker.md
    - Example: notebooks/lancedb_reranking.ipynb
  - Filtering: sql.md
  - Versioning & Reproducibility:
    - sync API: notebooks/reproducibility.ipynb
    - async API: notebooks/reproducibility_async.ipynb
  - Configuring Storage: guides/storage.md
  - Migration Guide: migration.md
  - Tuning retrieval performance:
    - Choosing right query type: guides/tuning_retrievers/1_query_types.md
    - Reranking: guides/tuning_retrievers/2_reranking.md
    - Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
- Managing Embeddings:
  - Understand Embeddings: embeddings/understanding_embeddings.md
  - Get Started: embeddings/index.md
  - Embedding functions: embeddings/embedding_functions.md
  - Available models:
    - Overview: embeddings/default_embedding_functions.md
    - Text Embedding Functions:
      - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
      - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
      - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
      - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
      - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
      - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
      - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
      - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
      - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
      - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
    - Multimodal Embedding Functions:
      - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
      - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
      - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
  - User-defined embedding functions: embeddings/custom_embedding_function.md
  - Variables and secrets: embeddings/variables_and_secrets.md
  - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
  - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
- Integrations:
  - Overview: integrations/index.md
  - Pandas and PyArrow: python/pandas_and_pyarrow.md
  - Polars: python/polars_arrow.md
  - DuckDB: python/duckdb.md
  - Datafusion: python/datafusion.md
  - LangChain 🦜️🔗↗: integrations/langchain.md
  - LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
  - LlamaIndex 🦙↗: integrations/llamaIndex.md
  - Pydantic: python/pydantic.md
  - Voxel51: integrations/voxel51.md
  - PromptTools: integrations/prompttools.md
  - dlt: integrations/dlt.md
  - phidata: integrations/phidata.md
  - Genkit: integrations/genkit.md
- Examples:
  - examples/index.md
  - 🐍 Python:
    - Overview: examples/examples_python.md
    - Build From Scratch: examples/python_examples/build_from_scratch.md
    - Multimodal: examples/python_examples/multimodal.md
    - Rag: examples/python_examples/rag.md
    - Vector Search: examples/python_examples/vector_search.md
    - Chatbot: examples/python_examples/chatbot.md
    - Evaluation: examples/python_examples/evaluations.md
    - AI Agent: examples/python_examples/aiagent.md
    - Recommender System: examples/python_examples/recommendersystem.md
    - Miscellaneous:
      - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
      - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
  - 👾 JavaScript:
    - Overview: examples/examples_js.md
    - Serverless Website Chatbot: examples/serverless_website_chatbot.md
    - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
    - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
  - 🦀 Rust:
    - Overview: examples/examples_rust.md
- Studies:
  - studies/overview.md
  - ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
- API reference:
  - Overview: api_reference.md
  - Python: python/python.md
@@ -13,7 +13,7 @@ The following concepts are important to keep in mind:

- Data is versioned, with each insert operation creating a new version of the dataset and an update to the manifest that tracks versions via metadata

!!! note
    1. First, each version contains metadata and just the new/updated data in your transaction. So if you have 100 versions, they aren't 100 duplicates of the same data. However, they do have 100x the metadata overhead of a single version, which can result in slower queries.
    2. Second, these versions exist to keep LanceDB scalable and consistent. We do not immediately blow away old versions when creating new ones because other clients might be in the middle of querying the old version. It's important to retain older versions for as long as they might be queried.

## What are fragments?

@@ -37,6 +37,10 @@ Depending on the use case and dataset, optimal compaction will have different re

- It's always better to use *batch* inserts rather than adding 1 row at a time (to avoid too-small fragments). If single-row inserts are unavoidable, run compaction on a regular basis to merge them into larger fragments.
- Keep the number of fragments under 100, which is suitable for most use cases (for *really* large datasets of >500M rows, more fragments might be needed).

!!! note
    LanceDB Cloud/Enterprise supports [auto-compaction](https://docs.lancedb.com/enterprise/architecture/architecture#write-path), which automatically optimizes fragments in the background as data changes.

## Deletion

Although Lance allows you to delete rows from a dataset, it does not actually delete the data immediately. It simply marks the row as deleted in the `DataFile` that represents a fragment. For a given version of the dataset, each fragment can have up to one deletion file (if no rows were ever deleted from that fragment, it will not have a deletion file). This is important to keep in mind because it means that the data is still there, and can be recovered if needed, as long as that version still exists based on your backup policy.
@@ -50,13 +54,9 @@ Reindexing is the process of updating the index to account for new data, keeping

Both LanceDB OSS and Cloud support reindexing, but the process (at least for now) is different for each, depending on the type of index.

-When a reindex job is triggered in the background, the entire data is reindexed, but in the interim as new queries come in, LanceDB will combine results from the existing index with exhaustive kNN search on the new data. This is done to ensure that you're still searching on all your data, but it does come at a performance cost. The more data you add without reindexing, the more noticeable the latency impact of the exhaustive search becomes.
+In LanceDB OSS, re-indexing happens synchronously when you call either `create_index` or `optimize` on a table. In LanceDB Cloud, re-indexing happens asynchronously as you add and update data in your table.

### Vector reindex

+By default, queries will search new data even if it has yet to be indexed. This is done using brute-force methods, such as kNN for vector search, and combined with the fast index search results. This ensures that you're always searching over all your data, but it does come at a performance cost. Without reindexing, adding more data to a table will make queries slower and more expensive. This behavior can be disabled by setting the [fast_search](https://lancedb.github.io/lancedb/python/python/#lancedb.query.AsyncQuery.fast_search) parameter, which will instruct the query to ignore un-indexed data.

-* LanceDB Cloud supports incremental reindexing, where a background process will trigger a new index build for you automatically when new data is added to a dataset
+* LanceDB Cloud/Enterprise supports [automatic incremental reindexing](https://docs.lancedb.com/core#vector-index) for vector, scalar, and FTS indices, where a background process will trigger a new index build for you automatically when new data is added or modified in a dataset
* LanceDB OSS requires you to manually trigger a reindex operation -- we are working on adding incremental reindexing to LanceDB OSS as well
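To illustrate the manual path in LanceDB OSS, here is a minimal TypeScript sketch that compacts fragments and refreshes indices after a batch of writes, using the `Table#optimize` method these docs reference elsewhere (the table name and rows are illustrative):

```ts
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("./.lancedb");
const table = await db.openTable("my_table"); // illustrative table name

// Prefer batch inserts over single-row inserts (see the compaction guidance above).
await table.add([
  { vector: [0.1, 0.2], id: "a" },
  { vector: [0.3, 0.4], id: "b" },
]);

// Compact small fragments and index the new rows so queries don't have to
// fall back to brute-force search over un-indexed data.
await table.optimize();
```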
|
### FTS reindex

FTS reindexing is supported in both LanceDB OSS and Cloud, but the index must be manually rebuilt once a significant amount of new data has been added that needs to be reindexed. We [updated](https://github.com/lancedb/lancedb/pull/762) Tantivy's default heap size from 128MB to 1GB in LanceDB, which makes reindexing much faster, by up to 10x from the default settings.
@@ -14,7 +14,7 @@ A builder for LanceDB queries.

## Extends

-- [`QueryBase`](QueryBase.md)<`NativeQuery`>
+- `StandardQueryBase`<`NativeQuery`>

## Properties

@@ -26,7 +26,7 @@ protected inner: Query | Promise<Query>;

#### Inherited from

-[`QueryBase`](QueryBase.md).[`inner`](QueryBase.md#inner)
+`StandardQueryBase.inner`

## Methods

@@ -73,7 +73,7 @@ AnalyzeExec verbose=true, metrics=[]

#### Inherited from

-[`QueryBase`](QueryBase.md).[`analyzePlan`](QueryBase.md#analyzeplan)
+`StandardQueryBase.analyzePlan`

***

@@ -107,7 +107,7 @@ single query)

#### Inherited from

-[`QueryBase`](QueryBase.md).[`execute`](QueryBase.md#execute)
+`StandardQueryBase.execute`

***

@@ -143,7 +143,7 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();

#### Inherited from

-[`QueryBase`](QueryBase.md).[`explainPlan`](QueryBase.md#explainplan)
+`StandardQueryBase.explainPlan`

***

@@ -164,7 +164,7 @@ Use [Table#optimize](Table.md#optimize) to index all un-indexed data.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`fastSearch`](QueryBase.md#fastsearch)
+`StandardQueryBase.fastSearch`

***

@@ -194,7 +194,7 @@ Use `where` instead

#### Inherited from

-[`QueryBase`](QueryBase.md).[`filter`](QueryBase.md#filter)
+`StandardQueryBase.filter`

***

@@ -216,7 +216,7 @@ fullTextSearch(query, options?): this

#### Inherited from

-[`QueryBase`](QueryBase.md).[`fullTextSearch`](QueryBase.md#fulltextsearch)
+`StandardQueryBase.fullTextSearch`

***

@@ -241,7 +241,7 @@ called then every valid row from the table will be returned.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`limit`](QueryBase.md#limit)
+`StandardQueryBase.limit`

***
@@ -325,6 +325,10 @@ nearestToText(query, columns?): Query

```ts
offset(offset): this
```

+Set the number of rows to skip before returning results.
+
+This is useful for pagination.

#### Parameters

* **offset**: `number`

@@ -335,7 +339,7 @@ offset(offset): this

#### Inherited from

-[`QueryBase`](QueryBase.md).[`offset`](QueryBase.md#offset)
+`StandardQueryBase.offset`

***
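As a usage sketch, pagination combines `limit` and `offset` from the methods documented on this page (the table name and filter column are illustrative):

```ts
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("./.lancedb");
const table = await db.openTable("my_table"); // illustrative table name

// Page 3 with a page size of 10: skip the first 20 matching rows.
const page3 = await table
  .query()
  .where("category = 'books'") // illustrative filter column
  .limit(10)
  .offset(20)
  .toArray();
```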
@@ -388,7 +392,7 @@ object insertion order is easy to get wrong and `Map` is more foolproof.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`select`](QueryBase.md#select)
+`StandardQueryBase.select`

***

@@ -410,7 +414,7 @@ Collect the results as an array of objects.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`toArray`](QueryBase.md#toarray)
+`StandardQueryBase.toArray`

***

@@ -436,7 +440,7 @@ ArrowTable.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`toArrow`](QueryBase.md#toarrow)
+`StandardQueryBase.toArrow`

***

@@ -471,7 +475,7 @@ on the filter column(s).

#### Inherited from

-[`QueryBase`](QueryBase.md).[`where`](QueryBase.md#where)
+`StandardQueryBase.where`

***

@@ -493,4 +497,4 @@ order to perform hybrid search.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`withRowId`](QueryBase.md#withrowid)
+`StandardQueryBase.withRowId`
@@ -15,12 +15,11 @@ Common methods supported by all query types

## Extended by

 - [`Query`](Query.md)
 - [`VectorQuery`](VectorQuery.md)
+- [`TakeQuery`](TakeQuery.md)

## Type Parameters

-• **NativeQueryType** *extends* `NativeQuery` \| `NativeVectorQuery`
+• **NativeQueryType** *extends* `NativeQuery` \| `NativeVectorQuery` \| `NativeTakeQuery`

## Implements

@@ -141,104 +140,6 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();

***

### fastSearch()

```ts
fastSearch(): this
```

Skip searching un-indexed data. This can make search faster, but will miss
any data that is not yet indexed.

Use [Table#optimize](Table.md#optimize) to index all un-indexed data.

#### Returns

`this`

***

### ~~filter()~~

```ts
filter(predicate): this
```

A filter statement to be applied to this query.

#### Parameters

* **predicate**: `string`

#### Returns

`this`

#### See

where

#### Deprecated

Use `where` instead

***

### fullTextSearch()

```ts
fullTextSearch(query, options?): this
```

#### Parameters

* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>

#### Returns

`this`

***

### limit()

```ts
limit(limit): this
```

Set the maximum number of results to return.

By default, a plain search has no limit. If this method is not
called then every valid row from the table will be returned.

#### Parameters

* **limit**: `number`

#### Returns

`this`

***

### offset()

```ts
offset(offset): this
```

#### Parameters

* **offset**: `number`

#### Returns

`this`

***

### select()

@@ -328,37 +229,6 @@ ArrowTable.

***

### where()

```ts
where(predicate): this
```

A filter statement to be applied to this query.

The filter should be supplied as an SQL query string. For example:

#### Parameters

* **predicate**: `string`

#### Returns

`this`

#### Example

```ts
x > 10
y > 0 AND y < 100
x > 5 OR y = 'test'
```

Filtering performance can often be improved by creating a scalar index
on the filter column(s).

***

### withRowId()
@@ -9,7 +9,8 @@

A session for managing caches and object stores across LanceDB operations.

Sessions allow you to configure cache sizes for index and metadata caches,
-which can significantly impact performance for large datasets.
+which can significantly impact memory use and performance. They can
+also be re-used across multiple connections to share the same cache state.

## Constructors

@@ -24,8 +25,11 @@ Create a new session with custom cache sizes.

# Parameters

- `index_cache_size_bytes`: The size of the index cache in bytes.
  Index data is stored in memory in this cache to speed up queries.
+  Defaults to 6GB if not specified.
- `metadata_cache_size_bytes`: The size of the metadata cache in bytes.
  The metadata cache stores file metadata and schema information in memory.
  This cache improves scan and write performance.
+  Defaults to 1GB if not specified.

#### Parameters
@@ -674,6 +674,48 @@ console.log(tags); // { "v1": { version: 1, manifestSize: ... } }

***

### takeOffsets()

```ts
abstract takeOffsets(offsets): TakeQuery
```

Create a query that returns a subset of the rows in the table.

#### Parameters

* **offsets**: `number`[]
  The offsets of the rows to return.

#### Returns

[`TakeQuery`](TakeQuery.md)

A builder that can be used to parameterize the query.

***

### takeRowIds()

```ts
abstract takeRowIds(rowIds): TakeQuery
```

Create a query that returns a subset of the rows in the table.

#### Parameters

* **rowIds**: `number`[]
  The row ids of the rows to return.

#### Returns

[`TakeQuery`](TakeQuery.md)

A builder that can be used to parameterize the query.

***
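Because `TakeQuery` inherits `select`, `withRowId`, and `toArray` from `QueryBase`, usage of the two methods above might look like this sketch (the offsets, row ids, and column name are illustrative):

```ts
// Rows at positions 0, 5, and 42, projecting only the "id" column.
const byOffset = await table.takeOffsets([0, 5, 42]).select(["id"]).toArray();

// Rows by stable row id (ids typically captured earlier via withRowId()).
const byRowId = await table.takeRowIds([10, 11]).withRowId().toArray();
```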
|
### toArrow()
docs/src/js/classes/TakeQuery.md (new file, 265 lines)
@@ -0,0 +1,265 @@

[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / TakeQuery

# Class: TakeQuery

A query that returns a subset of the rows in the table.

## Extends

- [`QueryBase`](QueryBase.md)<`NativeTakeQuery`>

## Properties

### inner

```ts
protected inner: TakeQuery | Promise<TakeQuery>;
```

#### Inherited from

[`QueryBase`](QueryBase.md).[`inner`](QueryBase.md#inner)

## Methods

### analyzePlan()

```ts
analyzePlan(): Promise<string>
```

Executes the query and returns the physical query plan annotated with runtime metrics.

This is useful for debugging and performance analysis, as it shows how the query was executed
and includes metrics such as elapsed time, rows processed, and I/O statistics.

#### Returns

`Promise`<`string`>

A query execution plan with runtime metrics for each step.

#### Example

```ts
import * as lancedb from "@lancedb/lancedb"

const db = await lancedb.connect("./.lancedb");
const table = await db.createTable("my_table", [
  { vector: [1.1, 0.9], id: "1" },
]);

const plan = await table.query().nearestTo([0.5, 0.2]).analyzePlan();

Example output (with runtime metrics inlined):
AnalyzeExec verbose=true, metrics=[]
  ProjectionExec: expr=[id@3 as id, vector@0 as vector, _distance@2 as _distance], metrics=[output_rows=1, elapsed_compute=3.292µs]
    Take: columns="vector, _rowid, _distance, (id)", metrics=[output_rows=1, elapsed_compute=66.001µs, batches_processed=1, bytes_read=8, iops=1, requests=1]
      CoalesceBatchesExec: target_batch_size=1024, metrics=[output_rows=1, elapsed_compute=3.333µs]
        GlobalLimitExec: skip=0, fetch=10, metrics=[output_rows=1, elapsed_compute=167ns]
          FilterExec: _distance@2 IS NOT NULL, metrics=[output_rows=1, elapsed_compute=8.542µs]
            SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], metrics=[output_rows=1, elapsed_compute=63.25µs, row_replacements=1]
              KNNVectorDistance: metric=l2, metrics=[output_rows=1, elapsed_compute=114.333µs, output_batches=1]
                LanceScan: uri=/path/to/data, projection=[vector], row_id=true, row_addr=false, ordered=false, metrics=[output_rows=1, elapsed_compute=103.626µs, bytes_read=549, iops=2, requests=2]
```

#### Inherited from

[`QueryBase`](QueryBase.md).[`analyzePlan`](QueryBase.md#analyzeplan)

***

### execute()

```ts
protected execute(options?): RecordBatchIterator
```

Execute the query and return the results as an AsyncIterator of RecordBatch.

By default, LanceDb will use many threads to calculate results and, when
the result set is large, multiple batches will be processed at one time.
This readahead is limited however and backpressure will be applied if this
stream is consumed slowly (this constrains the maximum memory used by a
single query).

#### Parameters

* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>

#### Returns

[`RecordBatchIterator`](RecordBatchIterator.md)

#### Inherited from

[`QueryBase`](QueryBase.md).[`execute`](QueryBase.md#execute)

***

### explainPlan()

```ts
explainPlan(verbose): Promise<string>
```

Generates an explanation of the query execution plan.

#### Parameters

* **verbose**: `boolean` = `false`
  If true, provides a more detailed explanation. Defaults to false.

#### Returns

`Promise`<`string`>

A Promise that resolves to a string containing the query execution plan explanation.

#### Example

```ts
import * as lancedb from "@lancedb/lancedb"
const db = await lancedb.connect("./.lancedb");
const table = await db.createTable("my_table", [
  { vector: [1.1, 0.9], id: "1" },
]);
const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();
```

#### Inherited from

[`QueryBase`](QueryBase.md).[`explainPlan`](QueryBase.md#explainplan)

***

### select()

```ts
select(columns): this
```

Return only the specified columns.

By default a query will return all columns from the table. However, this can have
a very significant impact on latency. LanceDb stores data in a columnar fashion. This
means we can finely tune our I/O to select exactly the columns we need.

As a best practice you should always limit queries to the columns that you need. If you
pass in an array of column names then only those columns will be returned.

You can also use this method to create new "dynamic" columns based on your existing columns.
For example, you may not care about "a" or "b" but instead simply want "a + b". This is often
seen in the SELECT clause of an SQL query (e.g. `SELECT a+b FROM my_table`).

To create dynamic columns you can pass in a Map<string, string>. A column will be returned
for each entry in the map. The key provides the name of the column. The value is
an SQL string used to specify how the column is calculated.

For example, an SQL query might state `SELECT a + b AS combined, c`. The equivalent
input to this method would be:

#### Parameters

* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>

#### Returns

`this`

#### Example

```ts
new Map([["combined", "a + b"], ["c", "c"]])
```

Columns will always be returned in the order given, even if that order is different than
the order used when adding the data.

Note that you can pass in a `Record<string, string>` (e.g. an object literal). This method
uses `Object.entries` which should preserve the insertion order of the object. However,
object insertion order is easy to get wrong and `Map` is more foolproof.

#### Inherited from

[`QueryBase`](QueryBase.md).[`select`](QueryBase.md#select)

***
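Putting it together on a take query, a sketch of the dynamic-column form (columns `a`, `b`, and `c` are illustrative):

```ts
// Take two rows and compute a dynamic column alongside a stored one.
const rows = await table
  .takeOffsets([0, 1])
  .select(new Map([["combined", "a + b"], ["c", "c"]]))
  .toArray();
```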
|
||||
### toArray()
|
||||
|
||||
```ts
|
||||
toArray(options?): Promise<any[]>
|
||||
```
|
||||
|
||||
Collect the results as an array of objects.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`any`[]>
|
||||
|
||||
#### Inherited from
|
||||
|
||||
[`QueryBase`](QueryBase.md).[`toArray`](QueryBase.md#toarray)
|
||||
|
||||
***
|
||||
|
||||
### toArrow()
|
||||
|
||||
```ts
|
||||
toArrow(options?): Promise<Table<any>>
|
||||
```
|
||||
|
||||
Collect the results as an Arrow
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`Table`<`any`>>
|
||||
|
||||
#### See
|
||||
|
||||
ArrowTable.
|
||||
|
||||
#### Inherited from
|
||||
|
||||
[`QueryBase`](QueryBase.md).[`toArrow`](QueryBase.md#toarrow)
|
||||
|
||||
***
|
||||
|
||||
### withRowId()

```ts
withRowId(): this
```

Whether to return the row id in the results.

This column can be used to match results between different queries. For
example, to match results from a full text search and a vector search in
order to perform hybrid search.

#### Returns

`this`

#### Inherited from

[`QueryBase`](QueryBase.md).[`withRowId`](QueryBase.md#withrowid)
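
As a hedged sketch of the hybrid-search use case, the row id can act as a join key between two result sets. The `_rowid` column name and the example query values are assumptions, not taken from this page:

```ts
// Run a full text search and a vector search, each returning row ids.
const ftsHits = await tbl.query().fullTextSearch("puppy").withRowId().limit(20).toArray();
const vecHits = await tbl.query().nearestTo([0.1, 0.2]).withRowId().limit(20).toArray();

// Rows present in both result sets can then be matched on the row id column.
const ftsIds = new Set(ftsHits.map((r) => r._rowid));
const overlap = vecHits.filter((r) => ftsIds.has(r._rowid));
```
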
@@ -16,7 +16,7 @@ This builder can be reused to execute the query many times.

## Extends

- [`QueryBase`](QueryBase.md)<`NativeVectorQuery`>
- `StandardQueryBase`<`NativeVectorQuery`>

## Properties

@@ -28,7 +28,7 @@ protected inner: VectorQuery | Promise<VectorQuery>;

#### Inherited from

[`QueryBase`](QueryBase.md).[`inner`](QueryBase.md#inner)
`StandardQueryBase.inner`

## Methods

@@ -91,7 +91,7 @@ AnalyzeExec verbose=true, metrics=[]

#### Inherited from

[`QueryBase`](QueryBase.md).[`analyzePlan`](QueryBase.md#analyzeplan)
`StandardQueryBase.analyzePlan`

***

@@ -248,7 +248,7 @@ single query)

#### Inherited from

[`QueryBase`](QueryBase.md).[`execute`](QueryBase.md#execute)
`StandardQueryBase.execute`

***

@@ -284,7 +284,7 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();

#### Inherited from

[`QueryBase`](QueryBase.md).[`explainPlan`](QueryBase.md#explainplan)
`StandardQueryBase.explainPlan`

***

@@ -305,7 +305,7 @@ Use [Table#optimize](Table.md#optimize) to index all un-indexed data.

#### Inherited from

[`QueryBase`](QueryBase.md).[`fastSearch`](QueryBase.md#fastsearch)
`StandardQueryBase.fastSearch`

***

@@ -335,7 +335,7 @@ Use `where` instead

#### Inherited from

[`QueryBase`](QueryBase.md).[`filter`](QueryBase.md#filter)
`StandardQueryBase.filter`

***

@@ -357,7 +357,7 @@ fullTextSearch(query, options?): this

#### Inherited from

[`QueryBase`](QueryBase.md).[`fullTextSearch`](QueryBase.md#fulltextsearch)
`StandardQueryBase.fullTextSearch`

***

@@ -382,7 +382,7 @@ called then every valid row from the table will be returned.

#### Inherited from

[`QueryBase`](QueryBase.md).[`limit`](QueryBase.md#limit)
`StandardQueryBase.limit`

***

@@ -480,6 +480,10 @@ the minimum and maximum to the same value.
offset(offset): this
```

Set the number of rows to skip before returning results.

This is useful for pagination.

#### Parameters

* **offset**: `number`
@@ -490,7 +494,7 @@ offset(offset): this

#### Inherited from

[`QueryBase`](QueryBase.md).[`offset`](QueryBase.md#offset)
`StandardQueryBase.offset`

***
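
Since `offset` pairs with `limit` for pagination, here is a minimal sketch (the table and filter are illustrative):

```ts
// Fetch the third page of 25 results (pages numbered from zero).
const pageSize = 25;
const page = 2;
const rows = await tbl
  .query()
  .where("category = 'books'")
  .limit(pageSize)
  .offset(page * pageSize)
  .toArray();
```
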
@@ -637,7 +641,7 @@ object insertion order is easy to get wrong and `Map` is more foolproof.

#### Inherited from

[`QueryBase`](QueryBase.md).[`select`](QueryBase.md#select)
`StandardQueryBase.select`

***

@@ -659,7 +663,7 @@ Collect the results as an array of objects.

#### Inherited from

[`QueryBase`](QueryBase.md).[`toArray`](QueryBase.md#toarray)
`StandardQueryBase.toArray`

***

@@ -685,7 +689,7 @@ ArrowTable.

#### Inherited from

[`QueryBase`](QueryBase.md).[`toArrow`](QueryBase.md#toarrow)
`StandardQueryBase.toArrow`

***

@@ -720,7 +724,7 @@ on the filter column(s).

#### Inherited from

[`QueryBase`](QueryBase.md).[`where`](QueryBase.md#where)
`StandardQueryBase.where`

***

@@ -742,4 +746,4 @@ order to perform hybrid search.

#### Inherited from

[`QueryBase`](QueryBase.md).[`withRowId`](QueryBase.md#withrowid)
`StandardQueryBase.withRowId`

@@ -33,6 +33,7 @@

- [Table](classes/Table.md)
- [TagContents](classes/TagContents.md)
- [Tags](classes/Tags.md)
- [TakeQuery](classes/TakeQuery.md)
- [VectorColumnOptions](classes/VectorColumnOptions.md)
- [VectorQuery](classes/VectorQuery.md)
@@ -26,6 +26,18 @@ will be used to determine the most useful kind of index to create.

***

### name?

```ts
optional name: string;
```

Optional custom name for the index.

If not provided, a default name will be generated based on the column name.

***

### replace?

```ts
@@ -42,8 +54,27 @@ The default is true

***

### train?

```ts
optional train: boolean;
```

Whether to train the index with existing data.

If true (default), the index will be trained with existing data in the table.
If false, the index will be created empty and populated as new data is added.

Note: This option is only supported for scalar indices. Vector indices always train.

***

### waitTimeoutSeconds?

```ts
optional waitTimeoutSeconds: number;
```

Timeout in seconds to wait for index creation to complete.

If not specified, the method will return immediately after starting the index creation.
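
To see how these options combine, here is a hedged sketch of a `createIndex` call (mirroring the test added later in this diff; the table and column names are illustrative):

```ts
import { Index } from "@lancedb/lancedb";

// Build a scalar BTree index with a custom name; train: false leaves it
// empty so it fills in as new data arrives, and waitTimeoutSeconds blocks
// until creation completes (or times out).
await tbl.createIndex("id", {
  config: Index.btree(),
  name: "id_btree",
  train: false,
  waitTimeoutSeconds: 60,
});
```
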
@@ -44,3 +44,17 @@ optional readTimeout: number;
The timeout for reading data from the server in seconds. Default is 300
seconds (5 minutes). This can also be set via the environment variable
`LANCE_CLIENT_READ_TIMEOUT`, as an integer number of seconds.

***

### timeout?

```ts
optional timeout: number;
```

The overall timeout for the entire request in seconds. This includes
connection, send, and read time. If the entire request doesn't complete
within this time, it will fail. Default is None (no overall timeout).
This can also be set via the environment variable `LANCE_CLIENT_TIMEOUT`,
as an integer number of seconds.
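
For a remote connection these fields are supplied through the client configuration. A hedged sketch follows; the field names come from the `TimeoutConfig` interface documented here, but the exact `connect` option shape and the URI are assumptions:

```ts
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("db://my-project", {
  apiKey: process.env.LANCEDB_API_KEY!,
  clientConfig: {
    // Fail a single read after 120 s, and the whole request after 300 s.
    timeoutConfig: { readTimeout: 120, timeout: 300 },
  },
});
```
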
@@ -15,7 +15,7 @@ publish = false
crate-type = ["cdylib"]

[dependencies]
lancedb = { path = "../../../rust/lancedb" }
lancedb = { path = "../../../rust/lancedb", default-features = false }
lance = { workspace = true }
arrow = { workspace = true, features = ["ffi"] }
arrow-schema.workspace = true
@@ -25,3 +25,6 @@ snafu.workspace = true
lazy_static.workspace = true
serde = { version = "^1" }
serde_json = { version = "1" }

[features]
default = ["lancedb/default"]
@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.2-final.0</version>
<version>0.22.0-beta.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.2-final.0</version>
<version>0.22.0-beta.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -6,7 +6,7 @@

<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.2-final.0</version>
<version>0.22.0-beta.1</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.21.2"
version = "0.22.0-beta.1"
license.workspace = true
description.workspace = true
repository.workspace = true
@@ -18,7 +18,7 @@ arrow-array.workspace = true
arrow-schema.workspace = true
env_logger.workspace = true
futures.workspace = true
lancedb = { path = "../rust/lancedb" }
lancedb = { path = "../rust/lancedb", default-features = false }
napi = { version = "2.16.8", default-features = false, features = [
  "napi9",
  "async"
@@ -36,6 +36,6 @@ aws-lc-rs = "=1.13.0"
napi-build = "2.1"

[features]
default = ["remote"]
default = ["remote", "lancedb/default"]
fp16kernels = ["lancedb/fp16kernels"]
remote = ["lancedb/remote"]
@@ -287,6 +287,12 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      expect(res2[1].id).toEqual(data2.id);
    });

    it("should support take queries", async () => {
      await table.add([{ id: 1 }, { id: 2 }, { id: 3 }]);
      const res = await table.takeOffsets([1, 2]).toArrow();
      expect(res.getChild("id")?.toJSON()).toEqual([2, 3]);
    });

    it("should return the table as an instance of an arrow table", async () => {
      const arrowTbl = await table.toArrow();
      expect(arrowTbl).toBeInstanceOf(ArrowTable);
@@ -557,7 +563,7 @@ describe("When creating an index", () => {

    // test offset
    rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
    expect(rst.numRows).toBe(1);
    expect(rst.numRows).toBe(2);

    // test nprobes
    rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
@@ -696,7 +702,7 @@ describe("When creating an index", () => {

    // test offset
    rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
    expect(rst.numRows).toBe(1);
    expect(rst.numRows).toBe(2);

    // test ef
    rst = await tbl.query().limit(2).nearestTo(queryVec).ef(100).toArrow();
@@ -851,6 +857,40 @@ describe("When creating an index", () => {
    expect(stats).toBeUndefined();
  });

  test("should support name and train parameters", async () => {
    // Test with custom name
    await tbl.createIndex("vec", {
      config: Index.ivfPq({ numPartitions: 4 }),
      name: "my_custom_vector_index",
    });

    const indices = await tbl.listIndices();
    expect(indices).toHaveLength(1);
    expect(indices[0].name).toBe("my_custom_vector_index");

    // Test scalar index with train=false
    await tbl.createIndex("id", {
      config: Index.btree(),
      name: "btree_empty",
      train: false,
    });

    const allIndices = await tbl.listIndices();
    expect(allIndices).toHaveLength(2);
    expect(allIndices.some((idx) => idx.name === "btree_empty")).toBe(true);

    // Test with both name and train=true (use tags column)
    await tbl.createIndex("tags", {
      config: Index.labelList(),
      name: "tags_trained",
      train: true,
    });

    const finalIndices = await tbl.listIndices();
    expect(finalIndices).toHaveLength(3);
    expect(finalIndices.some((idx) => idx.name === "tags_trained")).toBe(true);
  });

  test("create ivf_flat with binary vectors", async () => {
    const db = await connect(tmpDir.name);
    const binarySchema = new Schema([
@@ -12,7 +12,7 @@ test("ann index examples", async () => {
|
||||
// --8<-- [start:ingest]
|
||||
const db = await lancedb.connect(databaseDir);
|
||||
|
||||
const data = Array.from({ length: 5_000 }, (_, i) => ({
|
||||
const data = Array.from({ length: 1_000 }, (_, i) => ({
|
||||
vector: Array(128).fill(i),
|
||||
id: `${i}`,
|
||||
content: "",
|
||||
@@ -24,8 +24,8 @@ test("ann index examples", async () => {
|
||||
});
|
||||
await table.createIndex("vector", {
|
||||
config: lancedb.Index.ivfPq({
|
||||
numPartitions: 10,
|
||||
numSubVectors: 16,
|
||||
numPartitions: 30,
|
||||
numSubVectors: 8,
|
||||
}),
|
||||
});
|
||||
// --8<-- [end:ingest]
|
||||
|
||||
@@ -159,17 +159,33 @@ export abstract class Connection {
   *
   * Tables will be returned in lexicographical order.
   * @param {Partial<TableNamesOptions>} options - options to control the
   * paging / start point
   * paging / start point (backwards compatibility)
   *
   */
  abstract tableNames(options?: Partial<TableNamesOptions>): Promise<string[]>;
  /**
   * List all the table names in this database.
   *
   * Tables will be returned in lexicographical order.
   * @param {string[]} namespace - The namespace to list tables from (defaults to root namespace)
   * @param {Partial<TableNamesOptions>} options - options to control the
   * paging / start point
   *
   */
  abstract tableNames(
    namespace?: string[],
    options?: Partial<TableNamesOptions>,
  ): Promise<string[]>;

  /**
   * Open a table in the database.
   * @param {string} name - The name of the table
   * @param {string[]} namespace - The namespace of the table (defaults to root namespace)
   * @param {Partial<OpenTableOptions>} options - Additional options
   */
  abstract openTable(
    name: string,
    namespace?: string[],
    options?: Partial<OpenTableOptions>,
  ): Promise<Table>;

@@ -178,6 +194,7 @@ export abstract class Connection {
   * @param {object} options - The options object.
   * @param {string} options.name - The name of the table.
   * @param {Data} options.data - Non-empty Array of Records to be inserted into the table
   * @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
   *
   */
  abstract createTable(
@@ -185,40 +202,72 @@ export abstract class Connection {
      name: string;
      data: Data;
    } & Partial<CreateTableOptions>,
    namespace?: string[],
  ): Promise<Table>;
  /**
   * Creates a new Table and initialize it with new data.
   * @param {string} name - The name of the table.
   * @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
   * to be inserted into the table
   * @param {Partial<CreateTableOptions>} options - Additional options (backwards compatibility)
   */
  abstract createTable(
    name: string,
    data: Record<string, unknown>[] | TableLike,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table>;
  /**
   * Creates a new Table and initialize it with new data.
   * @param {string} name - The name of the table.
   * @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
   * to be inserted into the table
   * @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
   * @param {Partial<CreateTableOptions>} options - Additional options
   */
  abstract createTable(
    name: string,
    data: Record<string, unknown>[] | TableLike,
    namespace?: string[],
    options?: Partial<CreateTableOptions>,
  ): Promise<Table>;

  /**
   * Creates a new empty Table
   * @param {string} name - The name of the table.
   * @param {Schema} schema - The schema of the table
   * @param {Partial<CreateTableOptions>} options - Additional options (backwards compatibility)
   */
  abstract createEmptyTable(
    name: string,
    schema: import("./arrow").SchemaLike,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table>;
  /**
   * Creates a new empty Table
   * @param {string} name - The name of the table.
   * @param {Schema} schema - The schema of the table
   * @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
   * @param {Partial<CreateTableOptions>} options - Additional options
   */
  abstract createEmptyTable(
    name: string,
    schema: import("./arrow").SchemaLike,
    namespace?: string[],
    options?: Partial<CreateTableOptions>,
  ): Promise<Table>;

  /**
   * Drop an existing table.
   * @param {string} name The name of the table to drop.
   * @param {string[]} namespace The namespace of the table (defaults to root namespace).
   */
  abstract dropTable(name: string): Promise<void>;
  abstract dropTable(name: string, namespace?: string[]): Promise<void>;

  /**
   * Drop all tables in the database.
   * @param {string[]} namespace The namespace to drop tables from (defaults to root namespace).
   */
  abstract dropAllTables(): Promise<void>;
  abstract dropAllTables(namespace?: string[]): Promise<void>;
}

/** @hideconstructor */
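
To make the dual `tableNames` signatures concrete, a hedged sketch of both call styles (the namespace path is illustrative):

```ts
// Backwards-compatible style: options object only.
const all = await db.tableNames({ limit: 50 });

// Namespace-aware style: list tables under a child namespace.
const scoped = await db.tableNames(["prod", "search"], { limit: 50 });
```
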
@@ -243,16 +292,39 @@ export class LocalConnection extends Connection {
    return this.inner.display();
  }

  async tableNames(options?: Partial<TableNamesOptions>): Promise<string[]> {
    return this.inner.tableNames(options?.startAfter, options?.limit);
  async tableNames(
    namespaceOrOptions?: string[] | Partial<TableNamesOptions>,
    options?: Partial<TableNamesOptions>,
  ): Promise<string[]> {
    // Detect if first argument is namespace array or options object
    let namespace: string[] | undefined;
    let tableNamesOptions: Partial<TableNamesOptions> | undefined;

    if (Array.isArray(namespaceOrOptions)) {
      // First argument is namespace array
      namespace = namespaceOrOptions;
      tableNamesOptions = options;
    } else {
      // First argument is options object (backwards compatibility)
      namespace = undefined;
      tableNamesOptions = namespaceOrOptions;
    }

    return this.inner.tableNames(
      namespace ?? [],
      tableNamesOptions?.startAfter,
      tableNamesOptions?.limit,
    );
  }

  async openTable(
    name: string,
    namespace?: string[],
    options?: Partial<OpenTableOptions>,
  ): Promise<Table> {
    const innerTable = await this.inner.openTable(
      name,
      namespace ?? [],
      cleanseStorageOptions(options?.storageOptions),
      options?.indexCacheSize,
    );
@@ -286,14 +358,44 @@ export class LocalConnection extends Connection {
    nameOrOptions:
      | string
      | ({ name: string; data: Data } & Partial<CreateTableOptions>),
    data?: Record<string, unknown>[] | TableLike,
    dataOrNamespace?: Record<string, unknown>[] | TableLike | string[],
    namespaceOrOptions?: string[] | Partial<CreateTableOptions>,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table> {
    if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
      const { name, data, ...options } = nameOrOptions;

      return this.createTable(name, data, options);
      // First overload: createTable(options, namespace?)
      const { name, data, ...createOptions } = nameOrOptions;
      const namespace = dataOrNamespace as string[] | undefined;
      return this._createTableImpl(name, data, namespace, createOptions);
    }

    // Second overload: createTable(name, data, namespace?, options?)
    const name = nameOrOptions;
    const data = dataOrNamespace as Record<string, unknown>[] | TableLike;

    // Detect if third argument is namespace array or options object
    let namespace: string[] | undefined;
    let createOptions: Partial<CreateTableOptions> | undefined;

    if (Array.isArray(namespaceOrOptions)) {
      // Third argument is namespace array
      namespace = namespaceOrOptions;
      createOptions = options;
    } else {
      // Third argument is options object (backwards compatibility)
      namespace = undefined;
      createOptions = namespaceOrOptions;
    }

    return this._createTableImpl(name, data, namespace, createOptions);
  }

  private async _createTableImpl(
    name: string,
    data: Data,
    namespace?: string[],
    options?: Partial<CreateTableOptions>,
  ): Promise<Table> {
    if (data === undefined) {
      throw new Error("data is required");
    }
@@ -302,9 +404,10 @@ export class LocalConnection extends Connection {
    const storageOptions = this.getStorageOptions(options);

    const innerTable = await this.inner.createTable(
      nameOrOptions,
      name,
      buf,
      mode,
      namespace ?? [],
      storageOptions,
    );

@@ -314,39 +417,55 @@ export class LocalConnection extends Connection {
  async createEmptyTable(
    name: string,
    schema: import("./arrow").SchemaLike,
    namespaceOrOptions?: string[] | Partial<CreateTableOptions>,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table> {
    let mode: string = options?.mode ?? "create";
    const existOk = options?.existOk ?? false;
    // Detect if third argument is namespace array or options object
    let namespace: string[] | undefined;
    let createOptions: Partial<CreateTableOptions> | undefined;

    if (Array.isArray(namespaceOrOptions)) {
      // Third argument is namespace array
      namespace = namespaceOrOptions;
      createOptions = options;
    } else {
      // Third argument is options object (backwards compatibility)
      namespace = undefined;
      createOptions = namespaceOrOptions;
    }

    let mode: string = createOptions?.mode ?? "create";
    const existOk = createOptions?.existOk ?? false;

    if (mode === "create" && existOk) {
      mode = "exist_ok";
    }
    let metadata: Map<string, string> | undefined = undefined;
    if (options?.embeddingFunction !== undefined) {
      const embeddingFunction = options.embeddingFunction;
    if (createOptions?.embeddingFunction !== undefined) {
      const embeddingFunction = createOptions.embeddingFunction;
      const registry = getRegistry();
      metadata = registry.getTableMetadata([embeddingFunction]);
    }

    const storageOptions = this.getStorageOptions(options);
    const storageOptions = this.getStorageOptions(createOptions);
    const table = makeEmptyTable(schema, metadata);
    const buf = await fromTableToBuffer(table);
    const innerTable = await this.inner.createEmptyTable(
      name,
      buf,
      mode,
      namespace ?? [],
      storageOptions,
    );
    return new LocalTable(innerTable);
  }

  async dropTable(name: string): Promise<void> {
    return this.inner.dropTable(name);
  async dropTable(name: string, namespace?: string[]): Promise<void> {
    return this.inner.dropTable(name, namespace ?? []);
  }

  async dropAllTables(): Promise<void> {
    return this.inner.dropAllTables();
  async dropAllTables(namespace?: string[]): Promise<void> {
    return this.inner.dropAllTables(namespace ?? []);
  }
}
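
The same namespace argument threads through the table lifecycle. A hedged end-to-end sketch (the namespace path and data are illustrative):

```ts
// Create a table inside the ["prod"] namespace, then open and drop it there.
const tbl = await db.createTable("items", [{ id: 1 }], ["prod"]);
const again = await db.openTable("items", ["prod"]);
await db.dropTable("items", ["prod"]);
```
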
@@ -59,6 +59,7 @@ export {
  Query,
  QueryBase,
  VectorQuery,
  TakeQuery,
  QueryExecutionOptions,
  FullTextSearchOptions,
  RecordBatchIterator,
@@ -700,5 +700,27 @@ export interface IndexOptions {
   */
  replace?: boolean;

  /**
   * Timeout in seconds to wait for index creation to complete.
   *
   * If not specified, the method will return immediately after starting the index creation.
   */
  waitTimeoutSeconds?: number;

  /**
   * Optional custom name for the index.
   *
   * If not provided, a default name will be generated based on the column name.
   */
  name?: string;

  /**
   * Whether to train the index with existing data.
   *
   * If true (default), the index will be trained with existing data in the table.
   * If false, the index will be created empty and populated as new data is added.
   *
   * Note: This option is only supported for scalar indices. Vector indices always train.
   */
  train?: boolean;
}
@@ -15,6 +15,7 @@ import {
  RecordBatchIterator as NativeBatchIterator,
  Query as NativeQuery,
  Table as NativeTable,
  TakeQuery as NativeTakeQuery,
  VectorQuery as NativeVectorQuery,
} from "./native";
import { Reranker } from "./rerankers";
@@ -50,7 +51,7 @@ export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
/* eslint-enable */

class RecordBatchIterable<
  NativeQueryType extends NativeQuery | NativeVectorQuery,
  NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
> implements AsyncIterable<RecordBatch>
{
  private inner: NativeQueryType;
@@ -107,8 +108,9 @@ export interface FullTextSearchOptions {
 *
 * @hideconstructor
 */
export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
  implements AsyncIterable<RecordBatch>
export class QueryBase<
  NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
> implements AsyncIterable<RecordBatch>
{
  /**
   * @hidden
@@ -133,56 +135,6 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
      fn(this.inner);
    }
  }
  /**
   * A filter statement to be applied to this query.
   *
   * The filter should be supplied as an SQL query string. For example:
   * @example
   * x > 10
   * y > 0 AND y < 100
   * x > 5 OR y = 'test'
   *
   * Filtering performance can often be improved by creating a scalar index
   * on the filter column(s).
   */
  where(predicate: string): this {
    this.doCall((inner: NativeQueryType) => inner.onlyIf(predicate));
    return this;
  }
  /**
   * A filter statement to be applied to this query.
   * @see where
   * @deprecated Use `where` instead
   */
  filter(predicate: string): this {
    return this.where(predicate);
  }

  fullTextSearch(
    query: string | FullTextQuery,
    options?: Partial<FullTextSearchOptions>,
  ): this {
    let columns: string[] | null = null;
    if (options) {
      if (typeof options.columns === "string") {
        columns = [options.columns];
      } else if (Array.isArray(options.columns)) {
        columns = options.columns;
      }
    }

    this.doCall((inner: NativeQueryType) => {
      if (typeof query === "string") {
        inner.fullTextSearch({
          query: query,
          columns: columns,
        });
      } else {
        inner.fullTextSearch({ query: query.inner });
      }
    });
    return this;
  }

  /**
   * Return only the specified columns.
@@ -241,33 +193,6 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
    return this;
  }

  /**
   * Set the maximum number of results to return.
   *
   * By default, a plain search has no limit. If this method is not
   * called then every valid row from the table will be returned.
   */
  limit(limit: number): this {
    this.doCall((inner: NativeQueryType) => inner.limit(limit));
    return this;
  }

  offset(offset: number): this {
    this.doCall((inner: NativeQueryType) => inner.offset(offset));
    return this;
  }

  /**
   * Skip searching un-indexed data. This can make search faster, but will miss
   * any data that is not yet indexed.
   *
   * Use {@link Table#optimize} to index all un-indexed data.
   */
  fastSearch(): this {
    this.doCall((inner: NativeQueryType) => inner.fastSearch());
    return this;
  }

  /**
   * Whether to return the row id in the results.
   *
@@ -403,6 +328,100 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
  }
}
export class StandardQueryBase<
  NativeQueryType extends NativeQuery | NativeVectorQuery,
>
  extends QueryBase<NativeQueryType>
  implements ExecutableQuery
{
  constructor(inner: NativeQueryType | Promise<NativeQueryType>) {
    super(inner);
  }

  /**
   * A filter statement to be applied to this query.
   *
   * The filter should be supplied as an SQL query string. For example:
   * @example
   * x > 10
   * y > 0 AND y < 100
   * x > 5 OR y = 'test'
   *
   * Filtering performance can often be improved by creating a scalar index
   * on the filter column(s).
   */
  where(predicate: string): this {
    this.doCall((inner: NativeQueryType) => inner.onlyIf(predicate));
    return this;
  }
  /**
   * A filter statement to be applied to this query.
   * @see where
   * @deprecated Use `where` instead
   */
  filter(predicate: string): this {
    return this.where(predicate);
  }

  fullTextSearch(
    query: string | FullTextQuery,
    options?: Partial<FullTextSearchOptions>,
  ): this {
    let columns: string[] | null = null;
    if (options) {
      if (typeof options.columns === "string") {
        columns = [options.columns];
      } else if (Array.isArray(options.columns)) {
        columns = options.columns;
      }
    }

    this.doCall((inner: NativeQueryType) => {
      if (typeof query === "string") {
        inner.fullTextSearch({
          query: query,
          columns: columns,
        });
      } else {
        inner.fullTextSearch({ query: query.inner });
      }
    });
    return this;
  }

  /**
   * Set the maximum number of results to return.
   *
   * By default, a plain search has no limit. If this method is not
   * called then every valid row from the table will be returned.
   */
  limit(limit: number): this {
    this.doCall((inner: NativeQueryType) => inner.limit(limit));
    return this;
  }

  /**
   * Set the number of rows to skip before returning results.
   *
   * This is useful for pagination.
   */
  offset(offset: number): this {
    this.doCall((inner: NativeQueryType) => inner.offset(offset));
    return this;
  }

  /**
   * Skip searching un-indexed data. This can make search faster, but will miss
   * any data that is not yet indexed.
   *
   * Use {@link Table#optimize} to index all un-indexed data.
   */
  fastSearch(): this {
    this.doCall((inner: NativeQueryType) => inner.fastSearch());
    return this;
  }
}

/**
 * An interface for a query that can be executed
 *
@@ -419,7 +438,7 @@ export interface ExecutableQuery {}
 *
 * @hideconstructor
 */
export class VectorQuery extends QueryBase<NativeVectorQuery> {
export class VectorQuery extends StandardQueryBase<NativeVectorQuery> {
  /**
   * @hidden
   */
@@ -679,13 +698,24 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
  }
}

/**
 * A query that returns a subset of the rows in the table.
 *
 * @hideconstructor
 */
export class TakeQuery extends QueryBase<NativeTakeQuery> {
  constructor(inner: NativeTakeQuery) {
    super(inner);
  }
}

/** A builder for LanceDB queries.
 *
 * @see {@link Table#query}, {@link Table#search}
 *
 * @hideconstructor
 */
export class Query extends QueryBase<NativeQuery> {
export class Query extends StandardQueryBase<NativeQuery> {
  /**
   * @hidden
   */
@@ -35,6 +35,7 @@ import {
import {
  FullTextQuery,
  Query,
  TakeQuery,
  VectorQuery,
  instanceOfFullTextQuery,
} from "./query";
@@ -336,6 +337,20 @@ export abstract class Table {
   */
  abstract query(): Query;

  /**
   * Create a query that returns a subset of the rows in the table.
   * @param offsets The offsets of the rows to return.
   * @returns A builder that can be used to parameterize the query.
   */
  abstract takeOffsets(offsets: number[]): TakeQuery;

  /**
   * Create a query that returns a subset of the rows in the table.
   * @param rowIds The row ids of the rows to return.
   * @returns A builder that can be used to parameterize the query.
   */
  abstract takeRowIds(rowIds: number[]): TakeQuery;

  /**
   * Create a search query to find the nearest neighbors
   * of the given query
@@ -647,6 +662,8 @@ export class LocalTable extends Table {
      column,
      options?.replace,
      options?.waitTimeoutSeconds,
      options?.name,
      options?.train,
    );
  }

@@ -665,6 +682,14 @@ export class LocalTable extends Table {
    await this.inner.waitForIndex(indexNames, timeoutSeconds);
  }

  takeOffsets(offsets: number[]): TakeQuery {
    return new TakeQuery(this.inner.takeOffsets(offsets));
  }

  takeRowIds(rowIds: number[]): TakeQuery {
    return new TakeQuery(this.inner.takeRowIds(rowIds));
  }

  query(): Query {
    return new Query(this.inner);
  }
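
A hedged usage sketch for the new take queries (mirroring the test added earlier in this diff; the `_rowid` column name is an assumption):

```ts
// Fetch specific rows by physical offset; the builder still supports select().
const byOffset = await tbl.takeOffsets([1, 2]).select(["id"]).toArrow();

// Or fetch by stable row ids captured from an earlier query.
const seed = await tbl.query().withRowId().limit(3).toArray();
const byRowId = await tbl.takeRowIds(seed.map((r) => r._rowid)).toArray();
```
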
@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-darwin-arm64",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": ["darwin"],
  "cpu": ["arm64"],
  "main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-darwin-x64",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": ["darwin"],
  "cpu": ["x64"],
  "main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-linux-arm64-gnu",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": ["linux"],
  "cpu": ["arm64"],
  "main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-linux-arm64-musl",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": ["linux"],
  "cpu": ["arm64"],
  "main": "lancedb.linux-arm64-musl.node",

@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-linux-x64-gnu",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": ["linux"],
  "cpu": ["x64"],
  "main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-linux-x64-musl",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": ["linux"],
  "cpu": ["x64"],
  "main": "lancedb.linux-x64-musl.node",

@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-win32-arm64-msvc",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": [
    "win32"
  ],

@@ -1,6 +1,6 @@
{
  "name": "@lancedb/lancedb-win32-x64-msvc",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "os": ["win32"],
  "cpu": ["x64"],
  "main": "lancedb.win32-x64-msvc.node",
nodejs/package-lock.json (generated)
@@ -1,12 +1,12 @@
{
  "name": "@lancedb/lancedb",
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "@lancedb/lancedb",
      "version": "0.21.2",
      "version": "0.22.0-beta.1",
      "cpu": [
        "x64",
        "arm64"

@@ -11,7 +11,7 @@
    "ann"
  ],
  "private": false,
  "version": "0.21.2",
  "version": "0.22.0-beta.1",
  "main": "dist/index.js",
  "exports": {
    ".": "./dist/index.js",
@@ -100,10 +100,12 @@ impl Connection {
    #[napi(catch_unwind)]
    pub async fn table_names(
        &self,
        namespace: Vec<String>,
        start_after: Option<String>,
        limit: Option<u32>,
    ) -> napi::Result<Vec<String>> {
        let mut op = self.get_inner()?.table_names();
        op = op.namespace(namespace);
        if let Some(start_after) = start_after {
            op = op.start_after(start_after);
        }
@@ -125,6 +127,7 @@ impl Connection {
        name: String,
        buf: Buffer,
        mode: String,
        namespace: Vec<String>,
        storage_options: Option<HashMap<String, String>>,
    ) -> napi::Result<Table> {
        let batches = ipc_file_to_batches(buf.to_vec())
@@ -132,6 +135,8 @@ impl Connection {
        let mode = Self::parse_create_mode_str(&mode)?;
        let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);

        builder = builder.namespace(namespace);

        if let Some(storage_options) = storage_options {
            for (key, value) in storage_options {
                builder = builder.storage_option(key, value);
@@ -147,6 +152,7 @@ impl Connection {
        name: String,
        schema_buf: Buffer,
        mode: String,
        namespace: Vec<String>,
        storage_options: Option<HashMap<String, String>>,
    ) -> napi::Result<Table> {
        let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
@@ -157,6 +163,9 @@ impl Connection {
            .get_inner()?
            .create_empty_table(&name, schema)
            .mode(mode);

        builder = builder.namespace(namespace);

        if let Some(storage_options) = storage_options {
            for (key, value) in storage_options {
                builder = builder.storage_option(key, value);
@@ -170,10 +179,14 @@ impl Connection {
    pub async fn open_table(
        &self,
        name: String,
        namespace: Vec<String>,
        storage_options: Option<HashMap<String, String>>,
        index_cache_size: Option<u32>,
    ) -> napi::Result<Table> {
        let mut builder = self.get_inner()?.open_table(&name);

        builder = builder.namespace(namespace);

        if let Some(storage_options) = storage_options {
            for (key, value) in storage_options {
                builder = builder.storage_option(key, value);
@@ -188,12 +201,18 @@ impl Connection {

    /// Drop table with the name. Or raise an error if the table does not exist.
    #[napi(catch_unwind)]
    pub async fn drop_table(&self, name: String) -> napi::Result<()> {
        self.get_inner()?.drop_table(&name).await.default_error()
    pub async fn drop_table(&self, name: String, namespace: Vec<String>) -> napi::Result<()> {
        self.get_inner()?
            .drop_table(&name, &namespace)
            .await
            .default_error()
    }

    #[napi(catch_unwind)]
    pub async fn drop_all_tables(&self) -> napi::Result<()> {
        self.get_inner()?.drop_all_tables().await.default_error()
    pub async fn drop_all_tables(&self, namespace: Vec<String>) -> napi::Result<()> {
        self.get_inner()?
            .drop_all_tables(&namespace)
            .await
            .default_error()
    }
}
@@ -12,6 +12,7 @@ use lancedb::query::Query as LanceDbQuery;
use lancedb::query::QueryBase;
use lancedb::query::QueryExecutionOptions;
use lancedb::query::Select;
use lancedb::query::TakeQuery as LanceDbTakeQuery;
use lancedb::query::VectorQuery as LanceDbVectorQuery;
use napi::bindgen_prelude::*;
use napi_derive::napi;
@@ -319,6 +320,79 @@ impl VectorQuery {
    }
}

#[napi]
pub struct TakeQuery {
    inner: LanceDbTakeQuery,
}

#[napi]
impl TakeQuery {
    pub fn new(query: LanceDbTakeQuery) -> Self {
        Self { inner: query }
    }

    #[napi]
    pub fn select(&mut self, columns: Vec<(String, String)>) {
        self.inner = self.inner.clone().select(Select::dynamic(&columns));
    }

    #[napi]
    pub fn select_columns(&mut self, columns: Vec<String>) {
        self.inner = self.inner.clone().select(Select::columns(&columns));
    }

    #[napi]
    pub fn with_row_id(&mut self) {
        self.inner = self.inner.clone().with_row_id();
    }

    #[napi(catch_unwind)]
    pub async fn execute(
        &self,
        max_batch_length: Option<u32>,
        timeout_ms: Option<u32>,
    ) -> napi::Result<RecordBatchIterator> {
        let mut execution_opts = QueryExecutionOptions::default();
        if let Some(max_batch_length) = max_batch_length {
            execution_opts.max_batch_length = max_batch_length;
        }
        if let Some(timeout_ms) = timeout_ms {
            execution_opts.timeout = Some(std::time::Duration::from_millis(timeout_ms as u64))
        }
        let inner_stream = self
            .inner
            .execute_with_options(execution_opts)
            .await
            .map_err(|e| {
                napi::Error::from_reason(format!(
                    "Failed to execute query stream: {}",
                    convert_error(&e)
                ))
            })?;
        Ok(RecordBatchIterator::new(inner_stream))
    }

    #[napi]
    pub async fn explain_plan(&self, verbose: bool) -> napi::Result<String> {
        self.inner.explain_plan(verbose).await.map_err(|e| {
            napi::Error::from_reason(format!(
                "Failed to retrieve the query plan: {}",
                convert_error(&e)
            ))
        })
    }

    #[napi(catch_unwind)]
    pub async fn analyze_plan(&self) -> napi::Result<String> {
        self.inner.analyze_plan().await.map_err(|e| {
            napi::Error::from_reason(format!(
                "Failed to execute analyze plan: {}",
                convert_error(&e)
            ))
        })
    }
}

#[napi]
#[derive(Debug, Clone)]
pub struct JsFullTextQuery {
@@ -406,6 +480,7 @@ impl JsFullTextQuery {
    }

    #[napi(factory)]
    #[allow(clippy::use_self)] // NAPI doesn't allow Self here but clippy reports it
    pub fn boolean_query(queries: Vec<(String, &JsFullTextQuery)>) -> napi::Result<Self> {
        let mut sub_queries = Vec::with_capacity(queries.len());
        for (occur, q) in queries {
@@ -76,6 +76,7 @@ pub struct ClientConfig {
    pub retry_config: Option<RetryConfig>,
    pub timeout_config: Option<TimeoutConfig>,
    pub extra_headers: Option<HashMap<String, String>>,
    pub id_delimiter: Option<String>,
}

impl From<TimeoutConfig> for lancedb::remote::TimeoutConfig {
@@ -115,6 +116,7 @@ impl From<ClientConfig> for lancedb::remote::ClientConfig {
            retry_config: config.retry_config.map(Into::into).unwrap_or_default(),
            timeout_config: config.timeout_config.map(Into::into).unwrap_or_default(),
            extra_headers: config.extra_headers.unwrap_or_default(),
            id_delimiter: config.id_delimiter,
        }
    }
}

@@ -94,7 +94,7 @@ impl napi::bindgen_prelude::FromNapiValue for Session {
        env: napi::sys::napi_env,
        napi_val: napi::sys::napi_value,
    ) -> napi::Result<Self> {
        let object: napi::bindgen_prelude::ClassInstance<Session> =
        let object: napi::bindgen_prelude::ClassInstance<Self> =
            napi::bindgen_prelude::ClassInstance::from_napi_value(env, napi_val)?;
        let copy = object.clone();
        Ok(copy)
@@ -15,7 +15,7 @@ use napi_derive::napi;
use crate::error::NapiErrorExt;
use crate::index::Index;
use crate::merge::NativeMergeInsertBuilder;
use crate::query::{Query, VectorQuery};
use crate::query::{Query, TakeQuery, VectorQuery};

#[napi]
pub struct Table {
@@ -114,6 +114,8 @@ impl Table {
        column: String,
        replace: Option<bool>,
        wait_timeout_s: Option<i64>,
        name: Option<String>,
        train: Option<bool>,
    ) -> napi::Result<()> {
        let lancedb_index = if let Some(index) = index {
            index.consume()?
@@ -128,6 +130,12 @@ impl Table {
            builder =
                builder.wait_timeout(std::time::Duration::from_secs(timeout.try_into().unwrap()));
        }
        if let Some(name) = name {
            builder = builder.name(name);
        }
        if let Some(train) = train {
            builder = builder.train(train);
        }
        builder.execute().await.default_error()
    }

@@ -187,6 +195,44 @@ impl Table {
        Ok(Query::new(self.inner_ref()?.query()))
    }

    #[napi(catch_unwind)]
    pub fn take_offsets(&self, offsets: Vec<i64>) -> napi::Result<TakeQuery> {
        Ok(TakeQuery::new(
            self.inner_ref()?.take_offsets(
                offsets
                    .into_iter()
                    .map(|o| {
                        u64::try_from(o).map_err(|e| {
                            napi::Error::from_reason(format!(
                                "Failed to convert offset to u64: {}",
                                e
                            ))
                        })
                    })
                    .collect::<Result<Vec<_>>>()?,
            ),
        ))
    }

    #[napi(catch_unwind)]
    pub fn take_row_ids(&self, row_ids: Vec<i64>) -> napi::Result<TakeQuery> {
        Ok(TakeQuery::new(
            self.inner_ref()?.take_row_ids(
                row_ids
                    .into_iter()
                    .map(|o| {
                        u64::try_from(o).map_err(|e| {
                            napi::Error::from_reason(format!(
                                "Failed to convert row id to u64: {}",
                                e
                            ))
                        })
                    })
                    .collect::<Result<Vec<_>>>()?,
            ),
        ))
    }

    #[napi(catch_unwind)]
    pub fn vector_search(&self, vector: Float32Array) -> napi::Result<VectorQuery> {
        self.query()?.nearest_to(vector)
@@ -2,6 +2,7 @@
  "intentionallyNotExported": [
    "lancedb/native.d.ts:Query",
    "lancedb/native.d.ts:VectorQuery",
    "lancedb/native.d.ts:TakeQuery",
    "lancedb/native.d.ts:RecordBatchIterator",
    "lancedb/native.d.ts:NativeMergeInsertBuilder"
  ],

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.24.2"
current_version = "0.25.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.24.2"
version = "0.25.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true
@@ -33,6 +33,6 @@ pyo3-build-config = { version = "0.24", features = [
] }

[features]
default = ["remote"]
default = ["remote", "lancedb/default"]
fp16kernels = ["lancedb/fp16kernels"]
remote = ["lancedb/remote"]
@@ -10,6 +10,7 @@ dependencies = [
  "pyarrow>=16",
  "pydantic>=1.10",
  "tqdm>=4.27.0",
  "lance-namespace==0.0.6"
]
description = "lancedb"
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]

@@ -19,6 +19,7 @@ from .remote.db import RemoteDBConnection
from .schema import vector
from .table import AsyncTable
from ._lancedb import Session
from .namespace import connect_namespace, LanceNamespaceDBConnection


def connect(
@@ -221,6 +222,7 @@ async def connect_async(
__all__ = [
    "connect",
    "connect_async",
    "connect_namespace",
    "AsyncConnection",
    "AsyncTable",
    "URI",
@@ -228,6 +230,7 @@ __all__ = [
    "vector",
    "DBConnection",
    "LanceDBConnection",
    "LanceNamespaceDBConnection",
    "RemoteDBConnection",
    "Session",
    "__version__",
@@ -21,14 +21,28 @@ class Session:

class Connection(object):
    uri: str
    async def is_open(self): ...
    async def close(self): ...
    async def list_namespaces(
        self,
        namespace: List[str],
        page_token: Optional[str],
        limit: Optional[int],
    ) -> List[str]: ...
    async def create_namespace(self, namespace: List[str]) -> None: ...
    async def drop_namespace(self, namespace: List[str]) -> None: ...
    async def table_names(
        self, start_after: Optional[str], limit: Optional[int]
        self,
        namespace: List[str],
        start_after: Optional[str],
        limit: Optional[int],
    ) -> list[str]: ...
    async def create_table(
        self,
        name: str,
        mode: str,
        data: pa.RecordBatchReader,
        namespace: List[str] = [],
        storage_options: Optional[Dict[str, str]] = None,
    ) -> Table: ...
    async def create_empty_table(
@@ -36,10 +50,25 @@ class Connection(object):
        name: str,
        mode: str,
        schema: pa.Schema,
        namespace: List[str] = [],
        storage_options: Optional[Dict[str, str]] = None,
    ) -> Table: ...
    async def rename_table(self, old_name: str, new_name: str) -> None: ...
    async def drop_table(self, name: str) -> None: ...
    async def open_table(
        self,
        name: str,
        namespace: List[str] = [],
        storage_options: Optional[Dict[str, str]] = None,
        index_cache_size: Optional[int] = None,
    ) -> Table: ...
    async def rename_table(
        self,
        cur_name: str,
        new_name: str,
        cur_namespace: List[str] = [],
        new_namespace: List[str] = [],
    ) -> None: ...
    async def drop_table(self, name: str, namespace: List[str] = []) -> None: ...
    async def drop_all_tables(self, namespace: List[str] = []) -> None: ...

class Table:
    def name(self) -> str: ...
@@ -59,6 +88,10 @@ class Table:
        column: str,
        index: Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS],
        replace: Optional[bool],
        wait_timeout: Optional[object],
        *,
        name: Optional[str],
        train: Optional[bool],
    ): ...
    async def list_versions(self) -> List[Dict[str, Any]]: ...
    async def version(self) -> int: ...
@@ -43,14 +43,70 @@ if TYPE_CHECKING:
class DBConnection(EnforceOverrides):
    """An active LanceDB connection interface."""

    def list_namespaces(
        self,
        namespace: List[str] = [],
        page_token: Optional[str] = None,
        limit: int = 10,
    ) -> Iterable[str]:
        """List immediate child namespace names in the given namespace.

        Parameters
        ----------
        namespace: List[str], default []
            The parent namespace to list namespaces in.
            Empty list represents root namespace.
        page_token: str, optional
            The token to use for pagination. If not present, start from the beginning.
        limit: int, default 10
            The size of the page to return.

        Returns
        -------
        Iterable of str
            List of immediate child namespace names
        """
        return []

    def create_namespace(self, namespace: List[str]) -> None:
        """Create a new namespace.

        Parameters
        ----------
        namespace: List[str]
            The namespace identifier to create.
        """
        raise NotImplementedError(
            "Namespace operations are not supported for this connection type"
        )

    def drop_namespace(self, namespace: List[str]) -> None:
        """Drop a namespace.

        Parameters
        ----------
        namespace: List[str]
            The namespace identifier to drop.
        """
        raise NotImplementedError(
            "Namespace operations are not supported for this connection type"
        )

    @abstractmethod
    def table_names(
        self, page_token: Optional[str] = None, limit: int = 10
        self,
        page_token: Optional[str] = None,
        limit: int = 10,
        *,
        namespace: List[str] = [],
    ) -> Iterable[str]:
        """List all tables in this database, in sorted order

        Parameters
        ----------
        namespace: List[str], default []
            The namespace to list tables in.
            Empty list represents root namespace.
        page_token: str, optional
            The token to use for pagination. If not present, start from the beginning.
            Typically, this token is the last table name from the previous page.
@@ -77,6 +133,7 @@ class DBConnection(EnforceOverrides):
        fill_value: float = 0.0,
        embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
        *,
        namespace: List[str] = [],
        storage_options: Optional[Dict[str, str]] = None,
        data_storage_version: Optional[str] = None,
        enable_v2_manifest_paths: Optional[bool] = None,
@@ -87,6 +144,9 @@ class DBConnection(EnforceOverrides):
        ----------
        name: str
            The name of the table.
        namespace: List[str], default []
            The namespace to create the table in.
            Empty list represents root namespace.
        data: The data to initialize the table, *optional*
            User must provide at least one of `data` or `schema`.
            Acceptable types are:
@@ -238,6 +298,7 @@ class DBConnection(EnforceOverrides):
        self,
        name: str,
        *,
        namespace: List[str] = [],
        storage_options: Optional[Dict[str, str]] = None,
        index_cache_size: Optional[int] = None,
    ) -> Table:
@@ -247,6 +308,9 @@ class DBConnection(EnforceOverrides):
        ----------
        name: str
            The name of the table.
        namespace: List[str], optional
            The namespace to open the table from.
            None or empty list represents root namespace.
        index_cache_size: int, default 256
            **Deprecated**: Use session-level cache configuration instead.
            Create a Session with custom cache sizes and pass it to lancedb.connect().
@@ -272,17 +336,26 @@ class DBConnection(EnforceOverrides):
        """
        raise NotImplementedError

    def drop_table(self, name: str):
    def drop_table(self, name: str, namespace: List[str] = []):
        """Drop a table from the database.

        Parameters
        ----------
        name: str
            The name of the table.
        namespace: List[str], default []
            The namespace to drop the table from.
            Empty list represents root namespace.
        """
        raise NotImplementedError

    def rename_table(self, cur_name: str, new_name: str):
    def rename_table(
        self,
        cur_name: str,
        new_name: str,
        cur_namespace: List[str] = [],
        new_namespace: List[str] = [],
    ):
        """Rename a table in the database.

        Parameters
@@ -291,6 +364,12 @@ class DBConnection(EnforceOverrides):
            The current name of the table.
        new_name: str
            The new name of the table.
        cur_namespace: List[str], optional
            The namespace of the current table.
            None or empty list represents root namespace.
        new_namespace: List[str], optional
            The namespace to move the table to.
            If not specified, defaults to the same as cur_namespace.
        """
        raise NotImplementedError

@@ -301,9 +380,15 @@ class DBConnection(EnforceOverrides):
        """
        raise NotImplementedError

    def drop_all_tables(self):
    def drop_all_tables(self, namespace: List[str] = []):
        """
        Drop all tables from the database

        Parameters
        ----------
        namespace: List[str], optional
            The namespace to drop all tables from.
            None or empty list represents root namespace.
        """
        raise NotImplementedError
@@ -404,18 +489,87 @@ class LanceDBConnection(DBConnection):
|
||||
conn = AsyncConnection(await lancedb_connect(self.uri))
|
||||
return await conn.table_names(start_after=start_after, limit=limit)
|
||||
|
||||
@override
|
||||
def list_namespaces(
|
||||
self,
|
||||
namespace: List[str] = [],
|
||||
page_token: Optional[str] = None,
|
||||
limit: int = 10,
|
||||
) -> Iterable[str]:
|
||||
"""List immediate child namespace names in the given namespace.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str], optional
|
||||
The parent namespace to list namespaces in.
|
||||
None or empty list represents root namespace.
|
||||
page_token: str, optional
|
||||
The token to use for pagination. If not present, start from the beginning.
|
||||
limit: int, default 10
|
||||
The size of the page to return.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Iterable of str
|
||||
List of immediate child namespace names
|
||||
"""
|
||||
return LOOP.run(
|
||||
self._conn.list_namespaces(
|
||||
namespace=namespace, page_token=page_token, limit=limit
|
||||
)
|
||||
)
|
||||
|
||||
@override
|
||||
def create_namespace(self, namespace: List[str]) -> None:
|
||||
"""Create a new namespace.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str]
|
||||
The namespace identifier to create.
|
||||
"""
|
||||
LOOP.run(self._conn.create_namespace(namespace=namespace))
|
||||
|
||||
@override
|
||||
def drop_namespace(self, namespace: List[str]) -> None:
|
||||
"""Drop a namespace.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str]
|
||||
The namespace identifier to drop.
|
||||
"""
|
||||
return LOOP.run(self._conn.drop_namespace(namespace=namespace))
|
||||
|
||||
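The synchronous wrappers above simply delegate to the async connection through LOOP.run. A minimal usage sketch of the new namespace management API (the database path and namespace names are hypothetical):

import lancedb

db = lancedb.connect("/tmp/sample-lancedb")  # hypothetical local database
db.create_namespace(["prod"])                # create a child namespace under root
print(list(db.list_namespaces()))            # first page of child names, default limit 10
db.drop_namespace(["prod"])                  # remove it again

Per the docstrings, the empty list always denotes the root namespace; whether nested namespaces are supported depends on the backing implementation.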
@override
def table_names(
self, page_token: Optional[str] = None, limit: int = 10
self,
page_token: Optional[str] = None,
limit: int = 10,
*,
namespace: List[str] = [],
) -> Iterable[str]:
"""Get the names of all tables in the database. The names are sorted.

Parameters
----------
namespace: List[str], optional
The namespace to list tables in.
page_token: str, optional
The token to use for pagination.
limit: int, default 10
The maximum number of tables to return.

Returns
-------
Iterator of str.
A list of table names.
"""
return LOOP.run(self._conn.table_names(start_after=page_token, limit=limit))
return LOOP.run(
self._conn.table_names(
namespace=namespace, start_after=page_token, limit=limit
)
)

def __len__(self) -> int:
return len(self.table_names())
@@ -435,12 +589,18 @@ class LanceDBConnection(DBConnection):
fill_value: float = 0.0,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str]] = None,
data_storage_version: Optional[str] = None,
enable_v2_manifest_paths: Optional[bool] = None,
) -> LanceTable:
"""Create a table in the database.

Parameters
----------
namespace: List[str], optional
The namespace to create the table in.

See
---
DBConnection.create_table
@@ -459,6 +619,7 @@ class LanceDBConnection(DBConnection):
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
embedding_functions=embedding_functions,
namespace=namespace,
storage_options=storage_options,
)
return tbl
@@ -468,6 +629,7 @@ class LanceDBConnection(DBConnection):
self,
name: str,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str]] = None,
index_cache_size: Optional[int] = None,
) -> LanceTable:
@@ -477,6 +639,8 @@ class LanceDBConnection(DBConnection):
----------
name: str
The name of the table.
namespace: List[str], optional
The namespace to open the table from.

Returns
-------
@@ -496,26 +660,68 @@ class LanceDBConnection(DBConnection):
return LanceTable.open(
self,
name,
namespace=namespace,
storage_options=storage_options,
index_cache_size=index_cache_size,
)

@override
def drop_table(self, name: str, ignore_missing: bool = False):
def drop_table(
self,
name: str,
namespace: List[str] = [],
ignore_missing: bool = False,
):
"""Drop a table from the database.

Parameters
----------
name: str
The name of the table.
namespace: List[str], optional
The namespace to drop the table from.
ignore_missing: bool, default False
If True, ignore if the table does not exist.
"""
LOOP.run(self._conn.drop_table(name, ignore_missing=ignore_missing))
LOOP.run(
self._conn.drop_table(
name, namespace=namespace, ignore_missing=ignore_missing
)
)

@override
def drop_all_tables(self):
LOOP.run(self._conn.drop_all_tables())
def drop_all_tables(self, namespace: List[str] = []):
LOOP.run(self._conn.drop_all_tables(namespace=namespace))

@override
def rename_table(
self,
cur_name: str,
new_name: str,
cur_namespace: List[str] = [],
new_namespace: List[str] = [],
):
"""Rename a table in the database.

Parameters
----------
cur_name: str
The current name of the table.
new_name: str
The new name of the table.
cur_namespace: List[str], optional
The namespace of the current table.
new_namespace: List[str], optional
The namespace to move the table to.
"""
LOOP.run(
self._conn.rename_table(
cur_name,
new_name,
cur_namespace=cur_namespace,
new_namespace=new_namespace,
)
)
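A sketch of the namespace-aware rename added above (table and namespace names are hypothetical). Per the docstring, new_namespace defaults to the current namespace when omitted, so a plain rename stays in place:

db.create_namespace(["archive"])
db.rename_table("events", "events_2024", cur_namespace=[], new_namespace=["archive"])
db.rename_table("users", "customers")  # same-namespace rename

Whether the local backend actually supports renames or cross-namespace moves depends on the storage layer; this only shows the call shape.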
@deprecation.deprecated(
deprecated_in="0.15.1",
@@ -588,13 +794,67 @@ class AsyncConnection(object):
def uri(self) -> str:
return self._inner.uri

async def list_namespaces(
self,
namespace: List[str] = [],
page_token: Optional[str] = None,
limit: int = 10,
) -> Iterable[str]:
"""List immediate child namespace names in the given namespace.

Parameters
----------
namespace: List[str], optional
The parent namespace to list namespaces in.
None or empty list represents root namespace.
page_token: str, optional
The token to use for pagination. If not present, start from the beginning.
limit: int, default 10
The size of the page to return.

Returns
-------
Iterable of str
List of immediate child namespace names (not full paths)
"""
return await self._inner.list_namespaces(
namespace=namespace, page_token=page_token, limit=limit
)

async def create_namespace(self, namespace: List[str]) -> None:
"""Create a new namespace.

Parameters
----------
namespace: List[str]
The namespace identifier to create.
"""
await self._inner.create_namespace(namespace)

async def drop_namespace(self, namespace: List[str]) -> None:
"""Drop a namespace.

Parameters
----------
namespace: List[str]
The namespace identifier to drop.
"""
await self._inner.drop_namespace(namespace)
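The async API mirrors the sync one. A sketch using connect_async (paths and names hypothetical):

import asyncio
import lancedb

async def main():
    conn = await lancedb.connect_async("/tmp/sample-lancedb")
    await conn.create_namespace(["staging"])
    print(await conn.list_namespaces())  # immediate children of the root namespace
    await conn.drop_namespace(["staging"])

asyncio.run(main())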
async def table_names(
self, *, start_after: Optional[str] = None, limit: Optional[int] = None
self,
*,
namespace: List[str] = [],
start_after: Optional[str] = None,
limit: Optional[int] = None,
) -> Iterable[str]:
"""List all tables in this database, in sorted order

Parameters
----------
namespace: List[str], optional
The namespace to list tables in.
None or empty list represents root namespace.
start_after: str, optional
If present, only return names that come lexicographically after the supplied
value.
@@ -608,7 +868,9 @@ class AsyncConnection(object):
-------
Iterable of str
"""
return await self._inner.table_names(start_after=start_after, limit=limit)
return await self._inner.table_names(
namespace=namespace, start_after=start_after, limit=limit
)

async def create_table(
self,
@@ -621,6 +883,7 @@ class AsyncConnection(object):
fill_value: Optional[float] = None,
storage_options: Optional[Dict[str, str]] = None,
*,
namespace: List[str] = [],
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
) -> AsyncTable:
"""Create an [AsyncTable][lancedb.table.AsyncTable] in the database.
@@ -629,6 +892,9 @@ class AsyncConnection(object):
----------
name: str
The name of the table.
namespace: List[str], default []
The namespace to create the table in.
Empty list represents root namespace.
data: The data to initialize the table, *optional*
User must provide at least one of `data` or `schema`.
Acceptable types are:
@@ -807,6 +1073,7 @@ class AsyncConnection(object):
name,
mode,
schema,
namespace=namespace,
storage_options=storage_options,
)
else:
@@ -815,6 +1082,7 @@ class AsyncConnection(object):
name,
mode,
data,
namespace=namespace,
storage_options=storage_options,
)

@@ -823,6 +1091,8 @@ class AsyncConnection(object):
async def open_table(
self,
name: str,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str]] = None,
index_cache_size: Optional[int] = None,
) -> AsyncTable:
@@ -832,6 +1102,9 @@ class AsyncConnection(object):
----------
name: str
The name of the table.
namespace: List[str], optional
The namespace to open the table from.
None or empty list represents root namespace.
storage_options: dict, optional
Additional options for the storage backend. Options already set on the
connection will be inherited by the table, but can be overridden here.
@@ -855,42 +1128,77 @@ class AsyncConnection(object):
-------
A LanceTable object representing the table.
"""
table = await self._inner.open_table(name, storage_options, index_cache_size)
table = await self._inner.open_table(
name,
namespace=namespace,
storage_options=storage_options,
index_cache_size=index_cache_size,
)
return AsyncTable(table)

async def rename_table(self, old_name: str, new_name: str):
async def rename_table(
self,
cur_name: str,
new_name: str,
cur_namespace: List[str] = [],
new_namespace: List[str] = [],
):
"""Rename a table in the database.

Parameters
----------
old_name: str
cur_name: str
The current name of the table.
new_name: str
The new name of the table.
cur_namespace: List[str], optional
The namespace of the current table.
None or empty list represents root namespace.
new_namespace: List[str], optional
The namespace to move the table to.
If not specified, defaults to the same as cur_namespace.
"""
await self._inner.rename_table(old_name, new_name)
await self._inner.rename_table(
cur_name, new_name, cur_namespace=cur_namespace, new_namespace=new_namespace
)

async def drop_table(self, name: str, *, ignore_missing: bool = False):
async def drop_table(
self,
name: str,
*,
namespace: List[str] = [],
ignore_missing: bool = False,
):
"""Drop a table from the database.

Parameters
----------
name: str
The name of the table.
namespace: List[str], default []
The namespace to drop the table from.
Empty list represents root namespace.
ignore_missing: bool, default False
If True, ignore if the table does not exist.
"""
try:
await self._inner.drop_table(name)
await self._inner.drop_table(name, namespace=namespace)
except ValueError as e:
if not ignore_missing:
raise e
if f"Table '{name}' was not found" not in str(e):
raise e

async def drop_all_tables(self):
"""Drop all tables from the database."""
await self._inner.drop_all_tables()
async def drop_all_tables(self, namespace: List[str] = []):
"""Drop all tables from the database.

Parameters
----------
namespace: List[str], optional
The namespace to drop all tables from.
None or empty list represents root namespace.
"""
await self._inner.drop_all_tables(namespace=namespace)

@deprecation.deprecated(
deprecated_in="0.15.1",
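Taken together, the changes above make every table operation on AsyncConnection namespace-aware. A sketch of the full round trip (all names hypothetical):

import asyncio
import lancedb

async def main():
    conn = await lancedb.connect_async("/tmp/sample-lancedb")
    await conn.create_namespace(["prod"])
    await conn.create_table("vectors", data=[{"vector": [1.0, 2.0]}], namespace=["prod"])
    tbl = await conn.open_table("vectors", namespace=["prod"])
    print(await conn.table_names(namespace=["prod"]))
    await conn.drop_table("vectors", namespace=["prod"], ignore_missing=True)
    await conn.drop_all_tables(namespace=["prod"])

asyncio.run(main())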
401 python/python/lancedb/namespace.py Normal file
@@ -0,0 +1,401 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors

"""
LanceDB Namespace integration module.

This module provides integration with lance_namespace for managing tables
through a namespace abstraction.
"""

from __future__ import annotations

from typing import Dict, Iterable, List, Optional, Union
import os

from lancedb.db import DBConnection
from lancedb.table import LanceTable, Table
from lancedb.util import validate_table_name
from lancedb.common import validate_schema
from lancedb.table import sanitize_create_table
from overrides import override

from lance_namespace import LanceNamespace, connect as namespace_connect
from lance_namespace_urllib3_client.models import (
ListTablesRequest,
DescribeTableRequest,
CreateTableRequest,
DropTableRequest,
ListNamespacesRequest,
CreateNamespaceRequest,
DropNamespaceRequest,
JsonArrowSchema,
JsonArrowField,
JsonArrowDataType,
)

import pyarrow as pa
from datetime import timedelta
from lancedb.pydantic import LanceModel
from lancedb.common import DATA
from lancedb.embeddings import EmbeddingFunctionConfig
from ._lancedb import Session


def _convert_pyarrow_type_to_json(arrow_type: pa.DataType) -> JsonArrowDataType:
"""Convert PyArrow DataType to JsonArrowDataType."""
if pa.types.is_null(arrow_type):
type_name = "null"
elif pa.types.is_boolean(arrow_type):
type_name = "bool"
elif pa.types.is_int8(arrow_type):
type_name = "int8"
elif pa.types.is_uint8(arrow_type):
type_name = "uint8"
elif pa.types.is_int16(arrow_type):
type_name = "int16"
elif pa.types.is_uint16(arrow_type):
type_name = "uint16"
elif pa.types.is_int32(arrow_type):
type_name = "int32"
elif pa.types.is_uint32(arrow_type):
type_name = "uint32"
elif pa.types.is_int64(arrow_type):
type_name = "int64"
elif pa.types.is_uint64(arrow_type):
type_name = "uint64"
elif pa.types.is_float32(arrow_type):
type_name = "float32"
elif pa.types.is_float64(arrow_type):
type_name = "float64"
elif pa.types.is_string(arrow_type):
type_name = "utf8"
elif pa.types.is_binary(arrow_type):
type_name = "binary"
elif pa.types.is_list(arrow_type):
# For list types, we need more complex handling
type_name = "list"
elif pa.types.is_fixed_size_list(arrow_type):
type_name = "fixed_size_list"
else:
# Default to string representation for unsupported types
type_name = str(arrow_type)

return JsonArrowDataType(type=type_name)


def _convert_pyarrow_schema_to_json(schema: pa.Schema) -> JsonArrowSchema:
"""Convert PyArrow Schema to JsonArrowSchema."""
fields = []
for field in schema:
json_field = JsonArrowField(
name=field.name,
type=_convert_pyarrow_type_to_json(field.type),
nullable=field.nullable,
metadata=field.metadata,
)
fields.append(json_field)

return JsonArrowSchema(fields=fields, metadata=schema.metadata)
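These converters are module-private helpers, but a quick sketch shows what they produce (importing a private name is for illustration only):

import pyarrow as pa
from lancedb.namespace import _convert_pyarrow_schema_to_json

schema = pa.schema(
    [pa.field("id", pa.int64(), nullable=False), pa.field("text", pa.string())]
)
json_schema = _convert_pyarrow_schema_to_json(schema)
print([f.type.type for f in json_schema.fields])  # expected: ['int64', 'utf8']

Note that for list and fixed-size-list columns the converter records only the outer type name, not the child type, so a vector column's element type and dimension are not carried into this JSON form.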
class LanceNamespaceDBConnection(DBConnection):
"""
A LanceDB connection that uses a namespace for table management.

This connection delegates table URI resolution to a lance_namespace instance,
while using the standard LanceTable for actual table operations.
"""

def __init__(
self,
namespace: LanceNamespace,
*,
read_consistency_interval: Optional[timedelta] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
):
"""
Initialize a namespace-based LanceDB connection.

Parameters
----------
namespace : LanceNamespace
The namespace instance to use for table management
read_consistency_interval : Optional[timedelta]
The interval at which to check for updates to the table from other
processes. If None, then consistency is not checked.
storage_options : Optional[Dict[str, str]]
Additional options for the storage backend
session : Optional[Session]
A session to use for this connection
"""
self._ns = namespace
self.read_consistency_interval = read_consistency_interval
self.storage_options = storage_options or {}
self.session = session

@override
def table_names(
self,
page_token: Optional[str] = None,
limit: int = 10,
*,
namespace: List[str] = [],
) -> Iterable[str]:
request = ListTablesRequest(id=namespace, page_token=page_token, limit=limit)
response = self._ns.list_tables(request)
return response.tables if response.tables else []

@override
def create_table(
self,
name: str,
data: Optional[DATA] = None,
schema: Optional[Union[pa.Schema, LanceModel]] = None,
mode: str = "create",
exist_ok: bool = False,
on_bad_vectors: str = "error",
fill_value: float = 0.0,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str]] = None,
data_storage_version: Optional[str] = None,
enable_v2_manifest_paths: Optional[bool] = None,
) -> Table:
if mode.lower() not in ["create", "overwrite"]:
raise ValueError("mode must be either 'create' or 'overwrite'")
validate_table_name(name)

# TODO: support passing data
if data is not None:
raise ValueError(
"create_table currently only supports creating empty tables (data=None)"
)

# Prepare schema
metadata = None
if embedding_functions is not None:
from lancedb.embeddings.registry import EmbeddingFunctionRegistry

registry = EmbeddingFunctionRegistry.get_instance()
metadata = registry.get_table_metadata(embedding_functions)

data, schema = sanitize_create_table(
data, schema, metadata, on_bad_vectors, fill_value
)
validate_schema(schema)

# Convert PyArrow schema to JsonArrowSchema
json_schema = _convert_pyarrow_schema_to_json(schema)

# Create table request with namespace
table_id = namespace + [name]
request = CreateTableRequest(id=table_id, var_schema=json_schema)

# Create empty Arrow IPC stream bytes
import pyarrow.ipc as ipc
import io

empty_table = pa.Table.from_arrays(
[pa.array([], type=field.type) for field in schema], schema=schema
)
buffer = io.BytesIO()
with ipc.new_stream(buffer, schema) as writer:
writer.write_table(empty_table)
request_data = buffer.getvalue()

self._ns.create_table(request, request_data)
return self.open_table(
name, namespace=namespace, storage_options=storage_options
)

@override
def open_table(
self,
name: str,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str]] = None,
index_cache_size: Optional[int] = None,
) -> Table:
table_id = namespace + [name]
request = DescribeTableRequest(id=table_id)
response = self._ns.describe_table(request)

merged_storage_options = dict()
if storage_options:
merged_storage_options.update(storage_options)
if response.storage_options:
merged_storage_options.update(response.storage_options)

return self._lance_table_from_uri(
response.location,
storage_options=merged_storage_options,
index_cache_size=index_cache_size,
)

@override
def drop_table(self, name: str, namespace: List[str] = []):
# Use namespace drop_table directly
table_id = namespace + [name]
request = DropTableRequest(id=table_id)
self._ns.drop_table(request)

@override
def rename_table(
self,
cur_name: str,
new_name: str,
cur_namespace: List[str] = [],
new_namespace: List[str] = [],
):
raise NotImplementedError(
"rename_table is not supported for namespace connections"
)

@override
def drop_database(self):
raise NotImplementedError(
"drop_database is deprecated, use drop_all_tables instead"
)

@override
def drop_all_tables(self, namespace: List[str] = []):
for table_name in self.table_names(namespace=namespace):
self.drop_table(table_name, namespace=namespace)

@override
def list_namespaces(
self,
namespace: List[str] = [],
page_token: Optional[str] = None,
limit: int = 10,
) -> Iterable[str]:
"""
List child namespaces under the given namespace.

Parameters
----------
namespace : Optional[List[str]]
The parent namespace to list children from.
If None, lists root-level namespaces.
page_token : Optional[str]
Pagination token for listing results.
limit : int
Maximum number of namespaces to return.

Returns
-------
Iterable[str]
Names of child namespaces.
"""
request = ListNamespacesRequest(
id=namespace, page_token=page_token, limit=limit
)
response = self._ns.list_namespaces(request)
return response.namespaces if response.namespaces else []

@override
def create_namespace(self, namespace: List[str]) -> None:
"""
Create a new namespace.

Parameters
----------
namespace : List[str]
The namespace path to create.
"""
request = CreateNamespaceRequest(id=namespace)
self._ns.create_namespace(request)

@override
def drop_namespace(self, namespace: List[str]) -> None:
"""
Drop a namespace.

Parameters
----------
namespace : List[str]
The namespace path to drop.
"""
request = DropNamespaceRequest(id=namespace)
self._ns.drop_namespace(request)

def _lance_table_from_uri(
self,
table_uri: str,
*,
storage_options: Optional[Dict[str, str]] = None,
index_cache_size: Optional[int] = None,
) -> LanceTable:
# Extract the base path and table name from the URI
if table_uri.endswith(".lance"):
base_path = os.path.dirname(table_uri)
table_name = os.path.basename(table_uri)[:-6] # Remove .lance
else:
raise ValueError(f"Invalid table URI: {table_uri}")

from lancedb.db import LanceDBConnection

temp_conn = LanceDBConnection(
base_path,
read_consistency_interval=self.read_consistency_interval,
storage_options={**self.storage_options, **(storage_options or {})},
session=self.session,
)

# Open the table using the temporary connection
return LanceTable.open(
temp_conn,
table_name,
storage_options=storage_options,
index_cache_size=index_cache_size,
)


def connect_namespace(
impl: str,
properties: Dict[str, str],
*,
read_consistency_interval: Optional[timedelta] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
) -> LanceNamespaceDBConnection:
"""
Connect to a LanceDB database through a namespace.

Parameters
----------
impl : str
The namespace implementation to use. For example:
- "dir" for DirectoryNamespace
- "rest" for REST-based namespace
- Full module path for custom implementations
properties : Dict[str, str]
Configuration properties for the namespace implementation.
Different namespace implementations have different config properties.
For example, use DirectoryNamespace with {"root": "/path/to/directory"}
read_consistency_interval : Optional[timedelta]
The interval at which to check for updates to the table from other
processes. If None, then consistency is not checked.
storage_options : Optional[Dict[str, str]]
Additional options for the storage backend
session : Optional[Session]
A session to use for this connection

Returns
-------
LanceNamespaceDBConnection
A namespace-based connection to LanceDB
"""
namespace = namespace_connect(impl, properties)

# Return the namespace-based connection
return LanceNamespaceDBConnection(
namespace,
read_consistency_interval=read_consistency_interval,
storage_options=storage_options,
session=session,
)
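A usage sketch for connect_namespace, assuming the lance_namespace "dir" implementation rooted at a local directory (the path is hypothetical). As the implementation above shows, this connection can only create empty tables (data must be None) and rename_table is not supported:

import pyarrow as pa
from lancedb.namespace import connect_namespace

db = connect_namespace("dir", {"root": "/tmp/lance-ns"})
schema = pa.schema([pa.field("id", pa.int64()), pa.field("text", pa.string())])
tbl = db.create_table("items", schema=schema)  # empty table; data=None only
print(list(db.table_names()))
db.drop_table("items")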
@@ -28,6 +28,7 @@ import pyarrow.fs as pa_fs
import pydantic

from lancedb.pydantic import PYDANTIC_VERSION
from lancedb.background_loop import LOOP

from . import __version__
from .arrow import AsyncRecordBatchReader
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
from ._lancedb import FTSQuery as LanceFTSQuery
from ._lancedb import HybridQuery as LanceHybridQuery
from ._lancedb import VectorQuery as LanceVectorQuery
from ._lancedb import TakeQuery as LanceTakeQuery
from ._lancedb import PyQueryRequest
from .common import VEC
from .pydantic import LanceModel
@@ -910,7 +912,7 @@ class LanceQueryBuilder(ABC):
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...

@@ -941,20 +943,22 @@ class LanceQueryBuilder(ABC):
>>> query = [100, 100]
>>> plan = table.search(query).analyze_plan()
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
AnalyzeExec verbose=true, metrics=[]
TracedExec, metrics=[]
ProjectionExec: expr=[...], metrics=[...]
GlobalLimitExec: skip=0, fetch=10, metrics=[...]
AnalyzeExec verbose=true, metrics=[], cumulative_cpu=...
TracedExec, metrics=[], cumulative_cpu=...
ProjectionExec: expr=[...], metrics=[...], cumulative_cpu=...
GlobalLimitExec: skip=0, fetch=10, metrics=[...], cumulative_cpu=...
FilterExec: _distance@2 IS NOT NULL,
metrics=[output_rows=..., elapsed_compute=...]
metrics=[output_rows=..., elapsed_compute=...], cumulative_cpu=...
SortExec: TopK(fetch=10), expr=[...],
preserve_partitioning=[...],
metrics=[output_rows=..., elapsed_compute=..., row_replacements=...]
metrics=[output_rows=..., elapsed_compute=..., row_replacements=...],
cumulative_cpu=...
KNNVectorDistance: metric=l2,
metrics=[output_rows=..., elapsed_compute=..., output_batches=...]
metrics=[output_rows=..., elapsed_compute=..., output_batches=...],
cumulative_cpu=...
LanceRead: uri=..., projection=[vector], ...
metrics=[output_rows=..., elapsed_compute=...,
bytes_read=..., iops=..., requests=...]
bytes_read=..., iops=..., requests=...], cumulative_cpu=...

Returns
-------
@@ -2041,11 +2045,11 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
>>> plan = table.search(query).explain_plan(True)
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...

Parameters
----------
@@ -2139,7 +2143,11 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):


class AsyncQueryBase(object):
def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
"""
Base class for all async queries (take, scan, vector, fts, hybrid)
"""

def __init__(self, inner: Union[LanceQuery, LanceVectorQuery, LanceTakeQuery]):
"""
Construct an AsyncQueryBase

@@ -2149,27 +2157,14 @@ class AsyncQueryBase(object):
self._inner = inner

def to_query_object(self) -> Query:
"""
Convert the query into a query object

This is currently experimental but can be useful as the query object is pure
python and more easily serializable.
"""
return Query.from_inner(self._inner.to_query_request())

def where(self, predicate: str) -> Self:
"""
Only return rows matching the given predicate

The predicate should be supplied as an SQL query string.

Examples
--------

>>> predicate = "x > 10"
>>> predicate = "y > 0 AND y < 100"
>>> predicate = "x > 5 OR y = 'test'"

Filtering performance can often be improved by creating a scalar index
on the filter column(s).
"""
self._inner.where(predicate)
return self

def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
"""
Return only the specified columns.
@@ -2208,42 +2203,6 @@ class AsyncQueryBase(object):
raise TypeError("columns must be a list of column names or a dict")
return self

def limit(self, limit: int) -> Self:
"""
Set the maximum number of results to return.

By default, a plain search has no limit. If this method is not
called then every valid row from the table will be returned.
"""
self._inner.limit(limit)
return self

def offset(self, offset: int) -> Self:
"""
Set the offset for the results.

Parameters
----------
offset: int
The offset to start fetching results from.
"""
self._inner.offset(offset)
return self

def fast_search(self) -> Self:
"""
Skip searching un-indexed data.

This can make queries faster, but will miss any data that has not been
indexed.

!!! tip
You can add new data into an existing index by calling
[AsyncTable.optimize][lancedb.table.AsyncTable.optimize].
"""
self._inner.fast_search()
return self

def with_row_id(self) -> Self:
"""
Include the _rowid column in the results.
@@ -2251,27 +2210,6 @@ class AsyncQueryBase(object):
self._inner.with_row_id()
return self

def postfilter(self) -> Self:
"""
If this is called then filtering will happen after the search instead of
before.
By default filtering will be performed before the search. This is how
filtering is typically understood to work. This prefilter step does add some
additional latency. Creating a scalar index on the filter column(s) can
often improve this latency. However, sometimes a filter is too complex or
scalar indices cannot be applied to the column. In these cases postfiltering
can be used instead of prefiltering to improve latency.
Post filtering applies the filter to the results of the search. This
means we only run the filter on a much smaller set of data. However, it can
cause the query to return fewer than `limit` results (or even no results) if
none of the nearest results match the filter.
Post filtering happens during the "refine stage" (described in more detail in
@see {@link VectorQuery#refineFactor}). This means that setting a higher refine
factor can often help restore some of the results lost by post filtering.
"""
self._inner.postfilter()
return self

async def to_batches(
self,
*,
@@ -2295,7 +2233,9 @@ class AsyncQueryBase(object):
complete within the specified time, an error will be raised.
"""
return AsyncRecordBatchReader(
await self._inner.execute(max_batch_length, timeout)
await self._inner.execute(
max_batch_length=max_batch_length, timeout=timeout
)
)

async def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
@@ -2429,7 +2369,7 @@ class AsyncQueryBase(object):
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...

@@ -2454,7 +2394,98 @@ class AsyncQueryBase(object):
return await self._inner.analyze_plan()


class AsyncQuery(AsyncQueryBase):
class AsyncStandardQuery(AsyncQueryBase):
"""
Base class for "standard" async queries (all but take currently)
"""

def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
"""
Construct an AsyncStandardQuery

This method is not intended to be called directly. Instead, use the
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
"""
super().__init__(inner)

def where(self, predicate: str) -> Self:
"""
Only return rows matching the given predicate

The predicate should be supplied as an SQL query string.

Examples
--------

>>> predicate = "x > 10"
>>> predicate = "y > 0 AND y < 100"
>>> predicate = "x > 5 OR y = 'test'"

Filtering performance can often be improved by creating a scalar index
on the filter column(s).
"""
self._inner.where(predicate)
return self

def limit(self, limit: int) -> Self:
"""
Set the maximum number of results to return.

By default, a plain search has no limit. If this method is not
called then every valid row from the table will be returned.
"""
self._inner.limit(limit)
return self

def offset(self, offset: int) -> Self:
"""
Set the offset for the results.

Parameters
----------
offset: int
The offset to start fetching results from.
"""
self._inner.offset(offset)
return self

def fast_search(self) -> Self:
"""
Skip searching un-indexed data.

This can make queries faster, but will miss any data that has not been
indexed.

!!! tip
You can add new data into an existing index by calling
[AsyncTable.optimize][lancedb.table.AsyncTable.optimize].
"""
self._inner.fast_search()
return self

def postfilter(self) -> Self:
"""
If this is called then filtering will happen after the search instead of
before.
By default filtering will be performed before the search. This is how
filtering is typically understood to work. This prefilter step does add some
additional latency. Creating a scalar index on the filter column(s) can
often improve this latency. However, sometimes a filter is too complex or
scalar indices cannot be applied to the column. In these cases postfiltering
can be used instead of prefiltering to improve latency.
Post filtering applies the filter to the results of the search. This
means we only run the filter on a much smaller set of data. However, it can
cause the query to return fewer than `limit` results (or even no results) if
none of the nearest results match the filter.
Post filtering happens during the "refine stage" (described in more detail in
@see {@link VectorQuery#refineFactor}). This means that setting a higher refine
factor can often help restore some of the results lost by post filtering.
"""
self._inner.postfilter()
return self
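Everything from where() down moved verbatim from AsyncQueryBase into this new subclass, so take queries (which support neither filters nor limits) no longer inherit them. A sketch of the chainable API on a standard query (data is hypothetical):

import asyncio
import lancedb

async def main():
    conn = await lancedb.connect_async("/tmp/sample-lancedb")
    tbl = await conn.create_table(
        "t", data=[{"x": i, "vector": [float(i), 0.0]} for i in range(100)]
    )
    rows = await tbl.query().where("x > 10").limit(5).offset(2).to_list()
    hits = await (
        tbl.query()
        .nearest_to([1.0, 0.0])
        .where("x > 50")
        .postfilter()  # filter after the vector search instead of before
        .limit(5)
        .to_list()
    )

asyncio.run(main())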

class AsyncQuery(AsyncStandardQuery):
def __init__(self, inner: LanceQuery):
"""
Construct an AsyncQuery
@@ -2588,7 +2619,7 @@ class AsyncQuery(AsyncQueryBase):
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))


class AsyncFTSQuery(AsyncQueryBase):
class AsyncFTSQuery(AsyncStandardQuery):
"""A query for full text search for LanceDB."""

def __init__(self, inner: LanceFTSQuery):
@@ -2867,7 +2898,7 @@ class AsyncVectorQueryBase:
return self


class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
class AsyncVectorQuery(AsyncStandardQuery, AsyncVectorQueryBase):
def __init__(self, inner: LanceVectorQuery):
"""
Construct an AsyncVectorQuery
@@ -2950,7 +2981,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)


class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
class AsyncHybridQuery(AsyncStandardQuery, AsyncVectorQueryBase):
"""
A query builder that performs hybrid vector and full text search.
Results are combined and reranked based on the specified reranker.
@@ -3054,7 +3085,7 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
CoalesceBatchesExec: target_batch_size=1024
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...
<BLANKLINE>
@@ -3102,3 +3133,252 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
results.append(await self._inner.to_fts_query().analyze_plan())

return "\n".join(results)


class AsyncTakeQuery(AsyncQueryBase):
"""
Builder for parameterizing and executing take queries.
"""

def __init__(self, inner: LanceTakeQuery):
super().__init__(inner)


class BaseQueryBuilder(object):
"""
Wraps AsyncQueryBase and provides a synchronous interface
"""

def __init__(self, inner: AsyncQueryBase):
self._inner = inner

def to_query_object(self) -> Query:
return self._inner.to_query_object()

def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
"""
Return only the specified columns.

By default a query will return all columns from the table. However, this can
have a very significant impact on latency. LanceDb stores data in a columnar
fashion. This
means we can finely tune our I/O to select exactly the columns we need.

As a best practice you should always limit queries to the columns that you need.
If you pass in a list of column names then only those columns will be
returned.

You can also use this method to create new "dynamic" columns based on your
existing columns. For example, you may not care about "a" or "b" but instead
simply want "a + b". This is often seen in the SELECT clause of an SQL query
(e.g. `SELECT a+b FROM my_table`).

To create dynamic columns you can pass in a dict[str, str]. A column will be
returned for each entry in the map. The key provides the name of the column.
The value is an SQL string used to specify how the column is calculated.

For example, an SQL query might state `SELECT a + b AS combined, c`. The
equivalent input to this method would be `{"combined": "a + b", "c": "c"}`.

Columns will always be returned in the order given, even if that order is
different than the order used when adding the data.
"""
self._inner.select(columns)
return self

def with_row_id(self) -> Self:
"""
Include the _rowid column in the results.
"""
self._inner.with_row_id()
return self

def to_batches(
self,
*,
max_batch_length: Optional[int] = None,
timeout: Optional[timedelta] = None,
) -> pa.RecordBatchReader:
"""
Execute the query and return the results as an Apache Arrow RecordBatchReader.

Parameters
----------

max_batch_length: Optional[int]
The maximum number of selected records in a single RecordBatch object.
If not specified, a default batch length is used.
It is possible for batches to be smaller than the provided length if the
underlying data is stored in smaller chunks.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
async_iter = LOOP.run(self._inner.execute(max_batch_length, timeout))

def iter_sync():
try:
while True:
yield LOOP.run(async_iter.__anext__())
except StopAsyncIteration:
return

return pa.RecordBatchReader.from_batches(async_iter.schema, iter_sync())
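The sync-over-async bridge above pulls batches from the background loop one at a time. The same pattern in isolation, using a plain asyncio loop instead of lancedb's background LOOP (everything here is a self-contained illustration, not lancedb API):

import asyncio
import pyarrow as pa

async def agen():  # stand-in for the async batch stream
    for i in range(3):
        yield pa.RecordBatch.from_pydict({"x": [i]})

loop = asyncio.new_event_loop()
ait = agen()

def iter_sync():  # drive the async iterator from synchronous code
    try:
        while True:
            yield loop.run_until_complete(ait.__anext__())
    except StopAsyncIteration:
        return

schema = pa.schema([("x", pa.int64())])
reader = pa.RecordBatchReader.from_batches(schema, iter_sync())
print(reader.read_all())  # a 3-row table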
def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
"""
Execute the query and collect the results into an Apache Arrow Table.

This method will collect all results into memory before returning. If
you expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches]

Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return LOOP.run(self._inner.to_arrow(timeout))

def to_list(self, timeout: Optional[timedelta] = None) -> List[dict]:
"""
Execute the query and return the results as a list of dictionaries.

Each list entry is a dictionary with the selected column names as keys,
or all table columns if `select` is not called. The vector and the "_distance"
fields are returned whether or not they're explicitly selected.

Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return LOOP.run(self._inner.to_list(timeout))

def to_pandas(
self,
flatten: Optional[Union[int, bool]] = None,
timeout: Optional[timedelta] = None,
) -> "pd.DataFrame":
"""
Execute the query and collect the results into a pandas DataFrame.

This method will collect all results into memory before returning. If you
expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
pandas separately.

Examples
--------

>>> import asyncio
>>> from lancedb import connect_async
>>> async def doctest_example():
...     conn = await connect_async("./.lancedb")
...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
...     async for batch in await table.query().to_batches():
...         batch_df = batch.to_pandas()
>>> asyncio.run(doctest_example())

Parameters
----------
flatten: Optional[Union[int, bool]]
If flatten is True, flatten all nested columns.
If flatten is an integer, flatten the nested columns up to the
specified depth.
If unspecified, do not flatten the nested columns.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return LOOP.run(self._inner.to_pandas(flatten, timeout))

def to_polars(
self,
timeout: Optional[timedelta] = None,
) -> "pl.DataFrame":
"""
Execute the query and collect the results into a Polars DataFrame.

This method will collect all results into memory before returning. If you
expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
polars separately.

Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.

Examples
--------

>>> import asyncio
>>> import polars as pl
>>> from lancedb import connect_async
>>> async def doctest_example():
...     conn = await connect_async("./.lancedb")
...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
...     async for batch in await table.query().to_batches():
...         batch_df = pl.from_arrow(batch)
>>> asyncio.run(doctest_example())
"""
return LOOP.run(self._inner.to_polars(timeout))

def explain_plan(self, verbose: Optional[bool] = False):
"""Return the execution plan for this query.

Examples
--------
>>> import asyncio
>>> from lancedb import connect_async
>>> async def doctest_example():
...     conn = await connect_async("./.lancedb")
...     table = await conn.create_table("my_table", [{"vector": [99, 99]}])
...     query = [100, 100]
...     plan = await table.query().nearest_to([1, 2]).explain_plan(True)
...     print(plan)
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...

Parameters
----------
verbose : bool, default False
Use a verbose output format.

Returns
-------
plan : str
""" # noqa: E501
return LOOP.run(self._inner.explain_plan(verbose))

def analyze_plan(self):
"""Execute the query and display with runtime metrics.

Returns
-------
plan : str
"""
return LOOP.run(self._inner.analyze_plan())


class LanceTakeQueryBuilder(BaseQueryBuilder):
"""
Builder for parameterizing and executing take queries.
"""

def __init__(self, inner: AsyncTakeQuery):
super().__init__(inner)
@@ -118,6 +118,7 @@ class ClientConfig:
retry_config: RetryConfig = field(default_factory=RetryConfig)
timeout_config: Optional[TimeoutConfig] = field(default_factory=TimeoutConfig)
extra_headers: Optional[dict] = None
id_delimiter: Optional[str] = None

def __post_init__(self):
if isinstance(self.retry_config, dict):

@@ -96,14 +96,73 @@ class RemoteDBConnection(DBConnection):
def __repr__(self) -> str:
return f"RemoteConnect(name={self.db_name})"

@override
def list_namespaces(
self,
namespace: List[str] = [],
page_token: Optional[str] = None,
limit: int = 10,
) -> Iterable[str]:
"""List immediate child namespace names in the given namespace.

Parameters
----------
namespace: List[str], optional
The parent namespace to list namespaces in.
None or empty list represents root namespace.
page_token: str, optional
The token to use for pagination. If not present, start from the beginning.
limit: int, default 10
The size of the page to return.

Returns
-------
Iterable of str
List of immediate child namespace names
"""
return LOOP.run(
self._conn.list_namespaces(
namespace=namespace, page_token=page_token, limit=limit
)
)

@override
def create_namespace(self, namespace: List[str]) -> None:
"""Create a new namespace.

Parameters
----------
namespace: List[str]
The namespace identifier to create.
"""
LOOP.run(self._conn.create_namespace(namespace=namespace))

@override
def drop_namespace(self, namespace: List[str]) -> None:
"""Drop a namespace.

Parameters
----------
namespace: List[str]
The namespace identifier to drop.
"""
return LOOP.run(self._conn.drop_namespace(namespace=namespace))

@override
def table_names(
self, page_token: Optional[str] = None, limit: int = 10
self,
page_token: Optional[str] = None,
limit: int = 10,
*,
namespace: List[str] = [],
) -> Iterable[str]:
"""List the names of all tables in the database.

Parameters
----------
namespace: List[str], default []
The namespace to list tables in.
Empty list represents root namespace.
page_token: str
The last token to start the new page.
limit: int, default 10
@@ -113,13 +172,18 @@ class RemoteDBConnection(DBConnection):
-------
An iterator of table names.
"""
return LOOP.run(self._conn.table_names(start_after=page_token, limit=limit))
return LOOP.run(
self._conn.table_names(
namespace=namespace, start_after=page_token, limit=limit
)
)

@override
def open_table(
self,
name: str,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str]] = None,
index_cache_size: Optional[int] = None,
) -> Table:
@@ -129,6 +193,9 @@ class RemoteDBConnection(DBConnection):
----------
name: str
The name of the table.
namespace: List[str], optional
The namespace to open the table from.
None or empty list represents root namespace.

Returns
-------
@@ -142,7 +209,7 @@ class RemoteDBConnection(DBConnection):
" (there is no local cache to configure)"
)

table = LOOP.run(self._conn.open_table(name))
table = LOOP.run(self._conn.open_table(name, namespace=namespace))
return RemoteTable(table, self.db_name)

@override
@@ -155,6 +222,8 @@ class RemoteDBConnection(DBConnection):
fill_value: float = 0.0,
mode: Optional[str] = None,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
*,
namespace: List[str] = [],
) -> Table:
"""Create a [Table][lancedb.table.Table] in the database.

@@ -162,6 +231,9 @@ class RemoteDBConnection(DBConnection):
----------
name: str
The name of the table.
namespace: List[str], optional
The namespace to create the table in.
None or empty list represents root namespace.
data: The data to initialize the table, *optional*
User must provide at least one of `data` or `schema`.
Acceptable types are:
@@ -262,6 +334,7 @@ class RemoteDBConnection(DBConnection):
self._conn.create_table(
name,
data,
namespace=namespace,
mode=mode,
schema=schema,
on_bad_vectors=on_bad_vectors,
@@ -271,18 +344,27 @@ class RemoteDBConnection(DBConnection):
return RemoteTable(table, self.db_name)

@override
def drop_table(self, name: str):
def drop_table(self, name: str, namespace: List[str] = []):
"""Drop a table from the database.

Parameters
----------
name: str
The name of the table.
namespace: List[str], optional
The namespace to drop the table from.
None or empty list represents root namespace.
"""
LOOP.run(self._conn.drop_table(name))
LOOP.run(self._conn.drop_table(name, namespace=namespace))

@override
def rename_table(self, cur_name: str, new_name: str):
def rename_table(
self,
cur_name: str,
new_name: str,
cur_namespace: List[str] = [],
new_namespace: List[str] = [],
):
"""Rename a table in the database.

Parameters
@@ -292,7 +374,14 @@ class RemoteDBConnection(DBConnection):
new_name: str
The new name of the table.
"""
LOOP.run(self._conn.rename_table(cur_name, new_name))
LOOP.run(
self._conn.rename_table(
cur_name,
new_name,
cur_namespace=cur_namespace,
new_namespace=new_namespace,
)
)

async def close(self):
"""Close the connection to the database."""
@@ -26,7 +26,7 @@ from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
from lancedb.merge import LanceMergeInsertBuilder
from lancedb.embeddings import EmbeddingFunctionRegistry

from ..query import LanceVectorQueryBuilder, LanceQueryBuilder
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags


@@ -115,6 +115,7 @@ class RemoteTable(Table):
*,
replace: bool = False,
wait_timeout: timedelta = None,
name: Optional[str] = None,
):
"""Creates a scalar index
Parameters
@@ -139,7 +140,11 @@ class RemoteTable(Table):

LOOP.run(
self._table.create_index(
column, config=config, replace=replace, wait_timeout=wait_timeout
column,
config=config,
replace=replace,
wait_timeout=wait_timeout,
name=name,
)
)

@@ -161,6 +166,7 @@ class RemoteTable(Table):
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
name: Optional[str] = None,
):
config = FTS(
with_position=with_position,
@@ -177,7 +183,11 @@ class RemoteTable(Table):
)
LOOP.run(
self._table.create_index(
column, config=config, replace=replace, wait_timeout=wait_timeout
column,
config=config,
replace=replace,
wait_timeout=wait_timeout,
name=name,
)
)

@@ -194,6 +204,8 @@ class RemoteTable(Table):
wait_timeout: Optional[timedelta] = None,
*,
num_bits: int = 8,
name: Optional[str] = None,
train: bool = True,
):
"""Create an index on the table.
Currently, the only parameters that matter are
@@ -270,7 +282,11 @@ class RemoteTable(Table):

LOOP.run(
self._table.create_index(
vector_column_name, config=config, wait_timeout=wait_timeout
vector_column_name,
config=config,
wait_timeout=wait_timeout,
name=name,
train=train,
)
)

@@ -617,6 +633,12 @@ class RemoteTable(Table):
def stats(self):
return LOOP.run(self._table.stats())

def take_offsets(self, offsets: list[int]) -> LanceTakeQueryBuilder:
return LanceTakeQueryBuilder(self._table.take_offsets(offsets))

def take_row_ids(self, row_ids: list[int]) -> LanceTakeQueryBuilder:
return LanceTakeQueryBuilder(self._table.take_row_ids(row_ids))
def uses_v2_manifest_paths(self) -> bool:
|
||||
raise NotImplementedError(
|
||||
"uses_v2_manifest_paths() is not supported on the LanceDB Cloud"
|
||||
|
||||
@@ -51,6 +51,7 @@ from .query import (
|
||||
AsyncFTSQuery,
|
||||
AsyncHybridQuery,
|
||||
AsyncQuery,
|
||||
AsyncTakeQuery,
|
||||
AsyncVectorQuery,
|
||||
FullTextQuery,
|
||||
LanceEmptyQueryBuilder,
|
||||
@@ -58,6 +59,7 @@ from .query import (
|
||||
LanceHybridQueryBuilder,
|
||||
LanceQueryBuilder,
|
||||
LanceVectorQueryBuilder,
|
||||
LanceTakeQueryBuilder,
|
||||
Query,
|
||||
)
|
||||
from .util import (
|
||||
@@ -687,6 +689,8 @@ class Table(ABC):
|
||||
sample_rate: int = 256,
|
||||
m: int = 20,
|
||||
ef_construction: int = 300,
|
||||
name: Optional[str] = None,
|
||||
train: bool = True,
|
||||
):
|
||||
"""Create an index on the table.
|
||||
|
||||
@@ -719,6 +723,11 @@ class Table(ABC):
|
||||
Only 4 and 8 are supported.
|
||||
wait_timeout: timedelta, optional
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
The name of the index. If not provided, a default name will be generated.
|
||||
train: bool, default True
|
||||
Whether to train the index with existing data. Vector indices always train
|
||||
with existing data.
|
||||
"""
|
||||
raise NotImplementedError
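The new name and train parameters are documented above but easy to miss in the diff. A hedged sketch of the intended call shape on the synchronous API; the table contents and metric are illustrative, and very small tables may need the index partitioning defaults tuned down:

    import lancedb

    db = lancedb.connect("/tmp/demo-db")
    tbl = db.create_table("vectors", [{"vector": [float(i), float(i + 1)]} for i in range(512)])

    # Name the vector index explicitly instead of accepting the generated default;
    # train=True (the default) trains it on the rows already in the table.
    tbl.create_index(metric="cosine", vector_column_name="vector", name="vector_cosine_idx", train=True)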
@@ -774,6 +783,7 @@ class Table(ABC):
replace: bool = True,
index_type: ScalarIndexType = "BTREE",
wait_timeout: Optional[timedelta] = None,
name: Optional[str] = None,
):
"""Create a scalar index on a column.

@@ -788,6 +798,8 @@ class Table(ABC):
The type of index to create.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
name: str, optional
The name of the index. If not provided, a default name will be generated.
Examples
--------

@@ -850,6 +862,7 @@ class Table(ABC):
ngram_max_length: int = 3,
prefix_only: bool = False,
wait_timeout: Optional[timedelta] = None,
name: Optional[str] = None,
):
"""Create a full-text search index on the table.

@@ -914,6 +927,8 @@ class Table(ABC):
Whether to only index the prefix of the token for ngram tokenizer.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
name: str, optional
The name of the index. If not provided, a default name will be generated.
"""
raise NotImplementedError

@@ -1103,6 +1118,120 @@ class Table(ABC):
"""
raise NotImplementedError

@abstractmethod
def take_offsets(
self, offsets: list[int], *, with_row_id: bool = False
) -> LanceTakeQueryBuilder:
"""
Take a list of offsets from the table.

Offsets are 0-indexed and relative to the current version of the table. Offsets
are not stable. A row with an offset of N may have a different offset in a
different version of the table (e.g. if an earlier row is deleted).

Offsets are mostly useful for sampling as the set of all valid offsets is easily
known in advance to be [0, len(table)).

No guarantees are made regarding the order in which results are returned. If
you desire an output order that matches the order of the given offsets, you will
need to add the row offset column to the output and align it yourself.

Parameters
----------
offsets: list[int]
The offsets to take.

Returns
-------
LanceTakeQueryBuilder
A query builder that can be executed to retrieve the rows at the given offsets.
"""
def __getitems__(self, offsets: list[int]) -> pa.RecordBatch:
"""
Take a list of offsets from the table and return as a record batch.

This method uses the `take_offsets` method to take the rows. However, it
aligns the returned rows to the passed-in offsets. This means the return type
is a record batch (and so users should take care not to pass in too many
offsets).

Note: this method is primarily intended to fulfill the Dataset contract
for pytorch.

Parameters
----------
offsets: list[int]
The offsets to take.

Returns
-------
pa.RecordBatch
A record batch containing the rows at the given offsets.
"""
# We don't know the order of the results at all. So we calculate a permutation
# for ordering the given offsets. Then we load the data with the _rowoffset
# column. Then we sort by _rowoffset and apply the inverse of the permutation
# that we calculated.
#
# Note: this is potentially a lot of memory copy if we're operating on large
# batches :(
num_offsets = len(offsets)
indices = list(range(num_offsets))
permutation = sorted(indices, key=lambda idx: offsets[idx])
permutation_inv = [0] * num_offsets
for i in range(num_offsets):
permutation_inv[permutation[i]] = i

columns = self.schema.names
columns.append("_rowoffset")
tbl = (
self.take_offsets(offsets)
.select(columns)
.to_arrow()
.sort_by("_rowoffset")
.take(permutation_inv)
.combine_chunks()
.drop_columns(["_rowoffset"])
)

return tbl
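To see why sorting plus the inverse permutation restores the requested order, here is the same bookkeeping on plain lists (illustrative values only):

    offsets = [42, 7, 19]
    permutation = sorted(range(len(offsets)), key=lambda i: offsets[i])  # [1, 2, 0]
    permutation_inv = [0] * len(offsets)
    for i, p in enumerate(permutation):
        permutation_inv[p] = i  # [2, 0, 1]

    sorted_offsets = sorted(offsets)  # [7, 19, 42], the order after sort_by("_rowoffset")
    restored = [sorted_offsets[i] for i in permutation_inv]
    assert restored == offsets  # sort + inverse permutation round-trips the order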
@abstractmethod
def take_row_ids(
self, row_ids: list[int], *, with_row_id: bool = False
) -> LanceTakeQueryBuilder:
"""
Take a list of row ids from the table.

Row ids are not stable and are relative to the current version of the table.
They can change due to compaction and updates.

No guarantees are made regarding the order in which results are returned. If
you desire an output order that matches the order of the given ids, you will
need to add the row id column to the output and align it yourself.

Unlike offsets, row ids are not 0-indexed and no assumptions should be made
about the possible range of row ids. In order to use this method you must
first obtain the row ids by scanning or searching the table.

Even so, row ids are more stable than offsets and can be useful in some
situations.

There is an ongoing effort to make row ids stable which is tracked at
https://github.com/lancedb/lancedb/issues/1120

Parameters
----------
row_ids: list[int]
The row ids to take.

Returns
-------
LanceTakeQueryBuilder
A query builder that can be executed to get the rows.
"""
@abstractmethod
def _execute_query(
self,
@@ -1577,13 +1706,16 @@ class LanceTable(Table):
connection: "LanceDBConnection",
name: str,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str]] = None,
index_cache_size: Optional[int] = None,
):
self._conn = connection
self._namespace = namespace
self._table = LOOP.run(
connection._conn.open_table(
name,
namespace=namespace,
storage_options=storage_options,
index_cache_size=index_cache_size,
)
@@ -1594,8 +1726,8 @@ class LanceTable(Table):
return self._table.name

@classmethod
def open(cls, db, name, **kwargs):
tbl = cls(db, name, **kwargs)
def open(cls, db, name, *, namespace: List[str] = [], **kwargs):
tbl = cls(db, name, namespace=namespace, **kwargs)

# check the dataset exists
try:
@@ -1648,6 +1780,12 @@ class LanceTable(Table):
"""Get the current version of the table"""
return LOOP.run(self._table.version())

def take_offsets(self, offsets: list[int]) -> LanceTakeQueryBuilder:
return LanceTakeQueryBuilder(self._table.take_offsets(offsets))

def take_row_ids(self, row_ids: list[int]) -> LanceTakeQueryBuilder:
return LanceTakeQueryBuilder(self._table.take_row_ids(row_ids))

@property
def tags(self) -> Tags:
"""Tag management for the table.
@@ -1861,6 +1999,9 @@ class LanceTable(Table):
sample_rate: int = 256,
m: int = 20,
ef_construction: int = 300,
*,
name: Optional[str] = None,
train: bool = True,
):
"""Create an index on the table."""
if accelerator is not None:
@@ -1924,6 +2065,8 @@ class LanceTable(Table):
vector_column_name,
replace=replace,
config=config,
name=name,
train=train,
)
)

@@ -1968,6 +2111,7 @@ class LanceTable(Table):
*,
replace: bool = True,
index_type: ScalarIndexType = "BTREE",
name: Optional[str] = None,
):
if index_type == "BTREE":
config = BTree()
@@ -1978,7 +2122,7 @@ class LanceTable(Table):
else:
raise ValueError(f"Unknown index type {index_type}")
return LOOP.run(
self._table.create_index(column, replace=replace, config=config)
self._table.create_index(column, replace=replace, config=config, name=name)
)

def create_fts_index(
@@ -2002,6 +2146,7 @@ class LanceTable(Table):
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
name: Optional[str] = None,
):
if not use_tantivy:
if not isinstance(field_names, str):
@@ -2039,6 +2184,7 @@ class LanceTable(Table):
field_names,
replace=replace,
config=config,
name=name,
)
)
return
@@ -2405,6 +2551,7 @@ class LanceTable(Table):
fill_value: float = 0.0,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
*,
namespace: List[str] = [],
storage_options: Optional[Dict[str, str | bool]] = None,
data_storage_version: Optional[str] = None,
enable_v2_manifest_paths: Optional[bool] = None,
@@ -2464,6 +2611,7 @@ class LanceTable(Table):
"""
self = cls.__new__(cls)
self._conn = db
self._namespace = namespace

if data_storage_version is not None:
warnings.warn(
@@ -2496,6 +2644,7 @@ class LanceTable(Table):
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
embedding_functions=embedding_functions,
namespace=namespace,
storage_options=storage_options,
)
)
@@ -3183,6 +3332,8 @@ class AsyncTable:
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
] = None,
wait_timeout: Optional[timedelta] = None,
name: Optional[str] = None,
train: bool = True,
):
"""Create an index to speed up queries

@@ -3209,6 +3360,11 @@ class AsyncTable:
creating an index object.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
name: str, optional
The name of the index. If not provided, a default name will be generated.
train: bool, default True
Whether to train the index with existing data. Vector indices always train
with existing data.
"""
if config is not None:
if not isinstance(
@@ -3220,7 +3376,12 @@ class AsyncTable:
)
try:
await self._inner.create_index(
column, index=config, replace=replace, wait_timeout=wait_timeout
column,
index=config,
replace=replace,
wait_timeout=wait_timeout,
name=name,
train=train,
)
except ValueError as e:
if "not support the requested language" in str(e):
@@ -4030,6 +4191,58 @@ class AsyncTable:
"""
await self._inner.restore(version)

def take_offsets(self, offsets: list[int]) -> AsyncTakeQuery:
"""
Take a list of offsets from the table.

Offsets are 0-indexed and relative to the current version of the table. Offsets
are not stable. A row with an offset of N may have a different offset in a
different version of the table (e.g. if an earlier row is deleted).

Offsets are mostly useful for sampling as the set of all valid offsets is easily
known in advance to be [0, len(table)).

Parameters
----------
offsets: list[int]
The offsets to take.

Returns
-------
AsyncTakeQuery
A query object that can be executed to get the rows at the given offsets.
"""
return AsyncTakeQuery(self._inner.take_offsets(offsets))

def take_row_ids(self, row_ids: list[int]) -> AsyncTakeQuery:
"""
Take a list of row ids from the table.

Row ids are not stable and are relative to the current version of the table.
They can change due to compaction and updates.

Unlike offsets, row ids are not 0-indexed and no assumptions should be made
about the possible range of row ids. In order to use this method you must
first obtain the row ids by scanning or searching the table.

Even so, row ids are more stable than offsets and can be useful in some
situations.

There is an ongoing effort to make row ids stable which is tracked at
https://github.com/lancedb/lancedb/issues/1120

Parameters
----------
row_ids: list[int]
The row ids to take.

Returns
-------
AsyncTakeQuery
A query object that can be executed to get the rows.
"""
return AsyncTakeQuery(self._inner.take_row_ids(row_ids))
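A brief async sketch tying both methods together; the awaited to_arrow() on AsyncTakeQuery is assumed to mirror the other async query classes:

    import asyncio
    import lancedb

    async def sample_rows():
        db = await lancedb.connect_async("/tmp/demo-db")
        tbl = await db.open_table("vectors")
        # Offsets always lie in [0, len(table)), so random sampling is trivial.
        return await tbl.take_offsets([0, 5, 9]).to_arrow()

    batch = asyncio.run(sample_rows())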
@property
def tags(self) -> AsyncTags:
"""Tag management for the dataset.

@@ -175,6 +175,18 @@ def test_table_names(tmp_db: lancedb.DBConnection):
tmp_db.create_table("test3", data=data)
assert tmp_db.table_names() == ["test1", "test2", "test3"]

# Test that positional arguments for page_token and limit still work
result = list(tmp_db.table_names("test1", 1))  # page_token="test1", limit=1
assert result == ["test2"], f"Expected ['test2'], got {result}"

# Test mixed positional and keyword arguments
result = list(tmp_db.table_names("test2", limit=2))
assert result == ["test3"], f"Expected ['test3'], got {result}"

# Test that namespace parameter can be passed as keyword
result = list(tmp_db.table_names(namespace=[]))
assert len(result) == 3


@pytest.mark.asyncio
async def test_table_names_async(tmp_path):
@@ -728,3 +740,93 @@ def test_bypass_vector_index_sync(tmp_db: lancedb.DBConnection):
table.search(sample_key).bypass_vector_index().explain_plan(verbose=True)
)
assert "KNN" in plan_without_index


def test_local_namespace_operations(tmp_path):
"""Test that local mode namespace operations behave as expected."""
# Create a local database connection
db = lancedb.connect(tmp_path)

# Test list_namespaces returns empty list
namespaces = list(db.list_namespaces())
assert namespaces == []

# Test list_namespaces with parameters still returns empty list
namespaces_with_params = list(
db.list_namespaces(namespace=["test"], page_token="token", limit=5)
)
assert namespaces_with_params == []


def test_local_create_namespace_not_supported(tmp_path):
"""Test that create_namespace is not supported in local mode."""
db = lancedb.connect(tmp_path)

with pytest.raises(
NotImplementedError,
match="Namespace operations are not supported for listing database",
):
db.create_namespace(["test_namespace"])


def test_local_drop_namespace_not_supported(tmp_path):
"""Test that drop_namespace is not supported in local mode."""
db = lancedb.connect(tmp_path)

with pytest.raises(
NotImplementedError,
match="Namespace operations are not supported for listing database",
):
db.drop_namespace(["test_namespace"])


def test_local_table_operations_with_namespace_raise_error(tmp_path):
"""
Test that table operations with namespace parameter
raise NotImplementedError in local mode.
"""
db = lancedb.connect(tmp_path)

# Create some test data
data = [{"vector": [1.0, 2.0], "item": "test"}]
schema = pa.schema(
[pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("item", pa.string())]
)

# Test create_table with namespace - should raise NotImplementedError
with pytest.raises(
NotImplementedError,
match="Namespace parameter is not supported for listing database",
):
db.create_table(
"test_table_with_ns", data=data, schema=schema, namespace=["test_ns"]
)

# Create table normally for other tests
db.create_table("test_table", data=data, schema=schema)
assert "test_table" in db.table_names()

# Test open_table with namespace - should raise NotImplementedError
with pytest.raises(
NotImplementedError,
match="Namespace parameter is not supported for listing database",
):
db.open_table("test_table", namespace=["test_ns"])

# Test table_names with namespace - should raise NotImplementedError
with pytest.raises(
NotImplementedError,
match="Namespace parameter is not supported for listing database",
):
list(db.table_names(namespace=["test_ns"]))

# Test drop_table with namespace - should raise NotImplementedError
with pytest.raises(
NotImplementedError,
match="Namespace parameter is not supported for listing database",
):
db.drop_table("test_table", namespace=["test_ns"])

# Test table_names without namespace - should work normally
tables_root = list(db.table_names())
assert "test_table" in tables_root

@@ -157,7 +157,16 @@ def test_create_index_with_stemming(tmp_path, table):
def test_create_inverted_index(table, use_tantivy, with_position):
if use_tantivy and not with_position:
pytest.skip("we don't support building a tantivy index without position")
table.create_fts_index("text", use_tantivy=use_tantivy, with_position=with_position)
table.create_fts_index(
"text",
use_tantivy=use_tantivy,
with_position=with_position,
name="custom_fts_index",
)
if not use_tantivy:
indices = table.list_indices()
fts_indices = [i for i in indices if i.index_type == "FTS"]
assert any(i.name == "custom_fts_index" for i in fts_indices)


def test_populate_index(tmp_path, table):

707  python/python/tests/test_namespace.py  Normal file
@@ -0,0 +1,707 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors

"""Tests for LanceDB namespace integration."""

import tempfile
import shutil
from typing import Dict, Optional
import pytest
import pyarrow as pa
import lancedb
from lance_namespace.namespace import NATIVE_IMPLS, LanceNamespace
from lance_namespace_urllib3_client.models import (
ListTablesRequest,
ListTablesResponse,
DescribeTableRequest,
DescribeTableResponse,
RegisterTableRequest,
RegisterTableResponse,
DeregisterTableRequest,
DeregisterTableResponse,
CreateTableRequest,
CreateTableResponse,
DropTableRequest,
DropTableResponse,
ListNamespacesRequest,
ListNamespacesResponse,
CreateNamespaceRequest,
CreateNamespaceResponse,
DropNamespaceRequest,
DropNamespaceResponse,
)


class TempNamespace(LanceNamespace):
"""A simple dictionary-backed namespace for testing."""

# Class-level storage to persist table registry across instances
_global_registry: Dict[str, Dict[str, str]] = {}
# Class-level storage for namespaces (supporting 1-level namespace)
_global_namespaces: Dict[str, set] = {}

def __init__(self, **properties):
"""Initialize the test namespace.

Args:
root: The root directory for tables (optional)
**properties: Additional configuration properties
"""
self.config = TempNamespaceConfig(properties)
# Use the root as a key to maintain separate registries per root
root = self.config.root
if root not in self._global_registry:
self._global_registry[root] = {}
if root not in self._global_namespaces:
self._global_namespaces[root] = set()
self.tables = self._global_registry[root]  # Reference to shared registry
self.namespaces = self._global_namespaces[
root
]  # Reference to shared namespaces

def list_tables(self, request: ListTablesRequest) -> ListTablesResponse:
"""List all tables in the namespace."""
if not request.id:
# List all tables in root namespace
tables = [name for name in self.tables.keys() if "." not in name]
else:
# List tables in specific namespace (1-level only)
if len(request.id) == 1:
namespace_name = request.id[0]
prefix = f"{namespace_name}."
tables = [
name[len(prefix) :]
for name in self.tables.keys()
if name.startswith(prefix)
]
else:
# Multi-level namespaces not supported
raise ValueError("Only 1-level namespaces are supported")
return ListTablesResponse(tables=tables)

def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
"""Describe a table by returning its location."""
if not request.id:
raise ValueError("Invalid table ID")

if len(request.id) == 1:
# Root namespace table
table_name = request.id[0]
elif len(request.id) == 2:
# Namespaced table (1-level namespace)
namespace_name, table_name = request.id
table_name = f"{namespace_name}.{table_name}"
else:
raise ValueError("Only 1-level namespaces are supported")

if table_name not in self.tables:
raise RuntimeError(f"Table does not exist: {table_name}")

table_uri = self.tables[table_name]
return DescribeTableResponse(location=table_uri)

def create_table(
self, request: CreateTableRequest, request_data: bytes
) -> CreateTableResponse:
"""Create a table in the namespace."""
if not request.id:
raise ValueError("Invalid table ID")

if len(request.id) == 1:
# Root namespace table
table_name = request.id[0]
table_uri = f"{self.config.root}/{table_name}.lance"
elif len(request.id) == 2:
# Namespaced table (1-level namespace)
namespace_name, base_table_name = request.id
# Add namespace to our namespace set
self.namespaces.add(namespace_name)
table_name = f"{namespace_name}.{base_table_name}"
table_uri = f"{self.config.root}/{namespace_name}/{base_table_name}.lance"
else:
raise ValueError("Only 1-level namespaces are supported")

# Check if table already exists
if table_name in self.tables:
if request.mode == "overwrite":
# Drop existing table for overwrite mode
del self.tables[table_name]
else:
raise RuntimeError(f"Table already exists: {table_name}")

# Parse the Arrow IPC stream to get the schema and create the actual table
import pyarrow.ipc as ipc
import io
import lance
import os

# Create directory if needed for namespaced tables
os.makedirs(os.path.dirname(table_uri), exist_ok=True)

# Read the IPC stream
reader = ipc.open_stream(io.BytesIO(request_data))
table = reader.read_all()

# Create the actual Lance table
lance.write_dataset(table, table_uri)

# Store the table mapping
self.tables[table_name] = table_uri

return CreateTableResponse(location=table_uri)

def drop_table(self, request: DropTableRequest) -> DropTableResponse:
"""Drop a table from the namespace."""
if not request.id:
raise ValueError("Invalid table ID")

if len(request.id) == 1:
# Root namespace table
table_name = request.id[0]
elif len(request.id) == 2:
# Namespaced table (1-level namespace)
namespace_name, base_table_name = request.id
table_name = f"{namespace_name}.{base_table_name}"
else:
raise ValueError("Only 1-level namespaces are supported")

if table_name not in self.tables:
raise RuntimeError(f"Table does not exist: {table_name}")

# Get the table URI
table_uri = self.tables[table_name]

# Delete the actual table files
import shutil
import os

if os.path.exists(table_uri):
shutil.rmtree(table_uri, ignore_errors=True)

# Remove from registry
del self.tables[table_name]

return DropTableResponse()

def register_table(self, request: RegisterTableRequest) -> RegisterTableResponse:
"""Register a table with the namespace."""
if not request.id or len(request.id) != 1:
raise ValueError("Invalid table ID")

if not request.location:
raise ValueError("Table location is required")

table_name = request.id[0]
self.tables[table_name] = request.location

return RegisterTableResponse()

def deregister_table(
self, request: DeregisterTableRequest
) -> DeregisterTableResponse:
"""Deregister a table from the namespace."""
if not request.id or len(request.id) != 1:
raise ValueError("Invalid table ID")

table_name = request.id[0]
if table_name not in self.tables:
raise RuntimeError(f"Table does not exist: {table_name}")

del self.tables[table_name]
return DeregisterTableResponse()

def list_namespaces(self, request: ListNamespacesRequest) -> ListNamespacesResponse:
"""List child namespaces."""
if not request.id:
# List root-level namespaces
namespaces = list(self.namespaces)
elif len(request.id) == 1:
# For 1-level namespace, there are no child namespaces
namespaces = []
else:
raise ValueError("Only 1-level namespaces are supported")

return ListNamespacesResponse(namespaces=namespaces)

def create_namespace(
self, request: CreateNamespaceRequest
) -> CreateNamespaceResponse:
"""Create a namespace."""
if not request.id:
raise ValueError("Invalid namespace ID")

if len(request.id) == 1:
# Create 1-level namespace
namespace_name = request.id[0]
self.namespaces.add(namespace_name)

# Create directory for the namespace
import os

namespace_dir = f"{self.config.root}/{namespace_name}"
os.makedirs(namespace_dir, exist_ok=True)
else:
raise ValueError("Only 1-level namespaces are supported")

return CreateNamespaceResponse()

def drop_namespace(self, request: DropNamespaceRequest) -> DropNamespaceResponse:
"""Drop a namespace."""
if not request.id:
raise ValueError("Invalid namespace ID")

if len(request.id) == 1:
# Drop 1-level namespace
namespace_name = request.id[0]

if namespace_name not in self.namespaces:
raise RuntimeError(f"Namespace does not exist: {namespace_name}")

# Check if namespace has any tables
prefix = f"{namespace_name}."
tables_in_namespace = [
name for name in self.tables.keys() if name.startswith(prefix)
]
if tables_in_namespace:
raise RuntimeError(
f"Cannot drop namespace '{namespace_name}': contains tables"
)

# Remove namespace
self.namespaces.remove(namespace_name)

# Remove directory
import shutil
import os

namespace_dir = f"{self.config.root}/{namespace_name}"
if os.path.exists(namespace_dir):
shutil.rmtree(namespace_dir, ignore_errors=True)
else:
raise ValueError("Only 1-level namespaces are supported")

return DropNamespaceResponse()


class TempNamespaceConfig:
"""Configuration for TempNamespace."""

ROOT = "root"

def __init__(self, properties: Optional[Dict[str, str]] = None):
"""Initialize configuration from properties.

Args:
properties: Dictionary of configuration properties
"""
if properties is None:
properties = {}

self._root = properties.get(self.ROOT, "/tmp")

@property
def root(self) -> str:
"""Get the namespace root directory."""
return self._root


NATIVE_IMPLS["temp"] = f"{TempNamespace.__module__}.TempNamespace"
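Registering the dotted class path under the "temp" key is what lets the tests below resolve this implementation; a one-line sketch of the lookup that connect_namespace is assumed to perform:

    # connect_namespace("temp", {...}) imports TempNamespace from the path stored
    # in NATIVE_IMPLS and instantiates it with the given properties.
    db = lancedb.connect_namespace("temp", {"root": "/tmp/ns-demo"})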
class TestNamespaceConnection:
"""Test namespace-based LanceDB connection."""

def setup_method(self):
"""Set up test fixtures."""
self.temp_dir = tempfile.mkdtemp()
# Clear the TempNamespace registry for this test
if self.temp_dir in TempNamespace._global_registry:
TempNamespace._global_registry[self.temp_dir].clear()
if self.temp_dir in TempNamespace._global_namespaces:
TempNamespace._global_namespaces[self.temp_dir].clear()

def teardown_method(self):
"""Clean up test fixtures."""
# Clear the TempNamespace registry
if self.temp_dir in TempNamespace._global_registry:
del TempNamespace._global_registry[self.temp_dir]
if self.temp_dir in TempNamespace._global_namespaces:
del TempNamespace._global_namespaces[self.temp_dir]
shutil.rmtree(self.temp_dir, ignore_errors=True)

def test_connect_namespace_test(self):
"""Test connecting to LanceDB through TempNamespace."""
# Connect using TempNamespace
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Should be a LanceNamespaceDBConnection
assert isinstance(db, lancedb.LanceNamespaceDBConnection)

# Initially no tables
assert len(list(db.table_names())) == 0

def test_create_table_through_namespace(self):
"""Test creating a table through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Define schema for empty table
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("text", pa.string()),
]
)

# Create empty table
table = db.create_table("test_table", schema=schema)
assert table is not None
assert table.name == "test_table"

# Table should appear in namespace
table_names = list(db.table_names())
assert "test_table" in table_names
assert len(table_names) == 1

# Verify empty table
result = table.to_pandas()
assert len(result) == 0
assert list(result.columns) == ["id", "vector", "text"]

def test_open_table_through_namespace(self):
"""Test opening an existing table through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Create a table with schema
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("test_table", schema=schema)

# Open the table
table = db.open_table("test_table")
assert table is not None
assert table.name == "test_table"

# Verify empty table with correct schema
result = table.to_pandas()
assert len(result) == 0
assert list(result.columns) == ["id", "vector"]

def test_drop_table_through_namespace(self):
"""Test dropping a table through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Create tables
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("table1", schema=schema)
db.create_table("table2", schema=schema)

# Verify both tables exist
table_names = list(db.table_names())
assert "table1" in table_names
assert "table2" in table_names
assert len(table_names) == 2

# Drop one table
db.drop_table("table1")

# Verify only table2 remains
table_names = list(db.table_names())
assert "table1" not in table_names
assert "table2" in table_names
assert len(table_names) == 1

# Test that drop_table works without explicit namespace parameter
db.drop_table("table2")
assert len(list(db.table_names())) == 0

# Should not be able to open dropped table
with pytest.raises(RuntimeError):
db.open_table("table1")

def test_create_table_with_schema(self):
"""Test creating a table with explicit schema through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Define schema
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 3)),
pa.field("text", pa.string()),
]
)

# Create table with schema
table = db.create_table("test_table", schema=schema)
assert table is not None

# Verify schema
table_schema = table.schema
assert len(table_schema) == 3
assert table_schema.field("id").type == pa.int64()
assert table_schema.field("text").type == pa.string()

def test_rename_table_not_supported(self):
"""Test that rename_table raises NotImplementedError."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Create a table
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("old_name", schema=schema)

# Rename should raise NotImplementedError
with pytest.raises(NotImplementedError, match="rename_table is not supported"):
db.rename_table("old_name", "new_name")

def test_drop_all_tables(self):
"""Test dropping all tables through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Create multiple tables
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
for i in range(3):
db.create_table(f"table{i}", schema=schema)

# Verify tables exist
assert len(list(db.table_names())) == 3

# Drop all tables
db.drop_all_tables()

# Verify all tables are gone
assert len(list(db.table_names())) == 0

# Test that table_names works with keyword-only namespace parameter
db.create_table("test_table", schema=schema)
result = list(db.table_names(namespace=[]))
assert "test_table" in result

def test_table_operations(self):
"""Test various table operations through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Create a table with schema
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("text", pa.string()),
]
)
table = db.create_table("test_table", schema=schema)

# Verify empty table was created
result = table.to_pandas()
assert len(result) == 0
assert list(result.columns) == ["id", "vector", "text"]

# Test add data to the table
new_data = [
{"id": 1, "vector": [1.0, 2.0], "text": "item_1"},
{"id": 2, "vector": [2.0, 3.0], "text": "item_2"},
]
table.add(new_data)
result = table.to_pandas()
assert len(result) == 2

# Test delete
table.delete("id = 1")
result = table.to_pandas()
assert len(result) == 1
assert result["id"].values[0] == 2

# Test update
table.update(where="id = 2", values={"text": "updated"})
result = table.to_pandas()
assert result["text"].values[0] == "updated"

def test_storage_options(self):
"""Test passing storage options through namespace connection."""
# Connect with storage options
storage_opts = {"test_option": "test_value"}
db = lancedb.connect_namespace(
"temp", {"root": self.temp_dir}, storage_options=storage_opts
)

# Storage options should be preserved
assert db.storage_options == storage_opts

# Create table with additional storage options
table_opts = {"table_option": "table_value"}
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("test_table", schema=schema, storage_options=table_opts)

def test_namespace_operations(self):
"""Test namespace management operations."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Initially no namespaces
assert len(list(db.list_namespaces())) == 0

# Create a namespace
db.create_namespace(["test_namespace"])

# Verify namespace exists
namespaces = list(db.list_namespaces())
assert "test_namespace" in namespaces
assert len(namespaces) == 1

# Create table in namespace
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
table = db.create_table(
"test_table", schema=schema, namespace=["test_namespace"]
)
assert table is not None

# Verify table exists in namespace
tables_in_namespace = list(db.table_names(namespace=["test_namespace"]))
assert "test_table" in tables_in_namespace
assert len(tables_in_namespace) == 1

# Open table from namespace
table = db.open_table("test_table", namespace=["test_namespace"])
assert table is not None
assert table.name == "test_table"

# Drop table from namespace
db.drop_table("test_table", namespace=["test_namespace"])

# Verify table no longer exists in namespace
tables_in_namespace = list(db.table_names(namespace=["test_namespace"]))
assert len(tables_in_namespace) == 0

# Drop namespace
db.drop_namespace(["test_namespace"])

# Verify namespace no longer exists
namespaces = list(db.list_namespaces())
assert len(namespaces) == 0

def test_namespace_with_tables_cannot_be_dropped(self):
"""Test that namespaces containing tables cannot be dropped."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Create namespace and table
db.create_namespace(["test_namespace"])
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("test_table", schema=schema, namespace=["test_namespace"])

# Try to drop namespace with tables - should fail
with pytest.raises(RuntimeError, match="contains tables"):
db.drop_namespace(["test_namespace"])

# Drop table first
db.drop_table("test_table", namespace=["test_namespace"])

# Now dropping namespace should work
db.drop_namespace(["test_namespace"])

def test_same_table_name_different_namespaces(self):
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})

# Create two namespaces
db.create_namespace(["namespace_a"])
db.create_namespace(["namespace_b"])

# Define schema
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("text", pa.string()),
]
)

# Create table with same name in both namespaces
table_a = db.create_table(
"same_name_table", schema=schema, namespace=["namespace_a"]
)
table_b = db.create_table(
"same_name_table", schema=schema, namespace=["namespace_b"]
)

# Add different data to each table
data_a = [
{"id": 1, "vector": [1.0, 2.0], "text": "data_from_namespace_a"},
{"id": 2, "vector": [3.0, 4.0], "text": "also_from_namespace_a"},
]
table_a.add(data_a)

data_b = [
{"id": 10, "vector": [10.0, 20.0], "text": "data_from_namespace_b"},
{"id": 20, "vector": [30.0, 40.0], "text": "also_from_namespace_b"},
{"id": 30, "vector": [50.0, 60.0], "text": "more_from_namespace_b"},
]
table_b.add(data_b)

# Verify data in namespace_a table
opened_table_a = db.open_table("same_name_table", namespace=["namespace_a"])
result_a = opened_table_a.to_pandas().sort_values("id").reset_index(drop=True)
assert len(result_a) == 2
assert result_a["id"].tolist() == [1, 2]
assert result_a["text"].tolist() == [
"data_from_namespace_a",
"also_from_namespace_a",
]
assert [v.tolist() for v in result_a["vector"]] == [[1.0, 2.0], [3.0, 4.0]]

# Verify data in namespace_b table
opened_table_b = db.open_table("same_name_table", namespace=["namespace_b"])
result_b = opened_table_b.to_pandas().sort_values("id").reset_index(drop=True)
assert len(result_b) == 3
assert result_b["id"].tolist() == [10, 20, 30]
assert result_b["text"].tolist() == [
"data_from_namespace_b",
"also_from_namespace_b",
"more_from_namespace_b",
]
assert [v.tolist() for v in result_b["vector"]] == [
[10.0, 20.0],
[30.0, 40.0],
[50.0, 60.0],
]

# Verify root namespace doesn't have this table
root_tables = list(db.table_names())
assert "same_name_table" not in root_tables

# Clean up
db.drop_table("same_name_table", namespace=["namespace_a"])
db.drop_table("same_name_table", namespace=["namespace_b"])
db.drop_namespace(["namespace_a"])
db.drop_namespace(["namespace_b"])
@@ -5,6 +5,7 @@ from typing import List, Union
import unittest.mock as mock
from datetime import timedelta
from pathlib import Path
import random

import lancedb
from lancedb.db import AsyncConnection
@@ -1327,6 +1328,55 @@ def test_query_timeout(tmp_path):
)


def test_take_queries(tmp_path):
db = lancedb.connect(tmp_path)
data = pa.table(
{
"idx": range(100),
}
)
table = db.create_table("test", data)

# Take by offset
assert list(
sorted(table.take_offsets([5, 2, 17]).to_pandas()["idx"].to_list())
) == [
2,
5,
17,
]

# Take by row id
assert list(
sorted(table.take_row_ids([5, 2, 17]).to_pandas()["idx"].to_list())
) == [
2,
5,
17,
]


def test_getitems(tmp_path):
db = lancedb.connect(tmp_path)
data = pa.table(
{
"idx": range(100),
}
)
# Make two fragments
table = db.create_table("test", data)
table.add(pa.table({"idx": range(100, 200)}))

assert table.__getitems__([5, 2, 117]) == pa.table(
{
"idx": [5, 2, 117],
}
)

offsets = random.sample(range(200), 10)
assert table.__getitems__(offsets) == pa.table({"idx": offsets})


@pytest.mark.asyncio
async def test_query_timeout_async(tmp_path):
db = await lancedb.connect_async(tmp_path)
@@ -271,12 +271,21 @@ def test_table_add_in_threadpool():


def test_table_create_indices():
# Track received index creation requests to validate name parameter
received_requests = []

def handler(request):
index_stats = dict(
index_type="IVF_PQ", num_indexed_rows=1000, num_unindexed_rows=0
)

if request.path == "/v1/table/test/create_index/":
# Capture the request body to validate name parameter
content_len = int(request.headers.get("Content-Length", 0))
if content_len > 0:
body = request.rfile.read(content_len)
body_data = json.loads(body)
received_requests.append(body_data)
request.send_response(200)
request.end_headers()
elif request.path == "/v1/table/test/create/?mode=create":
@@ -307,34 +316,34 @@ def test_table_create_indices():
dict(
indexes=[
{
"index_name": "id_idx",
"index_name": "custom_scalar_idx",
"columns": ["id"],
},
{
"index_name": "text_idx",
"index_name": "custom_fts_idx",
"columns": ["text"],
},
{
"index_name": "vector_idx",
"index_name": "custom_vector_idx",
"columns": ["vector"],
},
]
)
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/id_idx/stats/":
elif request.path == "/v1/table/test/index/custom_scalar_idx/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(index_stats)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/text_idx/stats/":
elif request.path == "/v1/table/test/index/custom_fts_idx/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(index_stats)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/vector_idx/stats/":
elif request.path == "/v1/table/test/index/custom_vector_idx/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
@@ -351,16 +360,49 @@ def test_table_create_indices():
# Parameters are well-tested through local and async tests.
# This is a smoke-test.
table = db.create_table("test", [{"id": 1}])
table.create_scalar_index("id", wait_timeout=timedelta(seconds=2))
table.create_fts_index("text", wait_timeout=timedelta(seconds=2))
table.create_index(
vector_column_name="vector", wait_timeout=timedelta(seconds=10)

# Test create_scalar_index with custom name
table.create_scalar_index(
"id", wait_timeout=timedelta(seconds=2), name="custom_scalar_idx"
)
table.wait_for_index(["id_idx"], timedelta(seconds=2))
table.wait_for_index(["text_idx", "vector_idx"], timedelta(seconds=2))
table.drop_index("vector_idx")
table.drop_index("id_idx")
table.drop_index("text_idx")

# Test create_fts_index with custom name
table.create_fts_index(
"text", wait_timeout=timedelta(seconds=2), name="custom_fts_idx"
)

# Test create_index with custom name
table.create_index(
vector_column_name="vector",
wait_timeout=timedelta(seconds=10),
name="custom_vector_idx",
)

# Validate that the name parameter was passed correctly in requests
assert len(received_requests) == 3

# Check scalar index request has custom name
scalar_req = received_requests[0]
assert "name" in scalar_req
assert scalar_req["name"] == "custom_scalar_idx"

# Check FTS index request has custom name
fts_req = received_requests[1]
assert "name" in fts_req
assert fts_req["name"] == "custom_fts_idx"

# Check vector index request has custom name
vector_req = received_requests[2]
assert "name" in vector_req
assert vector_req["name"] == "custom_vector_idx"

table.wait_for_index(["custom_scalar_idx"], timedelta(seconds=2))
table.wait_for_index(
["custom_fts_idx", "custom_vector_idx"], timedelta(seconds=2)
)
table.drop_index("custom_vector_idx")
table.drop_index("custom_scalar_idx")
table.drop_index("custom_fts_idx")


def test_table_wait_for_index_timeout():
@@ -290,7 +290,7 @@ def test_add_struct(mem_db: DBConnection):
}
)
data = [{"s_list": [{"b": 1, "a": 2}, {"b": 4}]}]
table = mem_db.create_table("test", schema=schema)
table = mem_db.create_table("test2", schema=schema)
table.add(data)


@@ -670,7 +670,9 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
num_sub_vectors=96,
num_bits=4,
)
mock_create_index.assert_called_with("vector", replace=True, config=expected_config)
mock_create_index.assert_called_with(
"vector", replace=True, config=expected_config, name=None, train=True
)

table.create_index(
vector_column_name="my_vector",
@@ -680,7 +682,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
)
expected_config = HnswPq(distance_type="dot")
mock_create_index.assert_called_with(
"my_vector", replace=False, config=expected_config
"my_vector", replace=False, config=expected_config, name=None, train=True
)

table.create_index(
@@ -695,7 +697,44 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
)
mock_create_index.assert_called_with(
"my_vector", replace=True, config=expected_config
"my_vector", replace=True, config=expected_config, name=None, train=True
)


@patch("lancedb.table.AsyncTable.create_index")
def test_create_index_name_and_train_parameters(
mock_create_index, mem_db: DBConnection
):
"""Test that name and train parameters are passed correctly to AsyncTable"""
table = mem_db.create_table(
"test",
data=[
{"vector": [3.1, 4.1], "id": 1},
{"vector": [5.9, 26.5], "id": 2},
],
)

# Test with custom name
table.create_index(vector_column_name="vector", name="my_custom_index")
expected_config = IvfPq()  # Default config
mock_create_index.assert_called_with(
"vector",
replace=True,
config=expected_config,
name="my_custom_index",
train=True,
)

# Test with train=False
table.create_index(vector_column_name="vector", train=False)
mock_create_index.assert_called_with(
"vector", replace=True, config=expected_config, name=None, train=False
)

# Test with both name and train
table.create_index(vector_column_name="vector", name="my_index_name", train=True)
mock_create_index.assert_called_with(
"vector", replace=True, config=expected_config, name="my_index_name", train=True
)


@@ -1235,11 +1274,13 @@ def test_create_scalar_index(mem_db: DBConnection):
"my_table",
data=test_data,
)
# Test with default name
table.create_scalar_index("x")
indices = table.list_indices()
assert len(indices) == 1
scalar_index = indices[0]
assert scalar_index.index_type == "BTree"
assert scalar_index.name == "x_idx"  # Default name

# Confirm that prefiltering still works with the scalar index column
results = table.search().where("x = 'c'").to_arrow()
@@ -1253,6 +1294,14 @@ def test_create_scalar_index(mem_db: DBConnection):
indices = table.list_indices()
assert len(indices) == 0

# Test with custom name
table.create_scalar_index("y", name="custom_y_index")
indices = table.list_indices()
assert len(indices) == 1
scalar_index = indices[0]
assert scalar_index.index_type == "BTree"
assert scalar_index.name == "custom_y_index"


def test_empty_query(mem_db: DBConnection):
table = mem_db.create_table(
26  python/python/tests/test_torch.py  Normal file
@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors

import pyarrow as pa
import pytest

torch = pytest.importorskip("torch")


def tbl_to_tensor(tbl):
def to_tensor(col: pa.ChunkedArray):
if col.num_chunks > 1:
raise Exception("Single batch was too large to fit into a one-chunk table")
return torch.from_dlpack(col.chunk(0))

return torch.stack([to_tensor(tbl.column(i)) for i in range(tbl.num_columns)])


def test_table_dataloader(mem_db):
table = mem_db.create_table("test_table", pa.table({"a": range(1000)}))
dataloader = torch.utils.data.DataLoader(
table, collate_fn=tbl_to_tensor, batch_size=10, shuffle=True
)
for batch in dataloader:
assert batch.size(0) == 1
assert batch.size(1) == 10
||||
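The collate function above stacks one DLPack-backed tensor per column, so each batch comes out shaped `(num_columns, batch_size)`; that is why the test asserts `size(0) == 1` for the single column `"a"` and `size(1) == 10` for the batch size. A small sketch of flipping that into the usual row-major layout:

```python
def tbl_to_row_major(tbl):
    # Same zero-copy DLPack conversion as tbl_to_tensor above, then
    # transpose to (batch_size, num_columns) for typical model inputs.
    return tbl_to_tensor(tbl).T
```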
@@ -63,14 +63,16 @@ impl Connection {
        self.get_inner().map(|inner| inner.uri().to_string())
    }

    #[pyo3(signature = (start_after=None, limit=None))]
    #[pyo3(signature = (namespace=vec![], start_after=None, limit=None))]
    pub fn table_names(
        self_: PyRef<'_, Self>,
        namespace: Vec<String>,
        start_after: Option<String>,
        limit: Option<u32>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();
        let mut op = inner.table_names();
        op = op.namespace(namespace);
        if let Some(start_after) = start_after {
            op = op.start_after(start_after);
        }
@@ -80,12 +82,13 @@ impl Connection {
        future_into_py(self_.py(), async move { op.execute().await.infer_error() })
    }

    #[pyo3(signature = (name, mode, data, storage_options=None))]
    #[pyo3(signature = (name, mode, data, namespace=vec![], storage_options=None))]
    pub fn create_table<'a>(
        self_: PyRef<'a, Self>,
        name: String,
        mode: &str,
        data: Bound<'_, PyAny>,
        namespace: Vec<String>,
        storage_options: Option<HashMap<String, String>>,
    ) -> PyResult<Bound<'a, PyAny>> {
        let inner = self_.get_inner()?.clone();
@@ -93,8 +96,10 @@ impl Connection {
        let mode = Self::parse_create_mode_str(mode)?;

        let batches = ArrowArrayStreamReader::from_pyarrow_bound(&data)?;

        let mut builder = inner.create_table(name, batches).mode(mode);

        builder = builder.namespace(namespace);
        if let Some(storage_options) = storage_options {
            builder = builder.storage_options(storage_options);
        }
@@ -105,12 +110,13 @@ impl Connection {
        })
    }

    #[pyo3(signature = (name, mode, schema, storage_options=None))]
    #[pyo3(signature = (name, mode, schema, namespace=vec![], storage_options=None))]
    pub fn create_empty_table<'a>(
        self_: PyRef<'a, Self>,
        name: String,
        mode: &str,
        schema: Bound<'_, PyAny>,
        namespace: Vec<String>,
        storage_options: Option<HashMap<String, String>>,
    ) -> PyResult<Bound<'a, PyAny>> {
        let inner = self_.get_inner()?.clone();
@@ -121,6 +127,7 @@ impl Connection {

        let mut builder = inner.create_empty_table(name, Arc::new(schema)).mode(mode);

        builder = builder.namespace(namespace);
        if let Some(storage_options) = storage_options {
            builder = builder.storage_options(storage_options);
        }
@@ -131,49 +138,115 @@ impl Connection {
        })
    }

    #[pyo3(signature = (name, storage_options = None, index_cache_size = None))]
    #[pyo3(signature = (name, namespace=vec![], storage_options = None, index_cache_size = None))]
    pub fn open_table(
        self_: PyRef<'_, Self>,
        name: String,
        namespace: Vec<String>,
        storage_options: Option<HashMap<String, String>>,
        index_cache_size: Option<u32>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();

        let mut builder = inner.open_table(name);
        builder = builder.namespace(namespace);
        if let Some(storage_options) = storage_options {
            builder = builder.storage_options(storage_options);
        }
        if let Some(index_cache_size) = index_cache_size {
            builder = builder.index_cache_size(index_cache_size);
        }

        future_into_py(self_.py(), async move {
            let table = builder.execute().await.infer_error()?;
            Ok(Table::new(table))
        })
    }

    #[pyo3(signature = (cur_name, new_name, cur_namespace=vec![], new_namespace=vec![]))]
    pub fn rename_table(
        self_: PyRef<'_, Self>,
        old_name: String,
        cur_name: String,
        new_name: String,
        cur_namespace: Vec<String>,
        new_namespace: Vec<String>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();
        future_into_py(self_.py(), async move {
            inner.rename_table(old_name, new_name).await.infer_error()
            inner
                .rename_table(cur_name, new_name, &cur_namespace, &new_namespace)
                .await
                .infer_error()
        })
    }

    pub fn drop_table(self_: PyRef<'_, Self>, name: String) -> PyResult<Bound<'_, PyAny>> {
    #[pyo3(signature = (name, namespace=vec![]))]
    pub fn drop_table(
        self_: PyRef<'_, Self>,
        name: String,
        namespace: Vec<String>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();
        future_into_py(self_.py(), async move {
            inner.drop_table(name).await.infer_error()
            inner.drop_table(name, &namespace).await.infer_error()
        })
    }

    pub fn drop_all_tables(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
    #[pyo3(signature = (namespace=vec![],))]
    pub fn drop_all_tables(
        self_: PyRef<'_, Self>,
        namespace: Vec<String>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();
        future_into_py(self_.py(), async move {
            inner.drop_all_tables().await.infer_error()
            inner.drop_all_tables(&namespace).await.infer_error()
        })
    }

    // Namespace management methods

    #[pyo3(signature = (namespace=vec![], page_token=None, limit=None))]
    pub fn list_namespaces(
        self_: PyRef<'_, Self>,
        namespace: Vec<String>,
        page_token: Option<String>,
        limit: Option<u32>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();
        future_into_py(self_.py(), async move {
            use lancedb::database::ListNamespacesRequest;
            let request = ListNamespacesRequest {
                namespace,
                page_token,
                limit,
            };
            inner.list_namespaces(request).await.infer_error()
        })
    }

    #[pyo3(signature = (namespace,))]
    pub fn create_namespace(
        self_: PyRef<'_, Self>,
        namespace: Vec<String>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();
        future_into_py(self_.py(), async move {
            use lancedb::database::CreateNamespaceRequest;
            let request = CreateNamespaceRequest { namespace };
            inner.create_namespace(request).await.infer_error()
        })
    }

    #[pyo3(signature = (namespace,))]
    pub fn drop_namespace(
        self_: PyRef<'_, Self>,
        namespace: Vec<String>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.get_inner()?.clone();
        future_into_py(self_.py(), async move {
            use lancedb::database::DropNamespaceRequest;
            let request = DropNamespaceRequest { namespace };
            inner.drop_namespace(request).await.infer_error()
        })
    }
}
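The bindings above surface namespace-aware operations to Python. A rough usage sketch, assuming the public async wrappers forward these arguments unchanged (method names are taken from the pyo3 signatures above; the `namespace=` keyword is what this change adds):

```python
import asyncio
import lancedb

async def main():
    db = await lancedb.connect_async("data/sample-lancedb")

    # Namespace CRUD (local listing databases reject these; see the
    # NotSupported errors in the listing.rs hunk further down)
    await db.create_namespace(["prod"])
    print(await db.list_namespaces())

    # Table operations scoped to a namespace; empty list = root namespace
    await db.create_table(
        "my_table", data=[{"vector": [1.0, 2.0]}], namespace=["prod"]
    )
    print(await db.table_names(namespace=["prod"]))
    await db.drop_table("my_table", namespace=["prod"])
    await db.drop_namespace(["prod"])

asyncio.run(main())
```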
@@ -227,6 +300,7 @@ pub struct PyClientConfig {
    retry_config: Option<PyClientRetryConfig>,
    timeout_config: Option<PyClientTimeoutConfig>,
    extra_headers: Option<HashMap<String, String>>,
    id_delimiter: Option<String>,
}

#[derive(FromPyObject)]
@@ -281,6 +355,7 @@ impl From<PyClientConfig> for lancedb::remote::ClientConfig {
            retry_config: value.retry_config.map(Into::into).unwrap_or_default(),
            timeout_config: value.timeout_config.map(Into::into).unwrap_or_default(),
            extra_headers: value.extra_headers.unwrap_or_default(),
            id_delimiter: value.id_delimiter,
        }
    }
}
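For remote connections, the new `id_delimiter` knob flows from Python through `PyClientConfig` into the Rust client. A hedged sketch of setting it (the `ClientConfig` class lives in `lancedb.remote`; the `id_delimiter` field is introduced by this change, and non-default values are sent as a query parameter per the client.rs hunk near the end of this diff):

```python
import lancedb
from lancedb.remote import ClientConfig

db = lancedb.connect(
    "db://my-project",          # hypothetical LanceDB Cloud URI
    api_key="sk-...",
    client_config=ClientConfig(
        extra_headers={"x-my-header": "value"},
        id_delimiter="$",       # "$" is the default; other values are passed through
    ),
)
```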
@@ -13,10 +13,12 @@ use lancedb::index::scalar::{
    BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
    Operator, PhraseQuery,
};
use lancedb::query::QueryBase;
use lancedb::query::QueryExecutionOptions;
use lancedb::query::QueryFilter;
use lancedb::query::{
    ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
    ExecutableQuery, Query as LanceDbQuery, Select, TakeQuery as LanceDbTakeQuery,
    VectorQuery as LanceDbVectorQuery,
};
use lancedb::table::AnyQuery;
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
@@ -488,6 +490,76 @@ impl Query {
    }
}

#[pyclass]
pub struct TakeQuery {
    inner: LanceDbTakeQuery,
}

impl TakeQuery {
    pub fn new(query: LanceDbTakeQuery) -> Self {
        Self { inner: query }
    }
}

#[pymethods]
impl TakeQuery {
    pub fn select(&mut self, columns: Vec<(String, String)>) {
        self.inner = self.inner.clone().select(Select::dynamic(&columns));
    }

    pub fn select_columns(&mut self, columns: Vec<String>) {
        self.inner = self.inner.clone().select(Select::columns(&columns));
    }

    pub fn with_row_id(&mut self) {
        self.inner = self.inner.clone().with_row_id();
    }

    #[pyo3(signature = (max_batch_length=None, timeout=None))]
    pub fn execute(
        self_: PyRef<'_, Self>,
        max_batch_length: Option<u32>,
        timeout: Option<Duration>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner.clone();
        future_into_py(self_.py(), async move {
            let mut opts = QueryExecutionOptions::default();
            if let Some(max_batch_length) = max_batch_length {
                opts.max_batch_length = max_batch_length;
            }
            if let Some(timeout) = timeout {
                opts.timeout = Some(timeout);
            }
            let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
            Ok(RecordBatchStream::new(inner_stream))
        })
    }

    pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner.clone();
        future_into_py(self_.py(), async move {
            inner
                .explain_plan(verbose)
                .await
                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
        })
    }

    pub fn analyze_plan(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner.clone();
        future_into_py(self_.py(), async move {
            inner
                .analyze_plan()
                .await
                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
        })
    }

    pub fn to_query_request(&self) -> PyQueryRequest {
        PyQueryRequest::from(AnyQuery::Query(self.inner.clone().into_request()))
    }
}

#[pyclass]
#[derive(Clone)]
pub struct FTSQuery {
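At the Python layer these bindings become a take-query object. A hedged sketch of driving it through the async API (method names follow the pyo3 signatures above, which mutate the query in place; the exact public wrappers may differ):

```python
import asyncio
import lancedb

async def main():
    db = await lancedb.connect_async("data/sample-lancedb")
    table = await db.open_table("my_table")

    # Build a take query from dataset offsets, then narrow the projection
    query = table.take_offsets([5, 1, 17])
    query.select_columns(["id"])
    query.with_row_id()

    print(await query.explain_plan(verbose=False))
    stream = await query.execute()  # async stream of Arrow record batches

asyncio.run(main())
```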
@@ -5,7 +5,7 @@ use std::{collections::HashMap, sync::Arc};
use crate::{
    error::PythonErrorExt,
    index::{extract_index_params, IndexConfig},
    query::Query,
    query::{Query, TakeQuery},
};
use arrow::{
    datatypes::{DataType, Schema},
@@ -341,13 +341,15 @@ impl Table {
        })
    }

    #[pyo3(signature = (column, index=None, replace=None, wait_timeout=None))]
    #[pyo3(signature = (column, index=None, replace=None, wait_timeout=None, *, name=None, train=None))]
    pub fn create_index<'a>(
        self_: PyRef<'a, Self>,
        column: String,
        index: Option<Bound<'_, PyAny>>,
        replace: Option<bool>,
        wait_timeout: Option<Bound<'_, PyAny>>,
        name: Option<String>,
        train: Option<bool>,
    ) -> PyResult<Bound<'a, PyAny>> {
        let index = extract_index_params(&index)?;
        let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
@@ -357,6 +359,12 @@ impl Table {
        if let Some(replace) = replace {
            op = op.replace(replace);
        }
        if let Some(name) = name {
            op = op.name(name);
        }
        if let Some(train) = train {
            op = op.train(train);
        }

        future_into_py(self_.py(), async move {
            op.execute().await.infer_error()?;
@@ -568,6 +576,20 @@ impl Table {
        Ok(Tags::new(self.inner_ref()?.clone()))
    }

    #[pyo3(signature = (offsets))]
    pub fn take_offsets(self_: PyRef<'_, Self>, offsets: Vec<u64>) -> PyResult<TakeQuery> {
        Ok(TakeQuery::new(
            self_.inner_ref()?.clone().take_offsets(offsets),
        ))
    }

    #[pyo3(signature = (row_ids))]
    pub fn take_row_ids(self_: PyRef<'_, Self>, row_ids: Vec<u64>) -> PyResult<TakeQuery> {
        Ok(TakeQuery::new(
            self_.inner_ref()?.clone().take_row_ids(row_ids),
        ))
    }

    /// Optimize the on-disk data by compacting and pruning old data, for better performance.
    #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
    pub fn optimize(
@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.21.2"
version = "0.22.0-beta.1"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -65,7 +65,11 @@ http = { version = "1", optional = true } # Matching what is in reqwest
uuid = { version = "1.7.0", features = ["v4"], optional = true }
polars-arrow = { version = ">=0.37,<0.40.0", optional = true }
polars = { version = ">=0.37,<0.40.0", optional = true }
hf-hub = { version = "0.4.1", optional = true, default-features = false, features = ["rustls-tls", "tokio", "ureq"]}
hf-hub = { version = "0.4.1", optional = true, default-features = false, features = [
    "rustls-tls",
    "tokio",
    "ureq",
] }
candle-core = { version = "0.9.1", optional = true }
candle-transformers = { version = "0.9.1", optional = true }
candle-nn = { version = "0.9.1", optional = true }
@@ -93,7 +97,12 @@ rstest = "0.23.0"


[features]
default = []
default = ["aws", "gcs", "azure", "dynamodb", "oss"]
aws = ["lance/aws", "lance-io/aws"]
oss = ["lance/oss", "lance-io/oss"]
gcs = ["lance/gcp", "lance-io/gcp"]
azure = ["lance/azure", "lance-io/azure"]
dynamodb = ["lance/dynamodb", "aws"]
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
fp16kernels = ["lance-linalg/fp16kernels"]
s3-test = []
@@ -119,3 +128,16 @@ required-features = ["sentence-transformers"]
[[example]]
name = "bedrock"
required-features = ["bedrock"]

[[example]]
name = "simple"

[[example]]
name = "full_text_search"

[[example]]
name = "ivf_pq"

[[example]]
name = "hybrid_search"
required-features = ["sentence-transformers"]
114  rust/lancedb/examples/hybrid_search.rs  Normal file
@@ -0,0 +1,114 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

use arrow_array::{RecordBatch, RecordBatchIterator, StringArray};
use arrow_schema::{DataType, Field, Schema};
use futures::TryStreamExt;
use lance_index::scalar::FullTextSearchQuery;
use lancedb::index::scalar::FtsIndexBuilder;
use lancedb::index::Index;
use lancedb::{
    arrow::IntoArrow,
    connect,
    embeddings::{
        sentence_transformers::SentenceTransformersEmbeddings, EmbeddingDefinition,
        EmbeddingFunction,
    },
    query::{QueryBase, QueryExecutionOptions},
    Result, Table,
};
use std::{iter::once, sync::Arc};

#[tokio::main]
async fn main() -> Result<()> {
    let tempdir = tempfile::tempdir().unwrap();
    let tempdir = tempdir.path().to_str().unwrap();
    let embedding = SentenceTransformersEmbeddings::builder().build()?;
    let embedding = Arc::new(embedding);
    let db = connect(tempdir).execute().await?;
    db.embedding_registry()
        .register("sentence-transformers", embedding.clone())?;

    // Create the table with embeddings
    let table = db
        .create_table("vectors", make_data())
        .add_embedding(EmbeddingDefinition::new(
            "facts",
            "sentence-transformers",
            Some("embeddings"),
        ))?
        .execute()
        .await?;

    // Create the FTS index
    create_index(&table).await?;

    // Perform a hybrid search using the FTS index and embeddings
    let query_str = "world records";
    let query = Arc::new(StringArray::from_iter_values(once(query_str)));
    let query_vector = embedding.compute_query_embeddings(query)?;
    let mut results = table
        .query()
        .full_text_search(FullTextSearchQuery::new(query_str.to_owned()))
        .nearest_to(query_vector)?
        .limit(5)
        .execute_hybrid(QueryExecutionOptions::default())
        .await?;

    while let Some(batch) = results.try_next().await? {
        let out = batch
            .column_by_name("facts")
            .unwrap()
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        for text in out.iter().flatten() {
            println!("Result: {}", text);
        }
    }

    Ok(())
}

fn make_data() -> impl IntoArrow {
    let schema = Schema::new(vec![Field::new("facts", DataType::Utf8, false)]);

    let facts = StringArray::from_iter_values(vec![
        "Albert Einstein was a theoretical physicist.",
        "The capital of France is Paris.",
        "The Great Wall of China is one of the Seven Wonders of the World.",
        "Python is a popular programming language.",
        "Mount Everest is the highest mountain in the world.",
        "Leonardo da Vinci painted the Mona Lisa.",
        "Shakespeare wrote Hamlet.",
        "The human body has 206 bones.",
        "The speed of light is approximately 299,792 kilometers per second.",
        "Water boils at 100 degrees Celsius.",
        "The Earth orbits the Sun.",
        "The Pyramids of Giza are located in Egypt.",
        "Coffee is one of the most popular beverages in the world.",
        "Tokyo is the capital city of Japan.",
        "Photosynthesis is the process by which plants make their food.",
        "The Pacific Ocean is the largest ocean on Earth.",
        "Mozart was a prolific composer of classical music.",
        "The Internet is a global network of computers.",
        "Basketball is a sport played with a ball and a hoop.",
        "The first computer virus was created in 1983.",
        "Artificial neural networks are inspired by the human brain.",
        "Deep learning is a subset of machine learning.",
        "IBM's Watson won Jeopardy! in 2011.",
        "The first computer programmer was Ada Lovelace.",
        "The first chatbot was ELIZA, created in the 1960s.",
    ]);
    let schema = Arc::new(schema);
    let rb = RecordBatch::try_new(schema.clone(), vec![Arc::new(facts)]).unwrap();
    Box::new(RecordBatchIterator::new(vec![Ok(rb)], schema))
}

async fn create_index(table: &Table) -> Result<()> {
    table
        .create_index(&["facts"], Index::FTS(FtsIndexBuilder::default()))
        .execute()
        .await?;
    Ok(())
}
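The same flow in the Python SDK, as a rough sketch (hybrid search via `query_type="hybrid"` and a registered embedding function; these names follow the Python docs rather than this diff, so treat them as assumptions):

```python
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

model = get_registry().get("sentence-transformers").create()

class Facts(LanceModel):
    facts: str = model.SourceField()
    embeddings: Vector(model.ndims()) = model.VectorField()

db = lancedb.connect("data/sample-lancedb")
table = db.create_table("vectors", schema=Facts)
table.add([{"facts": "Mount Everest is the highest mountain in the world."}])
table.create_fts_index("facts")

# Combines the FTS index and the vector column, as in the Rust example above
results = table.search("world records", query_type="hybrid").limit(5).to_pandas()
```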
@@ -43,7 +43,7 @@ async fn main() -> Result<()> {
    // --8<-- [end:delete]

    // --8<-- [start:drop_table]
    db.drop_table("my_table").await.unwrap();
    db.drop_table("my_table", &[]).await.unwrap();
    // --8<-- [end:drop_table]
    Ok(())
}
@@ -379,6 +379,7 @@ mod tests {
            data: CreateTableData::Empty(TableDefinition::new_from_schema(dummy_schema)),
            mode: Default::default(),
            write_options: Default::default(),
            namespace: vec![],
        })
        .await
        .unwrap();
@@ -414,6 +415,7 @@ mod tests {
            data: CreateTableData::Empty(TableDefinition::new_from_schema(dummy_schema)),
            mode: Default::default(),
            write_options: Default::default(),
            namespace: vec![],
        })
        .await
        .unwrap();
@@ -9,6 +9,7 @@ use std::sync::Arc;
use arrow_array::RecordBatchReader;
use arrow_schema::{Field, SchemaRef};
use lance::dataset::ReadParams;
#[cfg(feature = "aws")]
use object_store::aws::AwsCredential;

use crate::arrow::{IntoArrow, IntoArrowStream, SendableRecordBatchStream};
@@ -18,8 +19,9 @@ use crate::database::listing::{
    ListingDatabase, OPT_NEW_TABLE_STORAGE_VERSION, OPT_NEW_TABLE_V2_MANIFEST_PATHS,
};
use crate::database::{
    CreateTableData, CreateTableMode, CreateTableRequest, Database, DatabaseOptions,
    OpenTableRequest, TableNamesRequest,
    CreateNamespaceRequest, CreateTableData, CreateTableMode, CreateTableRequest, Database,
    DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest, OpenTableRequest,
    TableNamesRequest,
};
use crate::embeddings::{
    EmbeddingDefinition, EmbeddingFunction, EmbeddingRegistry, MemoryRegistry, WithEmbeddings,
@@ -66,6 +68,12 @@ impl TableNamesBuilder {
        self
    }

    /// Set the namespace to list tables from
    pub fn namespace(mut self, namespace: Vec<String>) -> Self {
        self.request.namespace = namespace;
        self
    }

    /// Execute the table names operation
    pub async fn execute(self) -> Result<Vec<String>> {
        self.parent.clone().table_names(self.request).await
@@ -347,6 +355,12 @@ impl<const HAS_DATA: bool> CreateTableBuilder<HAS_DATA> {
        );
        self
    }

    /// Set the namespace for the table
    pub fn namespace(mut self, namespace: Vec<String>) -> Self {
        self.request.namespace = namespace;
        self
    }
}

#[derive(Clone, Debug)]
@@ -366,6 +380,7 @@ impl OpenTableBuilder {
            parent,
            request: OpenTableRequest {
                name,
                namespace: vec![],
                index_cache_size: None,
                lance_read_params: None,
            },
@@ -441,6 +456,12 @@ impl OpenTableBuilder {
        self
    }

    /// Set the namespace for the table
    pub fn namespace(mut self, namespace: Vec<String>) -> Self {
        self.request.namespace = namespace;
        self
    }

    /// Open the table
    pub async fn execute(self) -> Result<Table> {
        Ok(Table::new_with_embedding_registry(
@@ -563,9 +584,16 @@ impl Connection {
        &self,
        old_name: impl AsRef<str>,
        new_name: impl AsRef<str>,
        cur_namespace: &[String],
        new_namespace: &[String],
    ) -> Result<()> {
        self.internal
            .rename_table(old_name.as_ref(), new_name.as_ref())
            .rename_table(
                old_name.as_ref(),
                new_name.as_ref(),
                cur_namespace,
                new_namespace,
            )
            .await
    }

@@ -573,8 +601,9 @@ impl Connection {
    ///
    /// # Arguments
    /// * `name` - The name of the table to drop
    pub async fn drop_table(&self, name: impl AsRef<str>) -> Result<()> {
        self.internal.drop_table(name.as_ref()).await
    /// * `namespace` - The namespace to drop the table from
    pub async fn drop_table(&self, name: impl AsRef<str>, namespace: &[String]) -> Result<()> {
        self.internal.drop_table(name.as_ref(), namespace).await
    }

    /// Drop the database
@@ -582,12 +611,30 @@ impl Connection {
    /// This is the same as dropping all of the tables
    #[deprecated(since = "0.15.1", note = "Use `drop_all_tables` instead")]
    pub async fn drop_db(&self) -> Result<()> {
        self.internal.drop_all_tables().await
        self.internal.drop_all_tables(&[]).await
    }

    /// Drops all tables in the database
    pub async fn drop_all_tables(&self) -> Result<()> {
        self.internal.drop_all_tables().await
    ///
    /// # Arguments
    /// * `namespace` - The namespace to drop all tables from. Empty slice represents root namespace.
    pub async fn drop_all_tables(&self, namespace: &[String]) -> Result<()> {
        self.internal.drop_all_tables(namespace).await
    }

    /// List immediate child namespace names in the given namespace
    pub async fn list_namespaces(&self, request: ListNamespacesRequest) -> Result<Vec<String>> {
        self.internal.list_namespaces(request).await
    }

    /// Create a new namespace
    pub async fn create_namespace(&self, request: CreateNamespaceRequest) -> Result<()> {
        self.internal.create_namespace(request).await
    }

    /// Drop a namespace
    pub async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result<()> {
        self.internal.drop_namespace(request).await
    }

    /// Get the in-memory embedding registry.
@@ -749,6 +796,7 @@ impl ConnectBuilder {
    }

    /// [`AwsCredential`] to use when connecting to S3.
    #[cfg(feature = "aws")]
    #[deprecated(note = "Pass through storage_options instead")]
    pub fn aws_creds(mut self, aws_creds: AwsCredential) -> Self {
        self.request
@@ -1218,12 +1266,12 @@ mod tests {

        // drop non-exist table
        assert!(matches!(
            db.drop_table("invalid_table").await,
            db.drop_table("invalid_table", &[]).await,
            Err(crate::Error::TableNotFound { .. }),
        ));

        create_dir_all(tmp_dir.path().join("table1.lance")).unwrap();
        db.drop_table("table1").await.unwrap();
        db.drop_table("table1", &[]).await.unwrap();

        let tables = db.table_names().execute().await.unwrap();
        assert_eq!(tables.len(), 0);
@@ -34,9 +34,36 @@ pub trait DatabaseOptions {
    fn serialize_into_map(&self, map: &mut HashMap<String, String>);
}

/// A request to list namespaces in the database
#[derive(Clone, Debug, Default)]
pub struct ListNamespacesRequest {
    /// The parent namespace to list namespaces in. Empty list represents root namespace.
    pub namespace: Vec<String>,
    /// If present, only return names that come lexicographically after the supplied value.
    pub page_token: Option<String>,
    /// The maximum number of namespace names to return
    pub limit: Option<u32>,
}

/// A request to create a namespace
#[derive(Clone, Debug)]
pub struct CreateNamespaceRequest {
    /// The namespace identifier to create
    pub namespace: Vec<String>,
}

/// A request to drop a namespace
#[derive(Clone, Debug)]
pub struct DropNamespaceRequest {
    /// The namespace identifier to drop
    pub namespace: Vec<String>,
}

/// A request to list names of tables in the database
#[derive(Clone, Debug, Default)]
pub struct TableNamesRequest {
    /// The namespace to list tables in. Empty list represents root namespace.
    pub namespace: Vec<String>,
    /// If present, only return names that come lexicographically after the supplied
    /// value.
    ///
@@ -51,6 +78,8 @@ pub struct TableNamesRequest {
#[derive(Clone, Debug)]
pub struct OpenTableRequest {
    pub name: String,
    /// The namespace to open the table from. Empty list represents root namespace.
    pub namespace: Vec<String>,
    pub index_cache_size: Option<u32>,
    pub lance_read_params: Option<ReadParams>,
}
@@ -125,6 +154,8 @@ impl StreamingWriteSource for CreateTableData {
pub struct CreateTableRequest {
    /// The name of the new table
    pub name: String,
    /// The namespace to create the table in. Empty list represents root namespace.
    pub namespace: Vec<String>,
    /// Initial data to write to the table, can be None to create an empty table
    pub data: CreateTableData,
    /// The mode to use when creating the table
@@ -137,6 +168,7 @@ impl CreateTableRequest {
    pub fn new(name: String, data: CreateTableData) -> Self {
        Self {
            name,
            namespace: vec![],
            data,
            mode: CreateTableMode::default(),
            write_options: WriteOptions::default(),
@@ -151,6 +183,12 @@ impl CreateTableRequest {
pub trait Database:
    Send + Sync + std::any::Any + std::fmt::Debug + std::fmt::Display + 'static
{
    /// List immediate child namespace names in the given namespace
    async fn list_namespaces(&self, request: ListNamespacesRequest) -> Result<Vec<String>>;
    /// Create a new namespace
    async fn create_namespace(&self, request: CreateNamespaceRequest) -> Result<()>;
    /// Drop a namespace
    async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result<()>;
    /// List the names of tables in the database
    async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>>;
    /// Create a table in the database
@@ -158,10 +196,16 @@ pub trait Database:
    /// Open a table in the database
    async fn open_table(&self, request: OpenTableRequest) -> Result<Arc<dyn BaseTable>>;
    /// Rename a table in the database
    async fn rename_table(&self, old_name: &str, new_name: &str) -> Result<()>;
    async fn rename_table(
        &self,
        cur_name: &str,
        new_name: &str,
        cur_namespace: &[String],
        new_namespace: &[String],
    ) -> Result<()>;
    /// Drop a table in the database
    async fn drop_table(&self, name: &str) -> Result<()>;
    async fn drop_table(&self, name: &str, namespace: &[String]) -> Result<()>;
    /// Drop all tables in the database
    async fn drop_all_tables(&self) -> Result<()>;
    async fn drop_all_tables(&self, namespace: &[String]) -> Result<()>;
    fn as_any(&self) -> &dyn std::any::Any;
}
@@ -22,7 +22,8 @@ use crate::table::NativeTable;
use crate::utils::validate_table_name;

use super::{
    BaseTable, CreateTableMode, CreateTableRequest, Database, DatabaseOptions, OpenTableRequest,
    BaseTable, CreateNamespaceRequest, CreateTableMode, CreateTableRequest, Database,
    DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest, OpenTableRequest,
    TableNamesRequest,
};

@@ -551,6 +552,7 @@ impl ListingDatabase {
    async fn handle_table_exists(
        &self,
        table_name: &str,
        namespace: Vec<String>,
        mode: CreateTableMode,
        data_schema: &arrow_schema::Schema,
    ) -> Result<Arc<dyn BaseTable>> {
@@ -561,6 +563,7 @@ impl ListingDatabase {
            CreateTableMode::ExistOk(callback) => {
                let req = OpenTableRequest {
                    name: table_name.to_string(),
                    namespace: namespace.clone(),
                    index_cache_size: None,
                    lance_read_params: None,
                };
@@ -584,7 +587,28 @@ impl ListingDatabase {

#[async_trait::async_trait]
impl Database for ListingDatabase {
    async fn list_namespaces(&self, _request: ListNamespacesRequest) -> Result<Vec<String>> {
        Ok(Vec::new())
    }

    async fn create_namespace(&self, _request: CreateNamespaceRequest) -> Result<()> {
        Err(Error::NotSupported {
            message: "Namespace operations are not supported for listing database".into(),
        })
    }

    async fn drop_namespace(&self, _request: DropNamespaceRequest) -> Result<()> {
        Err(Error::NotSupported {
            message: "Namespace operations are not supported for listing database".into(),
        })
    }

    async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>> {
        if !request.namespace.is_empty() {
            return Err(Error::NotSupported {
                message: "Namespace parameter is not supported for listing database. Only root namespace is supported.".into(),
            });
        }
        let mut f = self
            .object_store
            .read_dir(self.base_path.clone())
@@ -615,6 +639,11 @@ impl Database for ListingDatabase {
    }

    async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
        if !request.namespace.is_empty() {
            return Err(Error::NotSupported {
                message: "Namespace parameter is not supported for listing database. Only root namespace is supported.".into(),
            });
        }
        let table_uri = self.table_uri(&request.name)?;

        let (storage_version_override, v2_manifest_override) =
@@ -637,14 +666,24 @@ impl Database for ListingDatabase {
        {
            Ok(table) => Ok(Arc::new(table)),
            Err(Error::TableAlreadyExists { .. }) => {
                self.handle_table_exists(&request.name, request.mode, &data_schema)
                    .await
                self.handle_table_exists(
                    &request.name,
                    request.namespace.clone(),
                    request.mode,
                    &data_schema,
                )
                .await
            }
            Err(err) => Err(err),
        }
    }

    async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
        if !request.namespace.is_empty() {
            return Err(Error::NotSupported {
                message: "Namespace parameter is not supported for listing database. Only root namespace is supported.".into(),
            });
        }
        let table_uri = self.table_uri(&request.name)?;

        // Only modify the storage options if we actually have something to
@@ -694,17 +733,44 @@ impl Database for ListingDatabase {
        Ok(native_table)
    }

    async fn rename_table(&self, _old_name: &str, _new_name: &str) -> Result<()> {
    async fn rename_table(
        &self,
        _cur_name: &str,
        _new_name: &str,
        cur_namespace: &[String],
        new_namespace: &[String],
    ) -> Result<()> {
        if !cur_namespace.is_empty() {
            return Err(Error::NotSupported {
                message: "Namespace parameter is not supported for listing database.".into(),
            });
        }
        if !new_namespace.is_empty() {
            return Err(Error::NotSupported {
                message: "Namespace parameter is not supported for listing database.".into(),
            });
        }
        Err(Error::NotSupported {
            message: "rename_table is not supported in LanceDB OSS".to_string(),
            message: "rename_table is not supported in LanceDB OSS".into(),
        })
    }

    async fn drop_table(&self, name: &str) -> Result<()> {
    async fn drop_table(&self, name: &str, namespace: &[String]) -> Result<()> {
        if !namespace.is_empty() {
            return Err(Error::NotSupported {
                message: "Namespace parameter is not supported for listing database.".into(),
            });
        }
        self.drop_tables(vec![name.to_string()]).await
    }

    async fn drop_all_tables(&self) -> Result<()> {
    async fn drop_all_tables(&self, namespace: &[String]) -> Result<()> {
        // Check if namespace parameter is provided
        if !namespace.is_empty() {
            return Err(Error::NotSupported {
                message: "Namespace parameter is not supported for listing database.".into(),
            });
        }
        let tables = self.table_names(TableNamesRequest::default()).await?;
        self.drop_tables(tables).await
    }
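These guards mean namespace arguments are accepted everywhere but only honored by backends that implement them; a local listing database allows only the root namespace. A hedged Python sketch of the observable behavior (assuming the public async API exposes the bindings shown earlier):

```python
db = await lancedb.connect_async("data/local-db")  # listing database

print(await db.list_namespaces())  # [] -- listing databases report no children

try:
    await db.table_names(namespace=["prod"])
except Exception as err:
    # "Namespace parameter is not supported for listing database. ..."
    print(err)
```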
@@ -65,12 +65,94 @@ pub enum Index {
/// Builder for the create_index operation
///
/// The methods on this builder are used to specify options common to all indices.
///
/// # Examples
///
/// Creating a basic vector index:
///
/// ```
/// use lancedb::{connect, index::{Index, vector::IvfPqIndexBuilder}};
///
/// # async fn create_basic_vector_index() -> lancedb::Result<()> {
/// let db = connect("data/sample-lancedb").execute().await?;
/// let table = db.open_table("my_table").execute().await?;
///
/// // Create a vector index with default settings
/// table
///     .create_index(&["vector"], Index::IvfPq(IvfPqIndexBuilder::default()))
///     .execute()
///     .await?;
/// # Ok(())
/// # }
/// ```
///
/// Creating an index with a custom name:
///
/// ```
/// use lancedb::{connect, index::{Index, vector::IvfPqIndexBuilder}};
///
/// # async fn create_named_index() -> lancedb::Result<()> {
/// let db = connect("data/sample-lancedb").execute().await?;
/// let table = db.open_table("my_table").execute().await?;
///
/// // Create a vector index with a custom name
/// table
///     .create_index(&["embeddings"], Index::IvfPq(IvfPqIndexBuilder::default()))
///     .name("my_embeddings_index".to_string())
///     .execute()
///     .await?;
/// # Ok(())
/// # }
/// ```
///
/// Creating an untrained index (for scalar indices only):
///
/// ```
/// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
///
/// # async fn create_untrained_index() -> lancedb::Result<()> {
/// let db = connect("data/sample-lancedb").execute().await?;
/// let table = db.open_table("my_table").execute().await?;
///
/// // Create a BTree index without training (creates empty index)
/// table
///     .create_index(&["category"], Index::BTree(BTreeIndexBuilder::default()))
///     .train(false)
///     .name("category_index".to_string())
///     .execute()
///     .await?;
/// # Ok(())
/// # }
/// ```
///
/// Creating a scalar index with all options:
///
/// ```
/// use lancedb::{connect, index::{Index, scalar::BitmapIndexBuilder}};
///
/// # async fn create_full_options_index() -> lancedb::Result<()> {
/// let db = connect("data/sample-lancedb").execute().await?;
/// let table = db.open_table("my_table").execute().await?;
///
/// // Create a bitmap index with full configuration
/// table
///     .create_index(&["status"], Index::Bitmap(BitmapIndexBuilder::default()))
///     .name("status_bitmap_index".to_string())
///     .train(true) // Train the index with existing data
///     .replace(false) // Don't replace if index already exists
///     .execute()
///     .await?;
/// # Ok(())
/// # }
/// ```
pub struct IndexBuilder {
    parent: Arc<dyn BaseTable>,
    pub(crate) index: Index,
    pub(crate) columns: Vec<String>,
    pub(crate) replace: bool,
    pub(crate) wait_timeout: Option<Duration>,
    pub(crate) train: bool,
    pub(crate) name: Option<String>,
}

impl IndexBuilder {
@@ -80,7 +162,9 @@ impl IndexBuilder {
            index,
            columns,
            replace: true,
            train: true,
            wait_timeout: None,
            name: None,
        }
    }

@@ -94,6 +178,82 @@ impl IndexBuilder {
        self
    }

    /// The name of the index. If not set, a default name will be generated.
    ///
    /// # Examples
    ///
    /// ```
    /// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
    ///
    /// # async fn name_example() -> lancedb::Result<()> {
    /// let db = connect("data/sample-lancedb").execute().await?;
    /// let table = db.open_table("my_table").execute().await?;
    ///
    /// // Create an index with a custom name
    /// table
    ///     .create_index(&["user_id"], Index::BTree(BTreeIndexBuilder::default()))
    ///     .name("user_id_btree_index".to_string())
    ///     .execute()
    ///     .await?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn name(mut self, v: String) -> Self {
        self.name = Some(v);
        self
    }

    /// Whether to train the index, the default is `true`.
    ///
    /// If this is false, the index will not be trained and just created empty.
    ///
    /// This is not supported for vector indices yet.
    ///
    /// # Examples
    ///
    /// Creating an empty index that will be populated later:
    ///
    /// ```
    /// use lancedb::{connect, index::{Index, scalar::BitmapIndexBuilder}};
    ///
    /// # async fn train_false_example() -> lancedb::Result<()> {
    /// let db = connect("data/sample-lancedb").execute().await?;
    /// let table = db.open_table("my_table").execute().await?;
    ///
    /// // Create an empty bitmap index (not trained with existing data)
    /// table
    ///     .create_index(&["category"], Index::Bitmap(BitmapIndexBuilder::default()))
    ///     .train(false) // Create empty index
    ///     .name("category_bitmap".to_string())
    ///     .execute()
    ///     .await?;
    /// # Ok(())
    /// # }
    /// ```
    ///
    /// Creating a trained index (default behavior):
    ///
    /// ```
    /// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
    ///
    /// # async fn train_true_example() -> lancedb::Result<()> {
    /// let db = connect("data/sample-lancedb").execute().await?;
    /// let table = db.open_table("my_table").execute().await?;
    ///
    /// // Create a trained BTree index (includes existing data)
    /// table
    ///     .create_index(&["timestamp"], Index::BTree(BTreeIndexBuilder::default()))
    ///     .train(true) // Train with existing data (this is the default)
    ///     .execute()
    ///     .await?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn train(mut self, v: bool) -> Self {
        self.train = v;
        self
    }

    /// Duration of time to wait for asynchronous indexing to complete. If not set,
    /// `create_index()` will not wait.
    ///
@@ -9,7 +9,7 @@ use futures::{stream::BoxStream, TryFutureExt};
use lance::io::WrappingObjectStore;
use object_store::{
    path::Path, Error, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
    PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, UploadPart,
    PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, UploadPart,
};

use async_trait::async_trait;
@@ -73,7 +73,7 @@ impl ObjectStore for MirroringObjectStore {
    async fn put_multipart_opts(
        &self,
        location: &Path,
        opts: PutMultipartOpts,
        opts: PutMultipartOptions,
    ) -> Result<Box<dyn MultipartUpload>> {
        if location.primary_only() {
            return self.primary.put_multipart_opts(location, opts).await;
@@ -170,7 +170,11 @@ impl MirroringObjectStoreWrapper {
}

impl WrappingObjectStore for MirroringObjectStoreWrapper {
    fn wrap(&self, primary: Arc<dyn ObjectStore>) -> Arc<dyn ObjectStore> {
    fn wrap(
        &self,
        primary: Arc<dyn ObjectStore>,
        _storage_options: Option<&std::collections::HashMap<String, String>>,
    ) -> Arc<dyn ObjectStore> {
        Arc::new(MirroringObjectStore {
            primary,
            secondary: self.secondary.clone(),

@@ -11,7 +11,7 @@ use futures::stream::BoxStream;
use lance::io::WrappingObjectStore;
use object_store::{
    path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
    PutMultipartOpts, PutOptions, PutPayload, PutResult, Result as OSResult, UploadPart,
    PutMultipartOptions, PutOptions, PutPayload, PutResult, Result as OSResult, UploadPart,
};

#[derive(Debug, Default)]
@@ -50,7 +50,11 @@ impl IoStatsHolder {
}

impl WrappingObjectStore for IoStatsHolder {
    fn wrap(&self, target: Arc<dyn ObjectStore>) -> Arc<dyn ObjectStore> {
    fn wrap(
        &self,
        target: Arc<dyn ObjectStore>,
        _storage_options: Option<&std::collections::HashMap<String, String>>,
    ) -> Arc<dyn ObjectStore> {
        Arc::new(IoTrackingStore {
            target,
            stats: self.0.clone(),
@@ -106,7 +110,7 @@ impl ObjectStore for IoTrackingStore {
    async fn put_multipart_opts(
        &self,
        location: &Path,
        opts: PutMultipartOpts,
        opts: PutMultipartOptions,
    ) -> OSResult<Box<dyn MultipartUpload>> {
        let target = self.target.put_multipart_opts(location, opts).await?;
        Ok(Box::new(IoTrackingMultipartUpload {
@@ -1206,6 +1206,144 @@ impl HasQuery for VectorQuery {
    }
}

/// A builder for LanceDB take queries.
///
/// See [`crate::Table::query`] for more details on queries
///
/// A `TakeQuery` is a query that is used to select a subset of rows
/// from a table using dataset offsets or row ids.
///
/// See [`ExecutableQuery`] for methods that can be used to execute
/// the query and retrieve results.
///
/// This query object can be reused to issue the same query multiple
/// times.
#[derive(Debug, Clone)]
pub struct TakeQuery {
    parent: Arc<dyn BaseTable>,
    request: QueryRequest,
}

impl TakeQuery {
    /// Create a new `TakeQuery` that will return rows at the given offsets.
    ///
    /// See [`crate::Table::take_offsets`] for more details.
    pub fn from_offsets(parent: Arc<dyn BaseTable>, offsets: Vec<u64>) -> Self {
        let filter = format!(
            "_rowoffset in ({})",
            offsets
                .iter()
                .map(|o| o.to_string())
                .collect::<Vec<_>>()
                .join(",")
        );
        Self {
            parent,
            request: QueryRequest {
                filter: Some(QueryFilter::Sql(filter)),
                ..Default::default()
            },
        }
    }

    /// Create a new `TakeQuery` that will return rows with the given row ids.
    ///
    /// See [`crate::Table::take_row_ids`] for more details.
    pub fn from_row_ids(parent: Arc<dyn BaseTable>, row_ids: Vec<u64>) -> Self {
        let filter = format!(
            "_rowid in ({})",
            row_ids
                .iter()
                .map(|o| o.to_string())
                .collect::<Vec<_>>()
                .join(",")
        );
        Self {
            parent,
            request: QueryRequest {
                filter: Some(QueryFilter::Sql(filter)),
                ..Default::default()
            },
        }
    }

    /// Convert the `TakeQuery` into a `QueryRequest`.
    pub fn into_request(self) -> QueryRequest {
        self.request
    }

    /// Return the current `QueryRequest` for the `TakeQuery`.
    pub fn current_request(&self) -> &QueryRequest {
        &self.request
    }

    /// Return only the specified columns.
    ///
    /// By default a query will return all columns from the table. However, this can have
    /// a very significant impact on latency. LanceDb stores data in a columnar fashion. This
    /// means we can finely tune our I/O to select exactly the columns we need.
    ///
    /// As a best practice you should always limit queries to the columns that you need.
    ///
    /// You can also use this method to create new "dynamic" columns based on your existing columns.
    /// For example, you may not care about "a" or "b" but instead simply want "a + b". This is often
    /// seen in the SELECT clause of an SQL query (e.g. `SELECT a+b FROM my_table`).
    ///
    /// To create dynamic columns use [`Select::Dynamic`] (it might be easier to create this with the
    /// helper method [`Select::dynamic`]). A column will be returned for each tuple provided. The
    /// first value in that tuple provides the name of the column. The second value in the tuple is
    /// an SQL string used to specify how the column is calculated.
    ///
    /// For example, an SQL query might state `SELECT a + b AS combined, c`. The equivalent
    /// input to [`Select::dynamic`] would be `&[("combined", "a + b"), ("c", "c")]`.
    ///
    /// Columns will always be returned in the order given, even if that order is different than
    /// the order used when adding the data.
    pub fn select(mut self, selection: Select) -> Self {
        self.request.select = selection;
        self
    }

    /// Return the `_rowid` meta column from the Table.
    pub fn with_row_id(mut self) -> Self {
        self.request.with_row_id = true;
        self
    }
}

impl HasQuery for TakeQuery {
    fn mut_query(&mut self) -> &mut QueryRequest {
        &mut self.request
    }
}

impl ExecutableQuery for TakeQuery {
    async fn create_plan(&self, options: QueryExecutionOptions) -> Result<Arc<dyn ExecutionPlan>> {
        let req = AnyQuery::Query(self.request.clone());
        self.parent.clone().create_plan(&req, options).await
    }

    async fn execute_with_options(
        &self,
        options: QueryExecutionOptions,
    ) -> Result<SendableRecordBatchStream> {
        let query = AnyQuery::Query(self.request.clone());
        Ok(SendableRecordBatchStream::from(
            self.parent.clone().query(&query, options).await?,
        ))
    }

    async fn explain_plan(&self, verbose: bool) -> Result<String> {
        let query = AnyQuery::Query(self.request.clone());
        self.parent.explain_plan(&query, verbose).await
    }

    async fn analyze_plan_with_options(&self, options: QueryExecutionOptions) -> Result<String> {
        let query = AnyQuery::Query(self.request.clone());
        self.parent.analyze_plan(&query, options).await
    }
}
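As `from_offsets` and `from_row_ids` show, a take query compiles down to an ordinary SQL filter on the `_rowoffset` or `_rowid` meta columns. A hedged Python sketch of the equivalent filter query:

```python
# Roughly equivalent to table.take_offsets([5, 1, 17]) (per from_offsets above):
rows = (
    table.search()                       # plain scan, no vector query
    .where("_rowoffset IN (5, 1, 17)")
    .to_arrow()
)
# Note: row order within the result is not guaranteed, which is why the
# take tests below sort the returned ids before asserting.
```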
#[cfg(test)]
mod tests {
    use std::{collections::HashSet, sync::Arc};
@@ -1219,9 +1357,10 @@ mod tests {
    use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
    use futures::{StreamExt, TryStreamExt};
    use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector};
    use rand::seq::IndexedRandom;
    use tempfile::tempdir;

    use crate::{connect, database::CreateTableMode, Table};
    use crate::{connect, database::CreateTableMode, index::Index, Table};

    #[tokio::test]
    async fn test_setters_getters() {
@@ -1327,7 +1466,7 @@ mod tests {

        while let Some(batch) = stream.next().await {
            // pre filter should return 10 rows
            assert!(batch.expect("should be Ok").num_rows() == 10);
            assert_eq!(batch.expect("should be Ok").num_rows(), 10);
        }

        let query = table
@@ -1342,7 +1481,7 @@ mod tests {
        // should only have one batch
        while let Some(batch) = stream.next().await {
            // pre filter should return 10 rows
            assert!(batch.expect("should be Ok").num_rows() == 9);
            assert_eq!(batch.expect("should be Ok").num_rows(), 10);
        }
    }

@@ -1802,4 +1941,197 @@ mod tests {
        assert_eq!(0, batch.num_rows());
        assert_eq!(2, batch.num_columns());
    }

    // TODO: Implement a good FTS test data generator in lance_datagen.
    fn fts_test_data(nrows: usize) -> RecordBatch {
        let schema = Arc::new(ArrowSchema::new(vec![
            ArrowField::new("text", DataType::Utf8, false),
            ArrowField::new("id", DataType::Int32, false),
        ]));

        let ids: Int32Array = (1..=nrows as i32).collect();

        // Sample 1 - 3 tokens for each string value
        let tokens = ["a", "b", "c", "d", "e"];
        use rand::{rng, Rng};

        let mut rng = rng();
        let text: StringArray = (0..nrows)
            .map(|_| {
                let num_tokens = rng.random_range(1..=3); // 1 to 3 tokens
                let selected_tokens: Vec<&str> = tokens
                    .choose_multiple(&mut rng, num_tokens)
                    .cloned()
                    .collect();
                Some(selected_tokens.join(" "))
            })
            .collect();

        RecordBatch::try_new(schema, vec![Arc::new(text), Arc::new(ids)]).unwrap()
    }

    async fn run_query_request(table: &dyn BaseTable, query: AnyQuery) -> RecordBatch {
        use lance::io::RecordBatchStream;
        let stream = table.query(&query, Default::default()).await.unwrap();
        let schema = stream.schema();
        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
        arrow::compute::concat_batches(&schema, &batches).unwrap()
    }

    async fn test_pagination(table: &dyn BaseTable, full_query: AnyQuery, page_size: usize) {
        // Get full results
        let full_results = run_query_request(table, full_query.clone()).await;

        // Then use limit & offset to do paginated queries, assert each
        // is the same as a slice of the full results
        let mut offset = 0;
        while offset < full_results.num_rows() {
            let mut paginated_query = full_query.clone();
            let limit = page_size.min(full_results.num_rows() - offset);
            match &mut paginated_query {
                AnyQuery::Query(query)
                | AnyQuery::VectorQuery(VectorQueryRequest { base: query, .. }) => {
                    query.limit = Some(limit);
                    query.offset = Some(offset);
                }
            }
            let paginated_results = run_query_request(table, paginated_query).await;
            let expected_slice = full_results.slice(offset, limit);
            assert_eq!(
                paginated_results, expected_slice,
                "Paginated results do not match expected slice at offset {}, for page size {}",
                offset, page_size
            );
            offset += page_size;
        }
    }

    #[tokio::test]
    async fn test_pagination_with_scan() {
        let db = connect("memory://test").execute().await.unwrap();
        let table = db
            .create_table("test_table", make_non_empty_batches())
            .execute()
            .await
            .unwrap();
        let query = AnyQuery::Query(table.query().into_request());
        test_pagination(table.base_table().as_ref(), query.clone(), 3).await;
        test_pagination(table.base_table().as_ref(), query, 10).await;
    }

    #[tokio::test]
    async fn test_pagination_with_fts() {
        let db = connect("memory://test").execute().await.unwrap();
        let data = fts_test_data(400);
        let schema = data.schema();
        let data = RecordBatchIterator::new(vec![Ok(data)], schema);
        let table = db.create_table("test_table", data).execute().await.unwrap();

        table
            .create_index(&["text"], Index::FTS(Default::default()))
            .execute()
            .await
            .unwrap();
        let query = table
            .query()
            .full_text_search(FullTextSearchQuery::new("test".into()))
            .into_request();
        let query = AnyQuery::Query(query);
        test_pagination(table.base_table().as_ref(), query.clone(), 3).await;
        test_pagination(table.base_table().as_ref(), query, 10).await;
    }

    #[tokio::test]
    async fn test_pagination_with_vector_query() {
        let db = connect("memory://test").execute().await.unwrap();
        let table = db
            .create_table("test_table", make_non_empty_batches())
            .execute()
            .await
            .unwrap();
        let query_vector = vec![0.1_f32, 0.2, 0.3, 0.4];
        let query = table
            .query()
            .nearest_to(query_vector.as_slice())
            .unwrap()
            .limit(50)
            .into_request();
        let query = AnyQuery::VectorQuery(query);
        test_pagination(table.base_table().as_ref(), query.clone(), 3).await;
        test_pagination(table.base_table().as_ref(), query, 10).await;
    }
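The pagination helper above verifies that limit/offset pages stitch back into the full result set. The same pattern from Python, as a rough sketch (`limit` and `offset` are standard query-builder methods in the Python SDK):

```python
page_size = 10
offset = 0
pages = []
while True:
    page = (
        table.search()        # plain scan; the same works after an FTS or vector query
        .limit(page_size)
        .offset(offset)
        .to_arrow()
    )
    if page.num_rows == 0:
        break
    pages.append(page)
    offset += page_size
```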
    #[tokio::test]
    async fn test_take_offsets() {
        let tmp_dir = tempdir().unwrap();
        let table = make_test_table(&tmp_dir).await;

        let results = table
            .take_offsets(vec![5, 1, 17])
            .execute()
            .await
            .unwrap()
            .try_collect::<Vec<_>>()
            .await
            .unwrap();

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].num_rows(), 3);
        assert_eq!(results[0].num_columns(), 2);

        let mut ids = results[0]
            .column_by_name("id")
            .unwrap()
            .as_primitive::<Int32Type>()
            .values()
            .to_vec();
        ids.sort();

        assert_eq!(ids, vec![1, 5, 17]);

        // Select specific columns
        let results = table
            .take_offsets(vec![5, 1, 17])
            .select(Select::Columns(vec!["vector".to_string()]))
            .execute()
            .await
            .unwrap()
            .try_collect::<Vec<_>>()
            .await
            .unwrap();

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].num_rows(), 3);
        assert_eq!(results[0].num_columns(), 1);
    }

    #[tokio::test]
    async fn test_take_row_ids() {
        let tmp_dir = tempdir().unwrap();
        let table = make_test_table(&tmp_dir).await;

        let results = table
            .take_row_ids(vec![5, 1, 17])
            .execute()
            .await
            .unwrap()
            .try_collect::<Vec<_>>()
            .await
            .unwrap();

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].num_rows(), 3);
        assert_eq!(results[0].num_columns(), 2);

        let mut ids = results[0]
            .column_by_name("id")
            .unwrap()
            .as_primitive::<Int32Type>()
            .values()
            .to_vec();

        ids.sort();

        assert_eq!(ids, vec![1, 5, 17]);
    }
}
@@ -25,6 +25,9 @@ pub struct ClientConfig {
    pub user_agent: String,
    // TODO: how to configure request ids?
    pub extra_headers: HashMap<String, String>,
    /// The delimiter to use when constructing object identifiers.
    /// If not the default, it is passed as a query parameter.
    pub id_delimiter: Option<String>,
}

impl Default for ClientConfig {
@@ -34,6 +37,7 @@ impl Default for ClientConfig {
            retry_config: RetryConfig::default(),
            user_agent: concat!("LanceDB-Rust-Client/", env!("CARGO_PKG_VERSION")).into(),
            extra_headers: HashMap::new(),
            id_delimiter: None,
        }
    }
}
@@ -145,6 +149,7 @@ pub struct RestfulLanceDbClient<S: HttpSend = Sender> {
    host: String,
    pub(crate) retry_config: ResolvedRetryConfig,
    pub(crate) sender: S,
    pub(crate) id_delimiter: String,
}

pub trait HttpSend: Clone + Send + Sync + std::fmt::Debug + 'static {
@@ -268,6 +273,7 @@ impl RestfulLanceDbClient<Sender> {
            host,
            retry_config,
            sender: Sender,
            id_delimiter: client_config.id_delimiter.unwrap_or("$".to_string()),
        })
    }
}
@@ -356,12 +362,22 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {

    pub fn get(&self, uri: &str) -> RequestBuilder {
        let full_uri = format!("{}{}", self.host, uri);
        self.client.get(full_uri)
        let builder = self.client.get(full_uri);
        self.add_id_delimiter_query_param(builder)
    }

    pub fn post(&self, uri: &str) -> RequestBuilder {
        let full_uri = format!("{}{}", self.host, uri);
        self.client.post(full_uri)
        let builder = self.client.post(full_uri);
        self.add_id_delimiter_query_param(builder)
    }

    fn add_id_delimiter_query_param(&self, req: RequestBuilder) -> RequestBuilder {
        if self.id_delimiter != "$" {
            req.query(&[("delimiter", self.id_delimiter.clone())])
        } else {
            req
        }
    }

    pub async fn send(&self, req: RequestBuilder) -> Result<(String, Response)> {
@@ -594,6 +610,7 @@ pub mod test_utils {
            sender: MockSender {
                f: Arc::new(wrapper),
            },
            id_delimiter: "$".to_string(),
        }
    }
}

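A minimal usage sketch of the new option, assuming only the ClientConfig shape shown above (the "%" delimiter is a hypothetical value; any non-"$" string causes add_id_delimiter_query_param to attach the query parameter):

    let config = ClientConfig {
        id_delimiter: Some("%".to_string()),
        ..Default::default()
    };
    // With this config, client.get("/v1/table/") targets /v1/table/?delimiter=%.
    // With the default "$" (or id_delimiter: None), no query parameter is added.
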
@@ -14,8 +14,9 @@ use serde::Deserialize;
use tokio::task::spawn_blocking;

use crate::database::{
    CreateTableData, CreateTableMode, CreateTableRequest, Database, DatabaseOptions,
    OpenTableRequest, TableNamesRequest,
    CreateNamespaceRequest, CreateTableData, CreateTableMode, CreateTableRequest, Database,
    DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest, OpenTableRequest,
    TableNamesRequest,
};
use crate::error::Result;
use crate::table::BaseTable;
@@ -245,6 +246,61 @@ impl From<&CreateTableMode> for &'static str {
    }
}

fn build_table_identifier(name: &str, namespace: &[String], delimiter: &str) -> String {
    if !namespace.is_empty() {
        let mut parts = namespace.to_vec();
        parts.push(name.to_string());
        parts.join(delimiter)
    } else {
        name.to_string()
    }
}

fn build_namespace_identifier(namespace: &[String], delimiter: &str) -> String {
    if namespace.is_empty() {
        // According to the namespace spec, use the delimiter to represent the root namespace
        delimiter.to_string()
    } else {
        namespace.join(delimiter)
    }
}

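// Examples of the identifier scheme above, assuming the default "$" delimiter
// (values hypothetical):
//   build_table_identifier("t1", &["ns1".to_string(), "ns2".to_string()], "$") == "ns1$ns2$t1"
//   build_table_identifier("t1", &[], "$") == "t1"
//   build_namespace_identifier(&[], "$") == "$"  // root namespace, per the spec comment above
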
/// Build a secure cache key using length prefixes.
/// This format is completely unambiguous regardless of delimiter or content.
/// Format: [u32_len][namespace1][u32_len][namespace2]...[u32_len][table_name]
/// Returns a hex-encoded string for use as a cache key.
fn build_cache_key(name: &str, namespace: &[String]) -> String {
    let mut key = Vec::new();

    // Add each namespace component with length prefix
    for ns in namespace {
        let bytes = ns.as_bytes();
        key.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
        key.extend_from_slice(bytes);
    }

    // Add table name with length prefix
    let name_bytes = name.as_bytes();
    key.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
    key.extend_from_slice(name_bytes);

    // Convert to hex string for use as a cache key
    key.iter().map(|b| format!("{:02x}", b)).collect()
}

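// Worked example of the encoding above (inputs hypothetical):
//   build_cache_key("t", &["ns1".to_string()])
// appends [3u32 LE] "ns1" [1u32 LE] "t", i.e. the bytes
//   03 00 00 00 6e 73 31 01 00 00 00 74
// which hex-encode to "030000006e73310100000074". Because every component is
// length-prefixed, no delimiter character inside a name can produce the same
// key as a different namespace/table split.
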
#[async_trait]
impl<S: HttpSend> Database for RemoteDatabase<S> {
    async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>> {
        let mut req = self.client.get("/v1/table/");
        let mut req = if !request.namespace.is_empty() {
            let namespace_id =
                build_namespace_identifier(&request.namespace, &self.client.id_delimiter);
            self.client
                .get(&format!("/v1/namespace/{}/table/list", namespace_id))
        } else {
            // TODO: use new API for all listing operations once stable
            self.client.get("/v1/table/")
        };

        if let Some(limit) = request.limit {
            req = req.query(&[("limit", limit)]);
        }
@@ -264,12 +316,17 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
            .err_to_http(request_id)?
            .tables;
        for table in &tables {
            let table_identifier =
                build_table_identifier(table, &request.namespace, &self.client.id_delimiter);
            let cache_key = build_cache_key(table, &request.namespace);
            let remote_table = Arc::new(RemoteTable::new(
                self.client.clone(),
                table.clone(),
                request.namespace.clone(),
                table_identifier.clone(),
                version.clone(),
            ));
            self.table_cache.insert(table.clone(), remote_table).await;
            self.table_cache.insert(cache_key, remote_table).await;
        }
        Ok(tables)
    }
@@ -295,9 +352,11 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
        .await
        .unwrap()?;

        let identifier =
            build_table_identifier(&request.name, &request.namespace, &self.client.id_delimiter);
        let req = self
            .client
            .post(&format!("/v1/table/{}/create/", request.name))
            .post(&format!("/v1/table/{}/create/", identifier))
            .query(&[("mode", Into::<&str>::into(&request.mode))])
            .body(data_buffer)
            .header(CONTENT_TYPE, ARROW_STREAM_CONTENT_TYPE);
@@ -314,6 +373,7 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
            CreateTableMode::ExistOk(callback) => {
                let req = OpenTableRequest {
                    name: request.name.clone(),
                    namespace: request.namespace.clone(),
                    index_cache_size: None,
                    lance_read_params: None,
                };
@@ -342,70 +402,160 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
        }
        let rsp = self.client.check_response(&request_id, rsp).await?;
        let version = parse_server_version(&request_id, &rsp)?;
        let table_identifier =
            build_table_identifier(&request.name, &request.namespace, &self.client.id_delimiter);
        let cache_key = build_cache_key(&request.name, &request.namespace);
        let table = Arc::new(RemoteTable::new(
            self.client.clone(),
            request.name.clone(),
            request.namespace.clone(),
            table_identifier,
            version,
        ));
        self.table_cache
            .insert(request.name.clone(), table.clone())
            .await;
        self.table_cache.insert(cache_key, table.clone()).await;

        Ok(table)
    }

    async fn open_table(&self, request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
        let identifier =
            build_table_identifier(&request.name, &request.namespace, &self.client.id_delimiter);
        let cache_key = build_cache_key(&request.name, &request.namespace);

        // We describe the table to confirm it exists before moving on.
        if let Some(table) = self.table_cache.get(&request.name).await {
        if let Some(table) = self.table_cache.get(&cache_key).await {
            Ok(table.clone())
        } else {
            let req = self
                .client
                .post(&format!("/v1/table/{}/describe/", request.name));
                .post(&format!("/v1/table/{}/describe/", identifier));
            let (request_id, rsp) = self.client.send_with_retry(req, None, true).await?;
            if rsp.status() == StatusCode::NOT_FOUND {
                return Err(crate::Error::TableNotFound { name: request.name });
                return Err(crate::Error::TableNotFound {
                    name: identifier.clone(),
                });
            }
            let rsp = self.client.check_response(&request_id, rsp).await?;
            let version = parse_server_version(&request_id, &rsp)?;
            let table_identifier = build_table_identifier(
                &request.name,
                &request.namespace,
                &self.client.id_delimiter,
            );
            let table = Arc::new(RemoteTable::new(
                self.client.clone(),
                request.name.clone(),
                request.namespace.clone(),
                table_identifier,
                version,
            ));
            self.table_cache.insert(request.name, table.clone()).await;
            let cache_key = build_cache_key(&request.name, &request.namespace);
            self.table_cache.insert(cache_key, table.clone()).await;
            Ok(table)
        }
    }

    async fn rename_table(&self, current_name: &str, new_name: &str) -> Result<()> {
    async fn rename_table(
        &self,
        current_name: &str,
        new_name: &str,
        cur_namespace: &[String],
        new_namespace: &[String],
    ) -> Result<()> {
        let current_identifier =
            build_table_identifier(current_name, cur_namespace, &self.client.id_delimiter);
        let current_cache_key = build_cache_key(current_name, cur_namespace);
        let new_cache_key = build_cache_key(new_name, new_namespace);

        let mut body = serde_json::json!({ "new_table_name": new_name });
        if !new_namespace.is_empty() {
            body["new_namespace"] = serde_json::Value::Array(
                new_namespace
                    .iter()
                    .map(|s| serde_json::Value::String(s.clone()))
                    .collect(),
            );
        }
        let req = self
            .client
            .post(&format!("/v1/table/{}/rename/", current_name));
        let req = req.json(&serde_json::json!({ "new_table_name": new_name }));
            .post(&format!("/v1/table/{}/rename/", current_identifier))
            .json(&body);
        let (request_id, resp) = self.client.send(req).await?;
        self.client.check_response(&request_id, resp).await?;
        let table = self.table_cache.remove(current_name).await;
        let table = self.table_cache.remove(&current_cache_key).await;
        if let Some(table) = table {
            self.table_cache.insert(new_name.into(), table).await;
            self.table_cache.insert(new_cache_key, table).await;
        }
        Ok(())
    }

    async fn drop_table(&self, name: &str) -> Result<()> {
        let req = self.client.post(&format!("/v1/table/{}/drop/", name));
    async fn drop_table(&self, name: &str, namespace: &[String]) -> Result<()> {
        let identifier = build_table_identifier(name, namespace, &self.client.id_delimiter);
        let cache_key = build_cache_key(name, namespace);
        let req = self.client.post(&format!("/v1/table/{}/drop/", identifier));
        let (request_id, resp) = self.client.send(req).await?;
        self.client.check_response(&request_id, resp).await?;
        self.table_cache.remove(name).await;
        self.table_cache.remove(&cache_key).await;
        Ok(())
    }

    async fn drop_all_tables(&self) -> Result<()> {
    async fn drop_all_tables(&self, namespace: &[String]) -> Result<()> {
        // TODO: Implement namespace-aware drop_all_tables
        let _namespace = namespace; // Suppress unused warning for now
        Err(crate::Error::NotSupported {
            message: "Dropping databases is not supported in the remote API".to_string(),
            message: "Dropping all tables is not currently supported in the remote API".to_string(),
        })
    }

    async fn list_namespaces(&self, request: ListNamespacesRequest) -> Result<Vec<String>> {
        let namespace_id =
            build_namespace_identifier(request.namespace.as_slice(), &self.client.id_delimiter);
        let mut req = self
            .client
            .get(&format!("/v1/namespace/{}/list", namespace_id));
        if let Some(limit) = request.limit {
            req = req.query(&[("limit", limit)]);
        }
        if let Some(page_token) = request.page_token {
            req = req.query(&[("page_token", page_token)]);
        }

        let (request_id, resp) = self.client.send(req).await?;
        let resp = self.client.check_response(&request_id, resp).await?;

        #[derive(Deserialize)]
        struct ListNamespacesResponse {
            namespaces: Vec<String>,
        }

        let parsed: ListNamespacesResponse = resp.json().await.map_err(|e| Error::Runtime {
            message: format!("Failed to parse namespace response: {}", e),
        })?;
        Ok(parsed.namespaces)
    }

    async fn create_namespace(&self, request: CreateNamespaceRequest) -> Result<()> {
        let namespace_id =
            build_namespace_identifier(request.namespace.as_slice(), &self.client.id_delimiter);
        let req = self
            .client
            .post(&format!("/v1/namespace/{}/create", namespace_id));
        let (request_id, resp) = self.client.send(req).await?;
        self.client.check_response(&request_id, resp).await?;
        Ok(())
    }

    async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result<()> {
        let namespace_id =
            build_namespace_identifier(request.namespace.as_slice(), &self.client.id_delimiter);
        let req = self
            .client
            .post(&format!("/v1/namespace/{}/drop", namespace_id));
        let (request_id, resp) = self.client.send(req).await?;
        self.client.check_response(&request_id, resp).await?;
        Ok(())
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
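For orientation, a sketch of how the namespace-aware surface above is exercised (the builder method names mirror the tests later in this diff; `conn` is assumed to be an already-established remote connection):

    let names = conn
        .table_names()
        .namespace(vec!["ns1".to_string(), "ns2".to_string()])
        .execute()
        .await?; // GET /v1/namespace/ns1$ns2/table/list
    conn.drop_table("table1", &["ns1".to_string(), "ns2".to_string()])
        .await?; // POST /v1/table/ns1$ns2$table1/drop/
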
@@ -436,6 +586,7 @@ impl From<StorageOptions> for RemoteOptions {

#[cfg(test)]
mod tests {
    use super::build_cache_key;
    use std::sync::{Arc, OnceLock};

    use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
@@ -448,6 +599,38 @@ mod tests {
        Connection, Error,
    };

    #[test]
    fn test_cache_key_security() {
        // Test that cache keys are unique regardless of delimiter manipulation

        // Case 1: Different delimiters should not affect cache key
        let key1 = build_cache_key("table1", &["ns1".to_string(), "ns2".to_string()]);
        let key2 = build_cache_key("table1", &["ns1$ns2".to_string()]);
        assert_ne!(
            key1, key2,
            "Cache keys should differ for different namespace structures"
        );

        // Case 2: Table name containing delimiter should not cause collision
        let key3 = build_cache_key("ns2$table1", &["ns1".to_string()]);
        assert_ne!(
            key1, key3,
            "Cache key should be different when table name contains delimiter"
        );

        // Case 3: Empty namespace vs namespace with empty string
        let key4 = build_cache_key("table1", &[]);
        let key5 = build_cache_key("table1", &["".to_string()]);
        assert_ne!(
            key4, key5,
            "Empty namespace should differ from namespace with empty string"
        );

        // Case 4: Verify same inputs produce same key (consistency)
        let key6 = build_cache_key("table1", &["ns1".to_string(), "ns2".to_string()]);
        assert_eq!(key1, key6, "Same inputs should produce same cache key");
    }

    #[tokio::test]
    async fn test_retries() {
        // We'll record the request_id here, to check it matches the one in the error.
@@ -711,7 +894,7 @@ mod tests {

            http::Response::builder().status(200).body("").unwrap()
        });
        conn.drop_table("table1").await.unwrap();
        conn.drop_table("table1", &[]).await.unwrap();
        // NOTE: the API will return 200 even if the table does not exist. So we shouldn't expect 404.
    }

@@ -731,7 +914,9 @@ mod tests {

            http::Response::builder().status(200).body("").unwrap()
        });
        conn.rename_table("table1", "table2").await.unwrap();
        conn.rename_table("table1", "table2", &[], &[])
            .await
            .unwrap();
    }

    #[tokio::test]
@@ -745,4 +930,186 @@ mod tests {
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn test_table_names_with_root_namespace() {
        // When namespace is empty (root namespace), should use /v1/table/ for backwards compatibility
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::GET);
            assert_eq!(request.url().path(), "/v1/table/");
            assert_eq!(request.url().query(), None);

            http::Response::builder()
                .status(200)
                .body(r#"{"tables": ["table1", "table2"]}"#)
                .unwrap()
        });
        let names = conn
            .table_names()
            .namespace(vec![])
            .execute()
            .await
            .unwrap();
        assert_eq!(names, vec!["table1", "table2"]);
    }

    #[tokio::test]
    async fn test_table_names_with_namespace() {
        // When namespace is non-empty, should use /v1/namespace/{id}/table/list
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::GET);
            assert_eq!(request.url().path(), "/v1/namespace/test/table/list");
            assert_eq!(request.url().query(), None);

            http::Response::builder()
                .status(200)
                .body(r#"{"tables": ["table1", "table2"]}"#)
                .unwrap()
        });
        let names = conn
            .table_names()
            .namespace(vec!["test".to_string()])
            .execute()
            .await
            .unwrap();
        assert_eq!(names, vec!["table1", "table2"]);
    }

    #[tokio::test]
    async fn test_table_names_with_nested_namespace() {
        // When namespace is vec!["ns1", "ns2"], should use /v1/namespace/ns1$ns2/table/list
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::GET);
            assert_eq!(request.url().path(), "/v1/namespace/ns1$ns2/table/list");
            assert_eq!(request.url().query(), None);

            http::Response::builder()
                .status(200)
                .body(r#"{"tables": ["ns1$ns2$table1", "ns1$ns2$table2"]}"#)
                .unwrap()
        });
        let names = conn
            .table_names()
            .namespace(vec!["ns1".to_string(), "ns2".to_string()])
            .execute()
            .await
            .unwrap();
        assert_eq!(names, vec!["ns1$ns2$table1", "ns1$ns2$table2"]);
    }

    #[tokio::test]
    async fn test_open_table_with_namespace() {
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::POST);
            assert_eq!(request.url().path(), "/v1/table/ns1$ns2$table1/describe/");
            assert_eq!(request.url().query(), None);

            http::Response::builder()
                .status(200)
                .body(r#"{"table": "table1"}"#)
                .unwrap()
        });
        let table = conn
            .open_table("table1")
            .namespace(vec!["ns1".to_string(), "ns2".to_string()])
            .execute()
            .await
            .unwrap();
        assert_eq!(table.name(), "table1");
    }

    #[tokio::test]
    async fn test_create_table_with_namespace() {
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::POST);
            assert_eq!(request.url().path(), "/v1/table/ns1$table1/create/");
            assert_eq!(
                request
                    .headers()
                    .get(reqwest::header::CONTENT_TYPE)
                    .unwrap(),
                ARROW_STREAM_CONTENT_TYPE.as_bytes()
            );

            http::Response::builder().status(200).body("").unwrap()
        });
        let data = RecordBatch::try_new(
            Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
        )
        .unwrap();
        let reader = RecordBatchIterator::new([Ok(data.clone())], data.schema());
        let table = conn
            .create_table("table1", reader)
            .namespace(vec!["ns1".to_string()])
            .execute()
            .await
            .unwrap();
        assert_eq!(table.name(), "table1");
    }

    #[tokio::test]
    async fn test_drop_table_with_namespace() {
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::POST);
            assert_eq!(request.url().path(), "/v1/table/ns1$ns2$table1/drop/");
            assert_eq!(request.url().query(), None);
            assert!(request.body().is_none());

            http::Response::builder().status(200).body("").unwrap()
        });
        conn.drop_table("table1", &["ns1".to_string(), "ns2".to_string()])
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn test_rename_table_with_namespace() {
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::POST);
            assert_eq!(request.url().path(), "/v1/table/ns1$table1/rename/");
            assert_eq!(
                request.headers().get("Content-Type").unwrap(),
                JSON_CONTENT_TYPE
            );

            let body = request.body().unwrap().as_bytes().unwrap();
            let body: serde_json::Value = serde_json::from_slice(body).unwrap();
            assert_eq!(body["new_table_name"], "table2");
            assert_eq!(body["new_namespace"], serde_json::json!(["ns2"]));

            http::Response::builder().status(200).body("").unwrap()
        });
        conn.rename_table(
            "table1",
            "table2",
            &["ns1".to_string()],
            &["ns2".to_string()],
        )
        .await
        .unwrap();
    }

    #[tokio::test]
    async fn test_create_empty_table_with_namespace() {
        let conn = Connection::new_with_handler(|request| {
            assert_eq!(request.method(), &reqwest::Method::POST);
            assert_eq!(request.url().path(), "/v1/table/prod$data$metrics/create/");
            assert_eq!(
                request
                    .headers()
                    .get(reqwest::header::CONTENT_TYPE)
                    .unwrap(),
                ARROW_STREAM_CONTENT_TYPE.as_bytes()
            );

            http::Response::builder().status(200).body("").unwrap()
        });
        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
        conn.create_empty_table("metrics", schema)
            .namespace(vec!["prod".to_string(), "data".to_string()])
            .execute()
            .await
            .unwrap();
    }
}

@@ -70,7 +70,7 @@ impl<S: HttpSend + 'static> Tags for RemoteTags<'_, S> {
        let request = self
            .inner
            .client
            .post(&format!("/v1/table/{}/tags/list/", self.inner.name));
            .post(&format!("/v1/table/{}/tags/list/", self.inner.identifier));
        let (request_id, response) = self.inner.send(request, true).await?;
        let response = self
            .inner
@@ -104,7 +104,10 @@ impl<S: HttpSend + 'static> Tags for RemoteTags<'_, S> {
        let request = self
            .inner
            .client
            .post(&format!("/v1/table/{}/tags/version/", self.inner.name))
            .post(&format!(
                "/v1/table/{}/tags/version/",
                self.inner.identifier
            ))
            .json(&serde_json::json!({ "tag": tag }));

        let (request_id, response) = self.inner.send(request, true).await?;
@@ -146,7 +149,7 @@ impl<S: HttpSend + 'static> Tags for RemoteTags<'_, S> {
        let request = self
            .inner
            .client
            .post(&format!("/v1/table/{}/tags/create/", self.inner.name))
            .post(&format!("/v1/table/{}/tags/create/", self.inner.identifier))
            .json(&serde_json::json!({
                "tag": tag,
                "version": version
@@ -163,7 +166,7 @@ impl<S: HttpSend + 'static> Tags for RemoteTags<'_, S> {
        let request = self
            .inner
            .client
            .post(&format!("/v1/table/{}/tags/delete/", self.inner.name))
            .post(&format!("/v1/table/{}/tags/delete/", self.inner.identifier))
            .json(&serde_json::json!({ "tag": tag }));

        let (request_id, response) = self.inner.send(request, true).await?;
@@ -177,7 +180,7 @@ impl<S: HttpSend + 'static> Tags for RemoteTags<'_, S> {
        let request = self
            .inner
            .client
            .post(&format!("/v1/table/{}/tags/update/", self.inner.name))
            .post(&format!("/v1/table/{}/tags/update/", self.inner.identifier))
            .json(&serde_json::json!({
                "tag": tag,
                "version": version
@@ -196,6 +199,8 @@ pub struct RemoteTable<S: HttpSend = Sender> {
    #[allow(dead_code)]
    client: RestfulLanceDbClient<S>,
    name: String,
    namespace: Vec<String>,
    identifier: String,
    server_version: ServerVersion,

    version: RwLock<Option<u64>>,
@@ -205,11 +210,15 @@ impl<S: HttpSend> RemoteTable<S> {
    pub fn new(
        client: RestfulLanceDbClient<S>,
        name: String,
        namespace: Vec<String>,
        identifier: String,
        server_version: ServerVersion,
    ) -> Self {
        Self {
            client,
            name,
            namespace,
            identifier,
            server_version,
            version: RwLock::new(None),
        }
@@ -223,7 +232,7 @@ impl<S: HttpSend> RemoteTable<S> {
    async fn describe_version(&self, version: Option<u64>) -> Result<TableDescription> {
        let mut request = self
            .client
            .post(&format!("/v1/table/{}/describe/", self.name));
            .post(&format!("/v1/table/{}/describe/", self.identifier));

        let body = serde_json::json!({ "version": version });
        request = request.json(&body);
@@ -334,7 +343,7 @@ impl<S: HttpSend> RemoteTable<S> {
    ) -> Result<reqwest::Response> {
        if response.status() == StatusCode::NOT_FOUND {
            return Err(Error::TableNotFound {
                name: self.name.clone(),
                name: self.identifier.clone(),
            });
        }

@@ -548,7 +557,9 @@ impl<S: HttpSend> RemoteTable<S> {
        query: &AnyQuery,
        options: &QueryExecutionOptions,
    ) -> Result<Vec<Pin<Box<dyn RecordBatchStream + Send>>>> {
        let mut request = self.client.post(&format!("/v1/table/{}/query/", self.name));
        let mut request = self
            .client
            .post(&format!("/v1/table/{}/query/", self.identifier));

        if let Some(timeout) = options.timeout {
            // Also send to server, so it can abort the query if it takes too long.
@@ -615,7 +626,7 @@ struct TableDescription {

impl<S: HttpSend> std::fmt::Display for RemoteTable<S> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "RemoteTable({})", self.name)
        write!(f, "RemoteTable({})", self.identifier)
    }
}

@@ -634,7 +645,9 @@ mod test_utils {
        let client = client_with_handler(handler);
        Self {
            client,
            name,
            name: name.clone(),
            namespace: vec![],
            identifier: name,
            server_version: version.map(ServerVersion).unwrap_or_default(),
            version: RwLock::new(None),
        }
@@ -650,6 +663,14 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    fn name(&self) -> &str {
        &self.name
    }

    fn namespace(&self) -> &[String] {
        &self.namespace
    }

    fn id(&self) -> &str {
        &self.identifier
    }
    async fn version(&self) -> Result<u64> {
        self.describe().await.map(|desc| desc.version)
    }
@@ -678,7 +699,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    async fn restore(&self) -> Result<()> {
        let mut request = self
            .client
            .post(&format!("/v1/table/{}/restore/", self.name));
            .post(&format!("/v1/table/{}/restore/", self.identifier));
        let version = self.current_version().await;
        let body = serde_json::json!({ "version": version });
        request = request.json(&body);
@@ -692,7 +713,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    async fn list_versions(&self) -> Result<Vec<Version>> {
        let request = self
            .client
            .post(&format!("/v1/table/{}/version/list/", self.name));
            .post(&format!("/v1/table/{}/version/list/", self.identifier));
        let (request_id, response) = self.send(request, true).await?;
        let response = self.check_table_response(&request_id, response).await?;

@@ -723,7 +744,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    async fn count_rows(&self, filter: Option<Filter>) -> Result<usize> {
        let mut request = self
            .client
            .post(&format!("/v1/table/{}/count_rows/", self.name));
            .post(&format!("/v1/table/{}/count_rows/", self.identifier));

        let version = self.current_version().await;

@@ -759,7 +780,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        self.check_mutable().await?;
        let mut request = self
            .client
            .post(&format!("/v1/table/{}/insert/", self.name))
            .post(&format!("/v1/table/{}/insert/", self.identifier))
            .header(CONTENT_TYPE, ARROW_STREAM_CONTENT_TYPE);

        match add.mode {
@@ -831,7 +852,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    async fn explain_plan(&self, query: &AnyQuery, verbose: bool) -> Result<String> {
        let base_request = self
            .client
            .post(&format!("/v1/table/{}/explain_plan/", self.name));
            .post(&format!("/v1/table/{}/explain_plan/", self.identifier));

        let query_bodies = self.prepare_query_bodies(query).await?;
        let requests: Vec<reqwest::RequestBuilder> = query_bodies
@@ -880,7 +901,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    ) -> Result<String> {
        let request = self
            .client
            .post(&format!("/v1/table/{}/analyze_plan/", self.name));
            .post(&format!("/v1/table/{}/analyze_plan/", self.identifier));

        let query_bodies = self.prepare_query_bodies(query).await?;
        let requests: Vec<reqwest::RequestBuilder> = query_bodies
@@ -919,7 +940,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        self.check_mutable().await?;
        let request = self
            .client
            .post(&format!("/v1/table/{}/update/", self.name));
            .post(&format!("/v1/table/{}/update/", self.identifier));

        let mut updates = Vec::new();
        for (column, expression) in update.columns {
@@ -958,7 +979,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        let body = serde_json::json!({ "predicate": predicate });
        let request = self
            .client
            .post(&format!("/v1/table/{}/delete/", self.name))
            .post(&format!("/v1/table/{}/delete/", self.identifier))
            .json(&body);
        let (request_id, response) = self.send(request, true).await?;
        let response = self.check_table_response(&request_id, response).await?;
@@ -980,7 +1001,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        self.check_mutable().await?;
        let request = self
            .client
            .post(&format!("/v1/table/{}/create_index/", self.name));
            .post(&format!("/v1/table/{}/create_index/", self.identifier));

        let column = match index.columns.len() {
            0 => {
@@ -999,6 +1020,18 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
            "column": column
        });

        // Add name parameter if provided (for backwards compatibility, only include if Some)
        if let Some(ref name) = index.name {
            body["name"] = serde_json::Value::String(name.clone());
        }

        // Warn if train=false is specified, since it's not meaningful
        if !index.train {
            log::warn!(
                "train=false has no effect on remote tables. The index will be created empty and automatically populated in the background."
            );
        }

        match index.index {
            // TODO: Should we pass the actual index parameters? SaaS does not
            // yet support them.
@@ -1084,8 +1117,8 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        self.check_table_response(&request_id, response).await?;

        if let Some(wait_timeout) = index.wait_timeout {
            let name = format!("{}_idx", column);
            self.wait_for_index(&[&name], wait_timeout).await?;
            let index_name = index.name.unwrap_or_else(|| format!("{}_idx", column));
            self.wait_for_index(&[&index_name], wait_timeout).await?;
        }

        Ok(())
@@ -1109,7 +1142,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        let query = MergeInsertRequest::try_from(params)?;
        let mut request = self
            .client
            .post(&format!("/v1/table/{}/merge_insert/", self.name))
            .post(&format!("/v1/table/{}/merge_insert/", self.identifier))
            .query(&query)
            .header(CONTENT_TYPE, ARROW_STREAM_CONTENT_TYPE);

@@ -1181,7 +1214,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        let body = serde_json::json!({ "new_columns": body });
        let request = self
            .client
            .post(&format!("/v1/table/{}/add_columns/", self.name))
            .post(&format!("/v1/table/{}/add_columns/", self.identifier))
            .json(&body);
        let (request_id, response) = self.send(request, true).await?;
        let response = self.check_table_response(&request_id, response).await?;
@@ -1234,7 +1267,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        let body = serde_json::json!({ "alterations": body });
        let request = self
            .client
            .post(&format!("/v1/table/{}/alter_columns/", self.name))
            .post(&format!("/v1/table/{}/alter_columns/", self.identifier))
            .json(&body);
        let (request_id, response) = self.send(request, true).await?;
        let response = self.check_table_response(&request_id, response).await?;
@@ -1259,7 +1292,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        let body = serde_json::json!({ "columns": columns });
        let request = self
            .client
            .post(&format!("/v1/table/{}/drop_columns/", self.name))
            .post(&format!("/v1/table/{}/drop_columns/", self.identifier))
            .json(&body);
        let (request_id, response) = self.send(request, true).await?;
        let response = self.check_table_response(&request_id, response).await?;
@@ -1283,7 +1316,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        // Make request to list the indices
        let mut request = self
            .client
            .post(&format!("/v1/table/{}/index/list/", self.name));
            .post(&format!("/v1/table/{}/index/list/", self.identifier));
        let version = self.current_version().await;
        let body = serde_json::json!({ "version": version });
        request = request.json(&body);
@@ -1339,7 +1372,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>> {
        let mut request = self.client.post(&format!(
            "/v1/table/{}/index/{}/stats/",
            self.name, index_name
            self.identifier, index_name
        ));
        let version = self.current_version().await;
        let body = serde_json::json!({ "version": version });
@@ -1367,7 +1400,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    async fn drop_index(&self, index_name: &str) -> Result<()> {
        let request = self.client.post(&format!(
            "/v1/table/{}/index/{}/drop/",
            self.name, index_name
            self.identifier, index_name
        ));
        let (request_id, response) = self.send(request, true).await?;
        if response.status() == StatusCode::NOT_FOUND {
@@ -1395,7 +1428,9 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
    }

    async fn stats(&self) -> Result<TableStatistics> {
        let request = self.client.post(&format!("/v1/table/{}/stats/", self.name));
        let request = self
            .client
            .post(&format!("/v1/table/{}/stats/", self.identifier));
        let (request_id, response) = self.send(request, true).await?;
        let response = self.check_table_response(&request_id, response).await?;
        let body = response.text().await.err_to_http(request_id.clone())?;
@@ -3070,4 +3105,174 @@ mod tests {
        });
        table
    }

    #[tokio::test]
    async fn test_table_with_namespace_identifier() {
        // Test that a table created with namespace uses the correct identifier in API calls
        let table = Table::new_with_handler("ns1$ns2$table1", |request| {
            assert_eq!(request.method(), "POST");
            // All API calls should use the full identifier in the path
            assert_eq!(request.url().path(), "/v1/table/ns1$ns2$table1/describe/");

            http::Response::builder()
                .status(200)
                .body(r#"{"version": 1, "schema": { "fields": [] }}"#)
                .unwrap()
        });

        // The name() method should return just the base name, not the full identifier
        assert_eq!(table.name(), "ns1$ns2$table1");

        // API operations should work correctly
        let version = table.version().await.unwrap();
        assert_eq!(version, 1);
    }

    #[tokio::test]
    async fn test_query_with_namespace() {
        let table = Table::new_with_handler("analytics$events", |request| {
            match request.url().path() {
                "/v1/table/analytics$events/query/" => {
                    assert_eq!(request.method(), "POST");

                    // Return empty arrow stream
                    let data = RecordBatch::try_new(
                        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])),
                        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
                    )
                    .unwrap();
                    let body = write_ipc_file(&data);

                    http::Response::builder()
                        .status(200)
                        .header("Content-Type", ARROW_FILE_CONTENT_TYPE)
                        .body(body)
                        .unwrap()
                }
                _ => {
                    panic!("Unexpected path: {}", request.url().path());
                }
            }
        });

        let results = table.query().execute().await.unwrap();
        let batches = results.try_collect::<Vec<_>>().await.unwrap();
        assert_eq!(batches.len(), 1);
        assert_eq!(batches[0].num_rows(), 3);
    }

    #[tokio::test]
    async fn test_add_data_with_namespace() {
        let data = RecordBatch::try_new(
            Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
        )
        .unwrap();

        let (sender, receiver) = std::sync::mpsc::channel();
        let table = Table::new_with_handler("prod$metrics", move |mut request| {
            if request.url().path() == "/v1/table/prod$metrics/insert/" {
                assert_eq!(request.method(), "POST");
                assert_eq!(
                    request.headers().get("Content-Type").unwrap(),
                    ARROW_STREAM_CONTENT_TYPE
                );
                let mut body_out = reqwest::Body::from(Vec::new());
                std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
                sender.send(body_out).unwrap();
                http::Response::builder()
                    .status(200)
                    .body(r#"{"version": 2}"#)
                    .unwrap()
            } else {
                panic!("Unexpected request path: {}", request.url().path());
            }
        });

        let result = table
            .add(RecordBatchIterator::new([Ok(data.clone())], data.schema()))
            .execute()
            .await
            .unwrap();

        assert_eq!(result.version, 2);

        let body = receiver.recv().unwrap();
        let body = collect_body(body).await;
        let expected_body = write_ipc_stream(&data);
        assert_eq!(&body, &expected_body);
    }

    #[tokio::test]
    async fn test_create_index_with_namespace() {
        let table = Table::new_with_handler("dev$users", |request| {
            match request.url().path() {
                "/v1/table/dev$users/create_index/" => {
                    assert_eq!(request.method(), "POST");
                    assert_eq!(
                        request.headers().get("Content-Type").unwrap(),
                        JSON_CONTENT_TYPE
                    );

                    // Verify the request body contains the column name
                    if let Some(body) = request.body().unwrap().as_bytes() {
                        let body = std::str::from_utf8(body).unwrap();
                        let value: serde_json::Value = serde_json::from_str(body).unwrap();
                        assert_eq!(value["column"], "embedding");
                        assert_eq!(value["index_type"], "IVF_PQ");
                    }

                    http::Response::builder().status(200).body("").unwrap()
                }
                "/v1/table/dev$users/describe/" => {
                    // Needed for schema check in Auto index type
                    http::Response::builder()
                        .status(200)
                        .body(r#"{"version": 1, "schema": {"fields": [{"name": "embedding", "type": {"type": "list", "item": {"type": "float32"}}, "nullable": false}]}}"#)
                        .unwrap()
                }
                _ => {
                    panic!("Unexpected path: {}", request.url().path());
                }
            }
        });

        table
            .create_index(&["embedding"], Index::IvfPq(IvfPqIndexBuilder::default()))
            .execute()
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn test_drop_columns_with_namespace() {
        let table = Table::new_with_handler("test$schema_ops", |request| {
            assert_eq!(request.method(), "POST");
            assert_eq!(
                request.url().path(),
                "/v1/table/test$schema_ops/drop_columns/"
            );
            assert_eq!(
                request.headers().get("Content-Type").unwrap(),
                JSON_CONTENT_TYPE
            );

            if let Some(body) = request.body().unwrap().as_bytes() {
                let body = std::str::from_utf8(body).unwrap();
                let value: serde_json::Value = serde_json::from_str(body).unwrap();
                let columns = value["columns"].as_array().unwrap();
                assert_eq!(columns.len(), 2);
                assert_eq!(columns[0], "old_col1");
                assert_eq!(columns[1], "old_col2");
            }

            http::Response::builder()
                .status(200)
                .body(r#"{"version": 5}"#)
                .unwrap()
        });

        let result = table.drop_columns(&["old_col1", "old_col2"]).await.unwrap();
        assert_eq!(result.version, 5);
    }
}

@@ -28,9 +28,11 @@ use lance::dataset::{
};
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
use lance::index::vector::utils::infer_vector_dim;
use lance::index::vector::VectorIndexParams;
use lance::io::WrappingObjectStore;
use lance_datafusion::exec::{analyze_plan as lance_analyze_plan, execute_plan};
use lance_datafusion::utils::StreamingWriteSource;
use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams};
use lance_index::vector::hnsw::builder::HnswBuildParams;
use lance_index::vector::ivf::IvfBuildParams;
use lance_index::vector::pq::PQBuildParams;
@@ -50,11 +52,7 @@ use crate::arrow::IntoArrow;
use crate::connection::NoData;
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
use crate::error::{Error, Result};
use crate::index::scalar::FtsIndexBuilder;
use crate::index::vector::{
    suggested_num_partitions_for_hnsw, IvfFlatIndexBuilder, IvfHnswPqIndexBuilder,
    IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
};
use crate::index::vector::{suggested_num_partitions_for_hnsw, VectorIndex};
use crate::index::IndexStatistics;
use crate::index::{
    vector::{suggested_num_partitions, suggested_num_sub_vectors},
@@ -62,8 +60,8 @@ use crate::index::{
};
use crate::index::{IndexConfig, IndexStatisticsImpl};
use crate::query::{
    IntoQueryVector, Query, QueryExecutionOptions, QueryFilter, QueryRequest, Select, VectorQuery,
    VectorQueryRequest, DEFAULT_TOP_K,
    IntoQueryVector, Query, QueryExecutionOptions, QueryFilter, QueryRequest, Select, TakeQuery,
    VectorQuery, VectorQueryRequest, DEFAULT_TOP_K,
};
use crate::utils::{
    default_vector_column, supported_bitmap_data_type, supported_btree_data_type,
@@ -401,6 +399,7 @@ pub enum Filter {
}

/// A query that can be used to search a LanceDB table
#[derive(Debug, Clone)]
pub enum AnyQuery {
    Query(QueryRequest),
    VectorQuery(VectorQueryRequest),
@@ -510,6 +509,10 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
    fn as_any(&self) -> &dyn std::any::Any;
    /// Get the name of the table.
    fn name(&self) -> &str;
    /// Get the namespace of the table.
    fn namespace(&self) -> &[String];
    /// Get the id of the table
    fn id(&self) -> &str;
    /// Get the arrow [Schema] of the table.
    async fn schema(&self) -> Result<SchemaRef>;
    /// Count the number of rows in this table.
@@ -1078,6 +1081,54 @@ impl Table {
        Query::new(self.inner.clone())
    }

    /// Extract rows from the dataset using dataset offsets.
    ///
    /// Dataset offsets are 0-indexed and relative to the current version of the table.
    /// They are not stable. A row with an offset of N may have a different offset in a
    /// different version of the table (e.g. if an earlier row is deleted).
    ///
    /// Offsets are useful for sampling as the set of all valid offsets is easily
    /// known in advance to be [0, len(table)).
    ///
    /// No guarantees are made regarding the order in which results are returned. If you
    /// desire an output order that matches the order of the given offsets, you will need
    /// to add the row offset column to the output and align it yourself.
    ///
    /// Parameters
    /// ----------
    /// offsets: list[int]
    ///     The offsets to take.
    ///
    /// Returns
    /// -------
    /// pa.RecordBatch
    ///     A record batch containing the rows at the given offsets.
    pub fn take_offsets(&self, offsets: Vec<u64>) -> TakeQuery {
        TakeQuery::from_offsets(self.inner.clone(), offsets)
    }

    /// Extract rows from the dataset using row ids.
    ///
    /// Row ids are not stable and are relative to the current version of the table.
    /// They can change due to compaction and updates.
    ///
    /// Even so, row ids are more stable than offsets and can be useful in some situations.
    ///
    /// There is an ongoing effort to make row ids stable which is tracked at
    /// https://github.com/lancedb/lancedb/issues/1120
    ///
    /// No guarantees are made regarding the order in which results are returned. If you
    /// desire an output order that matches the order of the given ids, you will need
    /// to add the row id column to the output and align it yourself.
    ///
    /// Parameters
    /// ----------
    /// row_ids: list[int]
    ///     The row ids to take.
    ///
    pub fn take_row_ids(&self, row_ids: Vec<u64>) -> TakeQuery {
        TakeQuery::from_row_ids(self.inner.clone(), row_ids)
    }

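    // A small usage sketch of the take APIs above (setup omitted; `table` is
    // assumed to be an open Table with an "id" column, as in the tests earlier
    // in this diff):
    //
    //     let batches = table
    //         .take_offsets(vec![5, 1, 17])
    //         .execute()
    //         .await?
    //         .try_collect::<Vec<_>>()
    //         .await?;
    //
    // Rows come back in no guaranteed order; include the row offset (or row id)
    // column and align the output yourself if ordering matters.
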
/// Search the table with a given query vector.
|
||||
///
|
||||
/// This is a convenience method for preparing a vector query and
|
||||
@@ -1649,345 +1700,219 @@ impl NativeTable {
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn create_ivf_flat_index(
|
||||
&self,
|
||||
index: IvfFlatIndexBuilder,
|
||||
// Helper to validate index type compatibility with field data type
|
||||
fn validate_index_type(
|
||||
field: &Field,
|
||||
replace: bool,
|
||||
index_name: &str,
|
||||
supported_fn: impl Fn(&DataType) -> bool,
|
||||
) -> Result<()> {
|
||||
if !supported_vector_data_type(field.data_type()) {
|
||||
return Err(Error::InvalidInput {
|
||||
if !supported_fn(field.data_type()) {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
"An IVF Flat index cannot be created on the column `{}` which has data type {}",
|
||||
"A {} index cannot be created on the field `{}` which has data type {}",
|
||||
index_name,
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let num_partitions = if let Some(n) = index.num_partitions {
|
||||
n
|
||||
} else {
|
||||
suggested_num_partitions(self.count_rows(None).await?)
|
||||
};
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_flat(
|
||||
num_partitions as usize,
|
||||
index.distance_type.into(),
|
||||
);
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Vector,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_ivf_pq_index(
|
||||
// Helper to get num_partitions with default calculation
|
||||
async fn get_num_partitions(
|
||||
&self,
|
||||
index: IvfPqIndexBuilder,
|
||||
field: &Field,
|
||||
replace: bool,
|
||||
) -> Result<()> {
|
||||
if !supported_vector_data_type(field.data_type()) {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"An IVF PQ index cannot be created on the column `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
provided: Option<u32>,
|
||||
for_hnsw: bool,
|
||||
dim: Option<u32>,
|
||||
) -> Result<u32> {
|
||||
if let Some(n) = provided {
|
||||
Ok(n)
|
||||
} else {
|
||||
let row_count = self.count_rows(None).await?;
|
||||
if for_hnsw {
|
||||
Ok(suggested_num_partitions_for_hnsw(
|
||||
row_count,
|
||||
dim.ok_or_else(|| Error::InvalidInput {
|
||||
message: "Vector dimension required for HNSW partitioning".to_string(),
|
||||
})?,
|
||||
))
|
||||
} else {
|
||||
Ok(suggested_num_partitions(row_count))
|
||||
}
|
||||
}
|
||||
|
||||
let num_partitions = if let Some(n) = index.num_partitions {
|
||||
n
|
||||
} else {
|
||||
suggested_num_partitions(self.count_rows(None).await?)
|
||||
};
|
||||
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
|
||||
n
|
||||
} else {
|
||||
let dim = infer_vector_dim(field.data_type())?;
|
||||
suggested_num_sub_vectors(dim as u32)
|
||||
};
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
|
||||
num_partitions as usize,
|
||||
/*num_bits=*/ 8,
|
||||
num_sub_vectors as usize,
|
||||
index.distance_type.into(),
|
||||
index.max_iterations as usize,
|
||||
);
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Vector,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_ivf_hnsw_pq_index(
|
||||
&self,
|
||||
index: IvfHnswPqIndexBuilder,
|
||||
field: &Field,
|
||||
replace: bool,
|
||||
) -> Result<()> {
|
||||
if !supported_vector_data_type(field.data_type()) {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"An IVF HNSW PQ index cannot be created on the column `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
// Helper to get num_sub_vectors with default calculation
|
||||
fn get_num_sub_vectors(provided: Option<u32>, dim: u32) -> u32 {
|
||||
provided.unwrap_or_else(|| suggested_num_sub_vectors(dim))
|
||||
}
|
||||
|
||||
// Helper to extract vector dimension from field
|
||||
fn get_vector_dimension(field: &Field) -> Result<u32> {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => Ok(*n as u32),
|
||||
_ => Ok(infer_vector_dim(field.data_type())? as u32),
|
||||
}
|
||||
}
|
||||
|
||||
let num_partitions: u32 = if let Some(n) = index.num_partitions {
|
||||
n
|
||||
} else {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
|
||||
suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
|
||||
),
|
||||
_ => Err(Error::Schema {
|
||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
||||
}),
|
||||
}?
|
||||
};
|
||||
|
||||
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
|
||||
n
|
||||
} else {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => {
|
||||
Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
|
||||
// Convert LanceDB Index to Lance IndexParams
|
||||
async fn make_index_params(
|
||||
&self,
|
||||
field: &Field,
|
||||
index_opts: Index,
|
||||
) -> Result<Box<dyn lance::index::IndexParams>> {
|
||||
match index_opts {
|
||||
Index::Auto => {
|
||||
if supported_vector_data_type(field.data_type()) {
|
||||
// Use IvfPq as the default for auto vector indices
|
||||
let dim = Self::get_vector_dimension(field)?;
|
||||
let num_partitions = self.get_num_partitions(None, false, None).await?;
|
||||
let num_sub_vectors = Self::get_num_sub_vectors(None, dim);
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
|
||||
num_partitions as usize,
|
||||
/*num_bits=*/ 8,
|
||||
num_sub_vectors as usize,
|
||||
lance_linalg::distance::MetricType::L2,
|
||||
/*max_iterations=*/ 50,
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
} else if supported_btree_data_type(field.data_type()) {
|
||||
Ok(Box::new(ScalarIndexParams::for_builtin(
|
||||
BuiltinIndexType::BTree,
|
||||
)))
|
||||
} else {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"there are no indices supported for the field `{}` with the data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
_ => Err(Error::Schema {
|
||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
||||
}),
|
||||
}?
|
||||
};
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
||||
ivf_params.sample_rate = index.sample_rate as usize;
|
||||
ivf_params.max_iters = index.max_iterations as usize;
|
||||
let hnsw_params = HnswBuildParams::default()
|
||||
.num_edges(index.m as usize)
|
||||
.ef_construction(index.ef_construction as usize);
|
||||
let pq_params = PQBuildParams {
|
||||
num_sub_vectors: num_sub_vectors as usize,
|
||||
..Default::default()
|
||||
};
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_pq_params(
|
||||
index.distance_type.into(),
|
||||
ivf_params,
|
||||
hnsw_params,
|
||||
pq_params,
|
||||
);
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Vector,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
    async fn create_ivf_hnsw_sq_index(
        &self,
        index: IvfHnswSqIndexBuilder,
        field: &Field,
        replace: bool,
    ) -> Result<()> {
        if !supported_vector_data_type(field.data_type()) {
            return Err(Error::InvalidInput {
                message: format!(
                    "An IVF HNSW SQ index cannot be created on the column `{}` which has data type {}",
                    field.name(),
                    field.data_type()
                ),
            });
        }

        let num_partitions: u32 = if let Some(n) = index.num_partitions {
            n
        } else {
            match field.data_type() {
                arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
                    suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
                ),
                _ => Err(Error::Schema {
                    message: format!("Column '{}' is not a FixedSizeList", field.name()),
                }),
            }?
        };

        let mut dataset = self.dataset.get_mut().await?;
        let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
        ivf_params.sample_rate = index.sample_rate as usize;
        ivf_params.max_iters = index.max_iterations as usize;
        let hnsw_params = HnswBuildParams::default()
            .num_edges(index.m as usize)
            .ef_construction(index.ef_construction as usize);
        let sq_params = SQBuildParams {
            sample_rate: index.sample_rate as usize,
            ..Default::default()
        };
        let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_sq_params(
            index.distance_type.into(),
            ivf_params,
            hnsw_params,
            sq_params,
        );
        dataset
            .create_index(
                &[field.name()],
                IndexType::Vector,
                None,
                &lance_idx_params,
                replace,
            )
            .await?;
        Ok(())
    }

    async fn create_auto_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
        if supported_vector_data_type(field.data_type()) {
            self.create_ivf_pq_index(IvfPqIndexBuilder::default(), field, opts.replace)
                .await
        } else if supported_btree_data_type(field.data_type()) {
            self.create_btree_index(field, opts).await
        } else {
            Err(Error::InvalidInput {
                message: format!(
                    "there are no indices supported for the field `{}` with the data type {}",
                    field.name(),
                    field.data_type()
                ),
            })
        }
            Index::BTree(_) => {
                Self::validate_index_type(field, "BTree", supported_btree_data_type)?;
                Ok(Box::new(ScalarIndexParams::for_builtin(
                    BuiltinIndexType::BTree,
                )))
            }
            Index::Bitmap(_) => {
                Self::validate_index_type(field, "Bitmap", supported_bitmap_data_type)?;
                Ok(Box::new(ScalarIndexParams::for_builtin(
                    BuiltinIndexType::Bitmap,
                )))
            }
            Index::LabelList(_) => {
                Self::validate_index_type(field, "LabelList", supported_label_list_data_type)?;
                Ok(Box::new(ScalarIndexParams::for_builtin(
                    BuiltinIndexType::LabelList,
                )))
            }
            Index::FTS(fts_opts) => {
                Self::validate_index_type(field, "FTS", supported_fts_data_type)?;
                Ok(Box::new(fts_opts))
            }
            Index::IvfFlat(index) => {
                Self::validate_index_type(field, "IVF Flat", supported_vector_data_type)?;
                let num_partitions = self
                    .get_num_partitions(index.num_partitions, false, None)
                    .await?;
                let lance_idx_params = VectorIndexParams::ivf_flat(
                    num_partitions as usize,
                    index.distance_type.into(),
                );
                Ok(Box::new(lance_idx_params))
            }
            Index::IvfPq(index) => {
                Self::validate_index_type(field, "IVF PQ", supported_vector_data_type)?;
                let dim = Self::get_vector_dimension(field)?;
                let num_partitions = self
                    .get_num_partitions(index.num_partitions, false, None)
                    .await?;
                let num_sub_vectors = Self::get_num_sub_vectors(index.num_sub_vectors, dim);
                let lance_idx_params = VectorIndexParams::ivf_pq(
                    num_partitions as usize,
                    /*num_bits=*/ 8,
                    num_sub_vectors as usize,
                    index.distance_type.into(),
                    index.max_iterations as usize,
                );
                Ok(Box::new(lance_idx_params))
            }
            Index::IvfHnswPq(index) => {
                Self::validate_index_type(field, "IVF HNSW PQ", supported_vector_data_type)?;
                let dim = Self::get_vector_dimension(field)?;
                let num_partitions = self
                    .get_num_partitions(index.num_partitions, true, Some(dim))
                    .await?;
                let num_sub_vectors = Self::get_num_sub_vectors(index.num_sub_vectors, dim);
                let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
                ivf_params.sample_rate = index.sample_rate as usize;
                ivf_params.max_iters = index.max_iterations as usize;
                let hnsw_params = HnswBuildParams::default()
                    .num_edges(index.m as usize)
                    .ef_construction(index.ef_construction as usize);
                let pq_params = PQBuildParams {
                    num_sub_vectors: num_sub_vectors as usize,
                    ..Default::default()
                };
                let lance_idx_params = VectorIndexParams::with_ivf_hnsw_pq_params(
                    index.distance_type.into(),
                    ivf_params,
                    hnsw_params,
                    pq_params,
                );
                Ok(Box::new(lance_idx_params))
            }
            Index::IvfHnswSq(index) => {
                Self::validate_index_type(field, "IVF HNSW SQ", supported_vector_data_type)?;
                let dim = Self::get_vector_dimension(field)?;
                let num_partitions = self
                    .get_num_partitions(index.num_partitions, true, Some(dim))
                    .await?;
                let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
                ivf_params.sample_rate = index.sample_rate as usize;
                ivf_params.max_iters = index.max_iterations as usize;
                let hnsw_params = HnswBuildParams::default()
                    .num_edges(index.m as usize)
                    .ef_construction(index.ef_construction as usize);
                let sq_params = SQBuildParams {
                    sample_rate: index.sample_rate as usize,
                    ..Default::default()
                };
                let lance_idx_params = VectorIndexParams::with_ivf_hnsw_sq_params(
                    index.distance_type.into(),
                    ivf_params,
                    hnsw_params,
                    sq_params,
                );
                Ok(Box::new(lance_idx_params))
            }
        }
    }

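For context, `make_index_params` is what now backs the public index-creation API, so each `Index` variant above corresponds to one builder callers can pass in. A minimal usage sketch against the Rust client (assuming a table handle `tbl` with a fixed-size-list vector column named "embedding"):

    use lancedb::index::Index;

    // Let LanceDB pick the index type from the column's data type;
    // for a vector column this resolves to the IVF_PQ defaults above.
    tbl.create_index(&["embedding"], Index::Auto)
        .execute()
        .await?;
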
    async fn create_btree_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
        if !supported_btree_data_type(field.data_type()) {
            return Err(Error::Schema {
                message: format!(
                    "A BTree index cannot be created on the field `{}` which has data type {}",
                    field.name(),
                    field.data_type()
                ),
            });
    // Helper method to get the correct IndexType based on the Index variant and field data type
    fn get_index_type_for_field(&self, field: &Field, index: &Index) -> IndexType {
        match index {
            Index::Auto => {
                if supported_vector_data_type(field.data_type()) {
                    IndexType::Vector
                } else if supported_btree_data_type(field.data_type()) {
                    IndexType::BTree
                } else {
                    // This should not happen since make_index_params would have failed
                    IndexType::BTree
                }
            }
            Index::BTree(_) => IndexType::BTree,
            Index::Bitmap(_) => IndexType::Bitmap,
            Index::LabelList(_) => IndexType::LabelList,
            Index::FTS(_) => IndexType::Inverted,
            Index::IvfFlat(_) | Index::IvfPq(_) | Index::IvfHnswPq(_) | Index::IvfHnswSq(_) => {
                IndexType::Vector
            }
        }

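The helper keeps the index-type mapping in one place: scalar builders map to their scalar `IndexType`, `FTS` maps to `IndexType::Inverted`, and every IVF-family builder maps to `IndexType::Vector`. A test-style sketch of the expected contract (illustrative only; assumes `IndexType` supports equality comparison and that the test sits inside this module, since the method is private):

    // Sketch: the FTS builder should resolve to the inverted index type.
    let idx_type = table.get_index_type_for_field(&text_field, &Index::FTS(FtsIndexBuilder::default()));
    assert_eq!(idx_type, IndexType::Inverted);
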
        let mut dataset = self.dataset.get_mut().await?;
        let lance_idx_params = lance_index::scalar::ScalarIndexParams {
            force_index_type: Some(lance_index::scalar::ScalarIndexType::BTree),
        };
        dataset
            .create_index(
                &[field.name()],
                IndexType::BTree,
                None,
                &lance_idx_params,
                opts.replace,
            )
            .await?;
        Ok(())
    }

    async fn create_bitmap_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
        if !supported_bitmap_data_type(field.data_type()) {
            return Err(Error::Schema {
                message: format!(
                    "A Bitmap index cannot be created on the field `{}` which has data type {}",
                    field.name(),
                    field.data_type()
                ),
            });
        }

        let mut dataset = self.dataset.get_mut().await?;
        let lance_idx_params = lance_index::scalar::ScalarIndexParams {
            force_index_type: Some(lance_index::scalar::ScalarIndexType::Bitmap),
        };
        dataset
            .create_index(
                &[field.name()],
                IndexType::Bitmap,
                None,
                &lance_idx_params,
                opts.replace,
            )
            .await?;
        Ok(())
    }

    async fn create_label_list_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
        if !supported_label_list_data_type(field.data_type()) {
            return Err(Error::Schema {
                message: format!(
                    "A LabelList index cannot be created on the field `{}` which has data type {}",
                    field.name(),
                    field.data_type()
                ),
            });
        }

        let mut dataset = self.dataset.get_mut().await?;
        let lance_idx_params = lance_index::scalar::ScalarIndexParams {
            force_index_type: Some(lance_index::scalar::ScalarIndexType::LabelList),
        };
        dataset
            .create_index(
                &[field.name()],
                IndexType::LabelList,
                None,
                &lance_idx_params,
                opts.replace,
            )
            .await?;
        Ok(())
    }

    async fn create_fts_index(
        &self,
        field: &Field,
        fts_opts: FtsIndexBuilder,
        replace: bool,
    ) -> Result<()> {
        if !supported_fts_data_type(field.data_type()) {
            return Err(Error::Schema {
                message: format!(
                    "A FTS index cannot be created on the field `{}` which has data type {}",
                    field.name(),
                    field.data_type()
                ),
            });
        }

        let mut dataset = self.dataset.get_mut().await?;
        dataset
            .create_index(
                &[field.name()],
                IndexType::Inverted,
                None,
                &fts_opts,
                replace,
            )
            .await?;
        Ok(())
    }

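All of the per-type helpers above are removed in favor of the single `make_index_params` path, but the public surface is unchanged. For instance, a full-text index is still requested the same way (sketch; the column name and the `FtsIndexBuilder` import path are assumptions based on this diff's imports):

    use lancedb::index::Index;
    use lancedb::index::scalar::FtsIndexBuilder;

    tbl.create_index(&["text"], Index::FTS(FtsIndexBuilder::default()))
        .execute()
        .await?;
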
    async fn generic_query(
@@ -2094,6 +2019,16 @@ impl BaseTable for NativeTable {
        self.name.as_str()
    }

    fn namespace(&self) -> &[String] {
        // Native tables don't support namespaces yet, return empty slice for root namespace
        &[]
    }

    fn id(&self) -> &str {
        // For native tables, id is same as name since no namespace support
        self.name.as_str()
    }

    async fn version(&self) -> Result<u64> {
        Ok(self.dataset.get().await?.version().version)
    }
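These two accessors give remote, namespace-aware implementations somewhere to hang richer identities; for native tables the behavior is fixed, as a quick sketch of the expected contract shows (illustrative assertions, not tests from this diff):

    // Native tables always live in the root namespace, so `id` is just the name.
    assert_eq!(table.namespace(), &[] as &[String]);
    assert_eq!(table.id(), table.name());
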
@@ -2202,26 +2137,20 @@ impl BaseTable for NativeTable {

        let field = schema.field_with_name(&opts.columns[0])?;

        match opts.index {
            Index::Auto => self.create_auto_index(field, opts).await,
            Index::BTree(_) => self.create_btree_index(field, opts).await,
            Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
            Index::LabelList(_) => self.create_label_list_index(field, opts).await,
            Index::FTS(fts_opts) => self.create_fts_index(field, fts_opts, opts.replace).await,
            Index::IvfFlat(ivf_flat) => {
                self.create_ivf_flat_index(ivf_flat, field, opts.replace)
                    .await
            }
            Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
            Index::IvfHnswPq(ivf_hnsw_pq) => {
                self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
                    .await
            }
            Index::IvfHnswSq(ivf_hnsw_sq) => {
                self.create_ivf_hnsw_sq_index(ivf_hnsw_sq, field, opts.replace)
                    .await
            }
        let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?;
        let index_type = self.get_index_type_for_field(field, &opts.index);
        let columns = [field.name().as_str()];
        let mut dataset = self.dataset.get_mut().await?;
        let mut builder = dataset
            .create_index_builder(&columns, index_type, lance_idx_params.as_ref())
            .train(opts.train)
            .replace(opts.replace);

        if let Some(name) = opts.name {
            builder = builder.name(name);
        }
        builder.await?;
        Ok(())
    }

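The new path awaits the builder directly (`builder.await?`), which suggests Lance's index builder is itself awaitable, presumably via an `IntoFuture` implementation. A self-contained sketch of that pattern in plain Rust (names are illustrative, not Lance's):

    use std::future::{Future, IntoFuture};
    use std::pin::Pin;

    struct IndexJob {
        replace: bool,
    }

    impl IndexJob {
        // Builder-style setter: configure, then `.await` the builder itself.
        fn replace(mut self, replace: bool) -> Self {
            self.replace = replace;
            self
        }
    }

    impl IntoFuture for IndexJob {
        type Output = Result<(), String>;
        type IntoFuture = Pin<Box<dyn Future<Output = Self::Output>>>;

        fn into_future(self) -> Self::IntoFuture {
            // Kick off the (stubbed) index build when the builder is awaited.
            Box::pin(async move { Ok(()) })
        }
    }

    // Usage, inside an async fn:
    // let result = IndexJob { replace: false }.replace(true).await;
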
    async fn drop_index(&self, index_name: &str) -> Result<()> {
@@ -2339,20 +2268,13 @@ impl BaseTable for NativeTable {

        let (_, element_type) = lance::index::vector::utils::get_vector_type(schema, &column)?;
        let is_binary = matches!(element_type, DataType::UInt8);
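        // Over-fetch `limit + offset` nearest rows so that applying the offset
        // afterwards still leaves a full `limit` of results.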
        let top_k = query.base.limit.unwrap_or(DEFAULT_TOP_K) + query.base.offset.unwrap_or(0);
        if is_binary {
            let query_vector = arrow::compute::cast(&query_vector, &DataType::UInt8)?;
            let query_vector = query_vector.as_primitive::<UInt8Type>();
            scanner.nearest(
                &column,
                query_vector,
                query.base.limit.unwrap_or(DEFAULT_TOP_K),
            )?;
            scanner.nearest(&column, query_vector, top_k)?;
        } else {
            scanner.nearest(
                &column,
                query_vector.as_ref(),
                query.base.limit.unwrap_or(DEFAULT_TOP_K),
            )?;
            scanner.nearest(&column, query_vector.as_ref(), top_k)?;
        }
        scanner.minimum_nprobes(query.minimum_nprobes);
        if let Some(maximum_nprobes) = query.maximum_nprobes {
@@ -2848,6 +2770,7 @@ mod tests {
    use crate::connect;
    use crate::connection::ConnectBuilder;
    use crate::index::scalar::{BTreeIndexBuilder, BitmapIndexBuilder};
    use crate::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder};
    use crate::query::{ExecutableQuery, QueryBase};

    #[tokio::test]
@@ -3349,6 +3272,7 @@ mod tests {
        fn wrap(
            &self,
            original: Arc<dyn object_store::ObjectStore>,
            _storage_options: Option<&std::collections::HashMap<String, String>>,
        ) -> Arc<dyn object_store::ObjectStore> {
            self.called.store(true, Ordering::Relaxed);
            original

@@ -130,7 +130,7 @@ async fn test_minio_lifecycle() -> Result<()> {
    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
    table.add(data).execute().await?;

    db.drop_table("test_table").await?;
    db.drop_table("test_table", &[]).await?;

    Ok(())
}
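Note the signature change exercised here: `drop_table` now takes a namespace path alongside the table name, with an empty slice meaning the root namespace. A sketch of a namespaced call (namespace support and the segment type are assumptions based on this diff):

    // Root namespace (matches the updated test above).
    db.drop_table("test_table", &[]).await?;

    // Hypothetical namespaced table, if/when namespaces are enabled.
    // db.drop_table("events", &["prod".to_string()]).await?;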