mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 14:49:57 +00:00
Compare commits
10 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b88422e515 | ||
|
|
8d60685ede | ||
|
|
04285a4a4e | ||
|
|
d4a41b5663 | ||
|
|
adc3daa462 | ||
|
|
acbfa6c012 | ||
|
|
d602e9f98c | ||
|
|
ad09234d59 | ||
|
|
0c34ffb252 | ||
|
|
d9f333d828 |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.21.2"
|
current_version = "0.21.4-beta.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
66
CLAUDE.md
66
CLAUDE.md
@@ -13,10 +13,68 @@ Project layout:
|
|||||||
|
|
||||||
Common commands:
|
Common commands:
|
||||||
|
|
||||||
* Check for compiler errors: `cargo check --features remote --tests --examples`
|
* Check for compiler errors: `cargo check --quiet --features remote --tests --examples`
|
||||||
* Run tests: `cargo test --features remote --tests`
|
* Run tests: `cargo test --quiet --features remote --tests`
|
||||||
* Run specific test: `cargo test --features remote -p <package_name> --test <test_name>`
|
* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
|
||||||
* Lint: `cargo clippy --features remote --tests --examples`
|
* Lint: `cargo clippy --quiet --features remote --tests --examples`
|
||||||
* Format: `cargo fmt --all`
|
* Format: `cargo fmt --all`
|
||||||
|
|
||||||
Before committing changes, run formatting.
|
Before committing changes, run formatting.
|
||||||
|
|
||||||
|
## Coding tips
|
||||||
|
|
||||||
|
* When writing Rust doctests for things that require a connection or table reference,
|
||||||
|
write them as a function instead of a fully executable test. This allows type checking
|
||||||
|
to run but avoids needing a full test environment. For example:
|
||||||
|
```rust
|
||||||
|
/// ```
|
||||||
|
/// use lance_index::scalar::FullTextSearchQuery;
|
||||||
|
/// use lancedb::query::{QueryBase, ExecutableQuery};
|
||||||
|
///
|
||||||
|
/// # use lancedb::Table;
|
||||||
|
/// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
/// let results = table.query()
|
||||||
|
/// .full_text_search(FullTextSearchQuery::new("hello world".into()))
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example plan: adding a new method on Table
|
||||||
|
|
||||||
|
Adding a new method involves first adding it to the Rust core, then exposing it
|
||||||
|
in the Python and TypeScript bindings. There are both local and remote tables.
|
||||||
|
Remote tables are implemented via a HTTP API and require the `remote` cargo
|
||||||
|
feature flag to be enabled. Python has both sync and async methods.
|
||||||
|
|
||||||
|
Rust core changes:
|
||||||
|
|
||||||
|
1. Add method on `Table` struct in `rust/lancedb/src/table.rs` (calls `BaseTable` trait).
|
||||||
|
2. Add method to `BaseTable` trait in `rust/lancedb/src/table.rs`.
|
||||||
|
3. Implement new trait method on `NativeTable` in `rust/lancedb/src/table.rs`.
|
||||||
|
* Test with unit test in `rust/lancedb/src/table.rs`.
|
||||||
|
4. Implement new trait method on `RemoteTable` in `rust/lancedb/src/remote/table.rs`.
|
||||||
|
* Test with unit test in `rust/lancedb/src/remote/table.rs` against mocked endpoint.
|
||||||
|
|
||||||
|
Python bindings changes:
|
||||||
|
|
||||||
|
1. Add PyO3 method binding in `python/src/table.rs`. Run `make develop` to compile bindings.
|
||||||
|
2. Add types for PyO3 method in `python/python/lancedb/_lancedb.pyi`.
|
||||||
|
3. Add method to `AsyncTable` class in `python/python/lancedb/table.py`.
|
||||||
|
4. Add abstract method to `Table` abstract base class in `python/python/lancedb/table.py`.
|
||||||
|
5. Add concrete sync method to `LanceTable` class in `python/python/lancedb/table.py`.
|
||||||
|
* Should use `LOOP.run()` to call the corresponding `AsyncTable` method.
|
||||||
|
6. Add concrete sync method to `RemoteTable` class in `python/python/lancedb/remote/table.py`.
|
||||||
|
7. Add unit test in `python/tests/test_table.py`.
|
||||||
|
|
||||||
|
TypeScript bindings changes:
|
||||||
|
|
||||||
|
1. Add napi-rs method binding on `Table` in `nodejs/src/table.rs`.
|
||||||
|
2. Run `npm run build` to generate TypeScript definitions.
|
||||||
|
3. Add typescript method on abstract class `Table` in `nodejs/src/table.ts`.
|
||||||
|
4. Add concrete method on `LocalTable` class in `nodejs/src/native_table.ts`.
|
||||||
|
* Note: despite the name, this class is also used for remote tables.
|
||||||
|
5. Add test in `nodejs/__test__/table.test.ts`.
|
||||||
|
6. Run `npm run docs` to generate TypeScript documentation.
|
||||||
|
|||||||
47
Cargo.lock
generated
47
Cargo.lock
generated
@@ -2838,8 +2838,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "fsst"
|
name = "fsst"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "548190a42654ce848835b410ae33f43b4d55cb24548fd0a885a289a1d5a95019"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"rand 0.9.1",
|
"rand 0.9.1",
|
||||||
@@ -3953,8 +3952,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance"
|
name = "lance"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "94bafd9d9a9301c1eac48892ec8016d4d28204d4fc55f2ebebee9a7af465e152"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
@@ -4017,8 +4015,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-arrow"
|
name = "lance-arrow"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "b97ebcd8edc2b534e8ded20c97c8928e275160794af91ed803a3d48d8d2a88d8"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-buffer",
|
"arrow-buffer",
|
||||||
@@ -4036,8 +4033,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-core"
|
name = "lance-core"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "ce5c1849d07985d6a5011aca9de43c7a42ec4c996d66ef3f2d9896c227cc934c"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-buffer",
|
"arrow-buffer",
|
||||||
@@ -4073,8 +4069,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-datafusion"
|
name = "lance-datafusion"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "d355c087bc66d85e36cfb428465f585b13971e1e13585dd2b6886a54d8a7d9a4"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4103,8 +4098,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-datagen"
|
name = "lance-datagen"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "110d4dedfe02e9cff8f11cfb64a261755da7ee9131845197efeec8b659cc5513"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4112,6 +4106,7 @@ dependencies = [
|
|||||||
"arrow-schema",
|
"arrow-schema",
|
||||||
"chrono",
|
"chrono",
|
||||||
"futures",
|
"futures",
|
||||||
|
"half",
|
||||||
"hex",
|
"hex",
|
||||||
"rand 0.9.1",
|
"rand 0.9.1",
|
||||||
"rand_xoshiro",
|
"rand_xoshiro",
|
||||||
@@ -4121,8 +4116,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-encoding"
|
name = "lance-encoding"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "66750006299a2fb003091bc290eb1fe2a5933e35236d921934131f3e4629cd33"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrayref",
|
"arrayref",
|
||||||
"arrow",
|
"arrow",
|
||||||
@@ -4162,8 +4156,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-file"
|
name = "lance-file"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "7c639062100610a075e01fd455173348b2fccea10cb0e89f70e38a3183c56022"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4198,8 +4191,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-index"
|
name = "lance-index"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "7ae67a048a51fb525d1bfde86d1b39118462277e7e7a7cd0e7ba866312873532"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4227,6 +4219,7 @@ dependencies = [
|
|||||||
"lance-arrow",
|
"lance-arrow",
|
||||||
"lance-core",
|
"lance-core",
|
||||||
"lance-datafusion",
|
"lance-datafusion",
|
||||||
|
"lance-datagen",
|
||||||
"lance-encoding",
|
"lance-encoding",
|
||||||
"lance-file",
|
"lance-file",
|
||||||
"lance-io",
|
"lance-io",
|
||||||
@@ -4253,8 +4246,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-io"
|
name = "lance-io"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "cc86c7307e2d3d895cfefa503f986edcbdd208eb0aa89ba2c75724ba04bce843"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
@@ -4295,8 +4287,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-linalg"
|
name = "lance-linalg"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "769f910b6f2ad5eb4d1b3071c533b619351e61e0dfca74f13c98680a8e6476e9"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-buffer",
|
"arrow-buffer",
|
||||||
@@ -4320,8 +4311,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-table"
|
name = "lance-table"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "ffbeafa8a3e97b5b3a06f06d69b0cefe56e65c64a33f674c40c113b797328bd2"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4360,8 +4350,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-testing"
|
name = "lance-testing"
|
||||||
version = "0.33.0"
|
version = "0.33.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.33.0-beta.4#e37e9df2458e88c37205415bd00d950b7f936061"
|
||||||
checksum = "535a3bba37625cd515a7172a8d0d138f86822acef9fa9425ad1e050ef88bf92f"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-schema",
|
"arrow-schema",
|
||||||
@@ -4372,7 +4361,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.21.2"
|
version = "0.21.4-beta.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4459,7 +4448,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
version = "0.21.2"
|
version = "0.21.4-beta.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-ipc",
|
"arrow-ipc",
|
||||||
@@ -4479,7 +4468,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.24.2"
|
version = "0.24.4-beta.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
|
|||||||
16
Cargo.toml
16
Cargo.toml
@@ -15,14 +15,14 @@ categories = ["database-implementations"]
|
|||||||
rust-version = "1.78.0"
|
rust-version = "1.78.0"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.33.0", "features" = ["dynamodb"] }
|
lance = { "version" = "=0.33.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-io = "=0.33.0"
|
lance-io = { "version" = "=0.33.0", default-features = false, "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-index = "=0.33.0"
|
lance-index = { "version" = "=0.33.0", "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-linalg = "=0.33.0"
|
lance-linalg = { "version" = "=0.33.0", "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-table = "=0.33.0"
|
lance-table = { "version" = "=0.33.0", "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-testing = "=0.33.0"
|
lance-testing = { "version" = "=0.33.0", "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-datafusion = "=0.33.0"
|
lance-datafusion = { "version" = "=0.33.0", "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-encoding = "=0.33.0"
|
lance-encoding = { "version" = "=0.33.0", "tag" = "v0.33.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "55.1", optional = false }
|
arrow = { version = "55.1", optional = false }
|
||||||
arrow-array = "55.1"
|
arrow-array = "55.1"
|
||||||
|
|||||||
@@ -54,6 +54,52 @@ def extract_features(line: str) -> list:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def extract_default_features(line: str) -> bool:
|
||||||
|
"""
|
||||||
|
Checks if default-features = false is present in a line in Cargo.toml.
|
||||||
|
Example: 'lance = { "version" = "=0.29.0", default-features = false, "features" = ["dynamodb"] }'
|
||||||
|
Returns: True if default-features = false is present, False otherwise
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
match = re.search(r'default-features\s*=\s*false', line)
|
||||||
|
return match is not None
|
||||||
|
|
||||||
|
|
||||||
|
def dict_to_toml_line(package_name: str, config: dict) -> str:
|
||||||
|
"""
|
||||||
|
Converts a configuration dictionary to a TOML dependency line.
|
||||||
|
Dictionary insertion order is preserved (Python 3.7+), so the caller
|
||||||
|
controls the order of fields in the output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
package_name: The name of the package (e.g., "lance", "lance-io")
|
||||||
|
config: Dictionary with keys like "version", "path", "git", "tag", "features", "default-features"
|
||||||
|
The order of keys in this dict determines the order in the output.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A properly formatted TOML line with a trailing newline
|
||||||
|
"""
|
||||||
|
# If only version is specified, use simple format
|
||||||
|
if len(config) == 1 and "version" in config:
|
||||||
|
return f'{package_name} = "{config["version"]}"\n'
|
||||||
|
|
||||||
|
# Otherwise, use inline table format
|
||||||
|
parts = []
|
||||||
|
for key, value in config.items():
|
||||||
|
if key == "default-features" and not value:
|
||||||
|
parts.append("default-features = false")
|
||||||
|
elif key == "features":
|
||||||
|
parts.append(f'"features" = {json.dumps(value)}')
|
||||||
|
elif isinstance(value, str):
|
||||||
|
parts.append(f'"{key}" = "{value}"')
|
||||||
|
else:
|
||||||
|
# This shouldn't happen with our current usage
|
||||||
|
parts.append(f'"{key}" = {json.dumps(value)}')
|
||||||
|
|
||||||
|
return f'{package_name} = {{ {", ".join(parts)} }}\n'
|
||||||
|
|
||||||
|
|
||||||
def update_cargo_toml(line_updater):
|
def update_cargo_toml(line_updater):
|
||||||
"""
|
"""
|
||||||
Updates the Cargo.toml file by applying the line_updater function to each line.
|
Updates the Cargo.toml file by applying the line_updater function to each line.
|
||||||
@@ -67,20 +113,27 @@ def update_cargo_toml(line_updater):
|
|||||||
is_parsing_lance_line = False
|
is_parsing_lance_line = False
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if line.startswith("lance"):
|
if line.startswith("lance"):
|
||||||
# Update the line using the provided function
|
# Check if this is a single-line or multi-line entry
|
||||||
if line.strip().endswith("}"):
|
# Single-line entries either:
|
||||||
|
# 1. End with } (complete inline table)
|
||||||
|
# 2. End with " (simple version string)
|
||||||
|
# Multi-line entries start with { but don't end with }
|
||||||
|
if line.strip().endswith("}") or line.strip().endswith('"'):
|
||||||
|
# Single-line entry - process immediately
|
||||||
new_lines.append(line_updater(line))
|
new_lines.append(line_updater(line))
|
||||||
else:
|
elif "{" in line and not line.strip().endswith("}"):
|
||||||
|
# Multi-line entry - start accumulating
|
||||||
lance_line = line
|
lance_line = line
|
||||||
is_parsing_lance_line = True
|
is_parsing_lance_line = True
|
||||||
|
else:
|
||||||
|
# Single-line entry without quotes or braces (shouldn't happen but handle it)
|
||||||
|
new_lines.append(line_updater(line))
|
||||||
elif is_parsing_lance_line:
|
elif is_parsing_lance_line:
|
||||||
lance_line += line
|
lance_line += line
|
||||||
if line.strip().endswith("}"):
|
if line.strip().endswith("}"):
|
||||||
new_lines.append(line_updater(lance_line))
|
new_lines.append(line_updater(lance_line))
|
||||||
lance_line = ""
|
lance_line = ""
|
||||||
is_parsing_lance_line = False
|
is_parsing_lance_line = False
|
||||||
else:
|
|
||||||
print("doesn't end with }:", line)
|
|
||||||
else:
|
else:
|
||||||
# Keep the line unchanged
|
# Keep the line unchanged
|
||||||
new_lines.append(line)
|
new_lines.append(line)
|
||||||
@@ -92,18 +145,25 @@ def update_cargo_toml(line_updater):
|
|||||||
def set_stable_version(version: str):
|
def set_stable_version(version: str):
|
||||||
"""
|
"""
|
||||||
Sets lines to
|
Sets lines to
|
||||||
lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }
|
lance = { "version" = "=0.29.0", default-features = false, "features" = ["dynamodb"] }
|
||||||
lance-io = "=0.29.0"
|
lance-io = { "version" = "=0.29.0", default-features = false }
|
||||||
...
|
...
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def line_updater(line: str) -> str:
|
def line_updater(line: str) -> str:
|
||||||
package_name = line.split("=", maxsplit=1)[0].strip()
|
package_name = line.split("=", maxsplit=1)[0].strip()
|
||||||
|
|
||||||
|
# Build config in desired order: version, default-features, features
|
||||||
|
config = {"version": f"={version}"}
|
||||||
|
|
||||||
|
if extract_default_features(line):
|
||||||
|
config["default-features"] = False
|
||||||
|
|
||||||
features = extract_features(line)
|
features = extract_features(line)
|
||||||
if features:
|
if features:
|
||||||
return f'{package_name} = {{ "version" = "={version}", "features" = {json.dumps(features)} }}\n'
|
config["features"] = features
|
||||||
else:
|
|
||||||
return f'{package_name} = "={version}"\n'
|
return dict_to_toml_line(package_name, config)
|
||||||
|
|
||||||
update_cargo_toml(line_updater)
|
update_cargo_toml(line_updater)
|
||||||
|
|
||||||
@@ -111,19 +171,29 @@ def set_stable_version(version: str):
|
|||||||
def set_preview_version(version: str):
|
def set_preview_version(version: str):
|
||||||
"""
|
"""
|
||||||
Sets lines to
|
Sets lines to
|
||||||
lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance = { "version" = "=0.29.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.29.0-beta.2", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
|
lance-io = { "version" = "=0.29.0", default-features = false, "tag" = "v0.29.0-beta.2", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
...
|
...
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def line_updater(line: str) -> str:
|
def line_updater(line: str) -> str:
|
||||||
package_name = line.split("=", maxsplit=1)[0].strip()
|
package_name = line.split("=", maxsplit=1)[0].strip()
|
||||||
features = extract_features(line)
|
|
||||||
base_version = version.split("-")[0] # Get the base version without beta suffix
|
base_version = version.split("-")[0] # Get the base version without beta suffix
|
||||||
|
|
||||||
|
# Build config in desired order: version, default-features, features, tag, git
|
||||||
|
config = {"version": f"={base_version}"}
|
||||||
|
|
||||||
|
if extract_default_features(line):
|
||||||
|
config["default-features"] = False
|
||||||
|
|
||||||
|
features = extract_features(line)
|
||||||
if features:
|
if features:
|
||||||
return f'{package_name} = {{ "version" = "={base_version}", "features" = {json.dumps(features)}, "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
|
config["features"] = features
|
||||||
else:
|
|
||||||
return f'{package_name} = {{ "version" = "={base_version}", "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
|
config["tag"] = f"v{version}"
|
||||||
|
config["git"] = "https://github.com/lancedb/lance.git"
|
||||||
|
|
||||||
|
return dict_to_toml_line(package_name, config)
|
||||||
|
|
||||||
update_cargo_toml(line_updater)
|
update_cargo_toml(line_updater)
|
||||||
|
|
||||||
@@ -131,18 +201,25 @@ def set_preview_version(version: str):
|
|||||||
def set_local_version():
|
def set_local_version():
|
||||||
"""
|
"""
|
||||||
Sets lines to
|
Sets lines to
|
||||||
lance = { path = "../lance/rust/lance", features = ["dynamodb"] }
|
lance = { "path" = "../lance/rust/lance", default-features = false, "features" = ["dynamodb"] }
|
||||||
lance-io = { path = "../lance/rust/lance-io" }
|
lance-io = { "path" = "../lance/rust/lance-io", default-features = false }
|
||||||
...
|
...
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def line_updater(line: str) -> str:
|
def line_updater(line: str) -> str:
|
||||||
package_name = line.split("=", maxsplit=1)[0].strip()
|
package_name = line.split("=", maxsplit=1)[0].strip()
|
||||||
|
|
||||||
|
# Build config in desired order: path, default-features, features
|
||||||
|
config = {"path": f"../lance/rust/{package_name}"}
|
||||||
|
|
||||||
|
if extract_default_features(line):
|
||||||
|
config["default-features"] = False
|
||||||
|
|
||||||
features = extract_features(line)
|
features = extract_features(line)
|
||||||
if features:
|
if features:
|
||||||
return f'{package_name} = {{ "path" = "../lance/rust/{package_name}", "features" = {json.dumps(features)} }}\n'
|
config["features"] = features
|
||||||
else:
|
|
||||||
return f'{package_name} = {{ "path" = "../lance/rust/{package_name}" }}\n'
|
return dict_to_toml_line(package_name, config)
|
||||||
|
|
||||||
update_cargo_toml(line_updater)
|
update_cargo_toml(line_updater)
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,18 @@ will be used to determine the most useful kind of index to create.
|
|||||||
|
|
||||||
***
|
***
|
||||||
|
|
||||||
|
### name?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional name: string;
|
||||||
|
```
|
||||||
|
|
||||||
|
Optional custom name for the index.
|
||||||
|
|
||||||
|
If not provided, a default name will be generated based on the column name.
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
### replace?
|
### replace?
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
@@ -42,8 +54,27 @@ The default is true
|
|||||||
|
|
||||||
***
|
***
|
||||||
|
|
||||||
|
### train?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional train: boolean;
|
||||||
|
```
|
||||||
|
|
||||||
|
Whether to train the index with existing data.
|
||||||
|
|
||||||
|
If true (default), the index will be trained with existing data in the table.
|
||||||
|
If false, the index will be created empty and populated as new data is added.
|
||||||
|
|
||||||
|
Note: This option is only supported for scalar indices. Vector indices always train.
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
### waitTimeoutSeconds?
|
### waitTimeoutSeconds?
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
optional waitTimeoutSeconds: number;
|
optional waitTimeoutSeconds: number;
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Timeout in seconds to wait for index creation to complete.
|
||||||
|
|
||||||
|
If not specified, the method will return immediately after starting the index creation.
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ publish = false
|
|||||||
crate-type = ["cdylib"]
|
crate-type = ["cdylib"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
lancedb = { path = "../../../rust/lancedb" }
|
lancedb = { path = "../../../rust/lancedb", default-features = false }
|
||||||
lance = { workspace = true }
|
lance = { workspace = true }
|
||||||
arrow = { workspace = true, features = ["ffi"] }
|
arrow = { workspace = true, features = ["ffi"] }
|
||||||
arrow-schema.workspace = true
|
arrow-schema.workspace = true
|
||||||
@@ -25,3 +25,6 @@ snafu.workspace = true
|
|||||||
lazy_static.workspace = true
|
lazy_static.workspace = true
|
||||||
serde = { version = "^1" }
|
serde = { version = "^1" }
|
||||||
serde_json = { version = "1" }
|
serde_json = { version = "1" }
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["lancedb/default"]
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.21.2-final.0</version>
|
<version>0.21.4-beta.0</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.21.2-final.0</version>
|
<version>0.21.4-beta.0</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.21.2-final.0</version>
|
<version>0.21.4-beta.0</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<name>${project.artifactId}</name>
|
<name>${project.artifactId}</name>
|
||||||
<description>LanceDB Java SDK Parent POM</description>
|
<description>LanceDB Java SDK Parent POM</description>
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.21.2"
|
version = "0.21.4-beta.0"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
@@ -18,7 +18,7 @@ arrow-array.workspace = true
|
|||||||
arrow-schema.workspace = true
|
arrow-schema.workspace = true
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
lancedb = { path = "../rust/lancedb" }
|
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||||
napi = { version = "2.16.8", default-features = false, features = [
|
napi = { version = "2.16.8", default-features = false, features = [
|
||||||
"napi9",
|
"napi9",
|
||||||
"async"
|
"async"
|
||||||
@@ -36,6 +36,6 @@ aws-lc-rs = "=1.13.0"
|
|||||||
napi-build = "2.1"
|
napi-build = "2.1"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["remote"]
|
default = ["remote", "lancedb/default"]
|
||||||
fp16kernels = ["lancedb/fp16kernels"]
|
fp16kernels = ["lancedb/fp16kernels"]
|
||||||
remote = ["lancedb/remote"]
|
remote = ["lancedb/remote"]
|
||||||
|
|||||||
@@ -857,6 +857,40 @@ describe("When creating an index", () => {
|
|||||||
expect(stats).toBeUndefined();
|
expect(stats).toBeUndefined();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("should support name and train parameters", async () => {
|
||||||
|
// Test with custom name
|
||||||
|
await tbl.createIndex("vec", {
|
||||||
|
config: Index.ivfPq({ numPartitions: 4 }),
|
||||||
|
name: "my_custom_vector_index",
|
||||||
|
});
|
||||||
|
|
||||||
|
const indices = await tbl.listIndices();
|
||||||
|
expect(indices).toHaveLength(1);
|
||||||
|
expect(indices[0].name).toBe("my_custom_vector_index");
|
||||||
|
|
||||||
|
// Test scalar index with train=false
|
||||||
|
await tbl.createIndex("id", {
|
||||||
|
config: Index.btree(),
|
||||||
|
name: "btree_empty",
|
||||||
|
train: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
const allIndices = await tbl.listIndices();
|
||||||
|
expect(allIndices).toHaveLength(2);
|
||||||
|
expect(allIndices.some((idx) => idx.name === "btree_empty")).toBe(true);
|
||||||
|
|
||||||
|
// Test with both name and train=true (use tags column)
|
||||||
|
await tbl.createIndex("tags", {
|
||||||
|
config: Index.labelList(),
|
||||||
|
name: "tags_trained",
|
||||||
|
train: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
const finalIndices = await tbl.listIndices();
|
||||||
|
expect(finalIndices).toHaveLength(3);
|
||||||
|
expect(finalIndices.some((idx) => idx.name === "tags_trained")).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
test("create ivf_flat with binary vectors", async () => {
|
test("create ivf_flat with binary vectors", async () => {
|
||||||
const db = await connect(tmpDir.name);
|
const db = await connect(tmpDir.name);
|
||||||
const binarySchema = new Schema([
|
const binarySchema = new Schema([
|
||||||
|
|||||||
@@ -700,5 +700,27 @@ export interface IndexOptions {
|
|||||||
*/
|
*/
|
||||||
replace?: boolean;
|
replace?: boolean;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Timeout in seconds to wait for index creation to complete.
|
||||||
|
*
|
||||||
|
* If not specified, the method will return immediately after starting the index creation.
|
||||||
|
*/
|
||||||
waitTimeoutSeconds?: number;
|
waitTimeoutSeconds?: number;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optional custom name for the index.
|
||||||
|
*
|
||||||
|
* If not provided, a default name will be generated based on the column name.
|
||||||
|
*/
|
||||||
|
name?: string;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether to train the index with existing data.
|
||||||
|
*
|
||||||
|
* If true (default), the index will be trained with existing data in the table.
|
||||||
|
* If false, the index will be created empty and populated as new data is added.
|
||||||
|
*
|
||||||
|
* Note: This option is only supported for scalar indices. Vector indices always train.
|
||||||
|
*/
|
||||||
|
train?: boolean;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -662,6 +662,8 @@ export class LocalTable extends Table {
|
|||||||
column,
|
column,
|
||||||
options?.replace,
|
options?.replace,
|
||||||
options?.waitTimeoutSeconds,
|
options?.waitTimeoutSeconds,
|
||||||
|
options?.name,
|
||||||
|
options?.train,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-musl.node",
|
"main": "lancedb.linux-arm64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-musl.node",
|
"main": "lancedb.linux-x64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"private": false,
|
"private": false,
|
||||||
"version": "0.21.2",
|
"version": "0.21.4-beta.0",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
|
|||||||
@@ -114,6 +114,8 @@ impl Table {
|
|||||||
column: String,
|
column: String,
|
||||||
replace: Option<bool>,
|
replace: Option<bool>,
|
||||||
wait_timeout_s: Option<i64>,
|
wait_timeout_s: Option<i64>,
|
||||||
|
name: Option<String>,
|
||||||
|
train: Option<bool>,
|
||||||
) -> napi::Result<()> {
|
) -> napi::Result<()> {
|
||||||
let lancedb_index = if let Some(index) = index {
|
let lancedb_index = if let Some(index) = index {
|
||||||
index.consume()?
|
index.consume()?
|
||||||
@@ -128,6 +130,12 @@ impl Table {
|
|||||||
builder =
|
builder =
|
||||||
builder.wait_timeout(std::time::Duration::from_secs(timeout.try_into().unwrap()));
|
builder.wait_timeout(std::time::Duration::from_secs(timeout.try_into().unwrap()));
|
||||||
}
|
}
|
||||||
|
if let Some(name) = name {
|
||||||
|
builder = builder.name(name);
|
||||||
|
}
|
||||||
|
if let Some(train) = train {
|
||||||
|
builder = builder.train(train);
|
||||||
|
}
|
||||||
builder.execute().await.default_error()
|
builder.execute().await.default_error()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.24.3"
|
current_version = "0.24.4-beta.1"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.24.3"
|
version = "0.24.4-beta.1"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -33,6 +33,6 @@ pyo3-build-config = { version = "0.24", features = [
|
|||||||
] }
|
] }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["remote"]
|
default = ["remote", "lancedb/default"]
|
||||||
fp16kernels = ["lancedb/fp16kernels"]
|
fp16kernels = ["lancedb/fp16kernels"]
|
||||||
remote = ["lancedb/remote"]
|
remote = ["lancedb/remote"]
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ dependencies = [
|
|||||||
"pyarrow>=16",
|
"pyarrow>=16",
|
||||||
"pydantic>=1.10",
|
"pydantic>=1.10",
|
||||||
"tqdm>=4.27.0",
|
"tqdm>=4.27.0",
|
||||||
|
"lance-namespace==0.0.6"
|
||||||
]
|
]
|
||||||
description = "lancedb"
|
description = "lancedb"
|
||||||
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ from .remote.db import RemoteDBConnection
|
|||||||
from .schema import vector
|
from .schema import vector
|
||||||
from .table import AsyncTable
|
from .table import AsyncTable
|
||||||
from ._lancedb import Session
|
from ._lancedb import Session
|
||||||
|
from .namespace import connect_namespace, LanceNamespaceDBConnection
|
||||||
|
|
||||||
|
|
||||||
def connect(
|
def connect(
|
||||||
@@ -221,6 +222,7 @@ async def connect_async(
|
|||||||
__all__ = [
|
__all__ = [
|
||||||
"connect",
|
"connect",
|
||||||
"connect_async",
|
"connect_async",
|
||||||
|
"connect_namespace",
|
||||||
"AsyncConnection",
|
"AsyncConnection",
|
||||||
"AsyncTable",
|
"AsyncTable",
|
||||||
"URI",
|
"URI",
|
||||||
@@ -228,6 +230,7 @@ __all__ = [
|
|||||||
"vector",
|
"vector",
|
||||||
"DBConnection",
|
"DBConnection",
|
||||||
"LanceDBConnection",
|
"LanceDBConnection",
|
||||||
|
"LanceNamespaceDBConnection",
|
||||||
"RemoteDBConnection",
|
"RemoteDBConnection",
|
||||||
"Session",
|
"Session",
|
||||||
"__version__",
|
"__version__",
|
||||||
|
|||||||
@@ -59,6 +59,10 @@ class Table:
|
|||||||
column: str,
|
column: str,
|
||||||
index: Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS],
|
index: Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS],
|
||||||
replace: Optional[bool],
|
replace: Optional[bool],
|
||||||
|
wait_timeout: Optional[object],
|
||||||
|
*,
|
||||||
|
name: Optional[str],
|
||||||
|
train: Optional[bool],
|
||||||
): ...
|
): ...
|
||||||
async def list_versions(self) -> List[Dict[str, Any]]: ...
|
async def list_versions(self) -> List[Dict[str, Any]]: ...
|
||||||
async def version(self) -> int: ...
|
async def version(self) -> int: ...
|
||||||
|
|||||||
325
python/python/lancedb/namespace.py
Normal file
325
python/python/lancedb/namespace.py
Normal file
@@ -0,0 +1,325 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
"""
|
||||||
|
LanceDB Namespace integration module.
|
||||||
|
|
||||||
|
This module provides integration with lance_namespace for managing tables
|
||||||
|
through a namespace abstraction.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Dict, Iterable, List, Optional, Union
|
||||||
|
import os
|
||||||
|
|
||||||
|
from lancedb.db import DBConnection
|
||||||
|
from lancedb.table import LanceTable, Table
|
||||||
|
from lancedb.util import validate_table_name
|
||||||
|
from lancedb.common import validate_schema
|
||||||
|
from lancedb.table import sanitize_create_table
|
||||||
|
from overrides import override
|
||||||
|
|
||||||
|
from lance_namespace import LanceNamespace, connect as namespace_connect
|
||||||
|
from lance_namespace_urllib3_client.models import (
|
||||||
|
ListTablesRequest,
|
||||||
|
DescribeTableRequest,
|
||||||
|
CreateTableRequest,
|
||||||
|
DropTableRequest,
|
||||||
|
JsonArrowSchema,
|
||||||
|
JsonArrowField,
|
||||||
|
JsonArrowDataType,
|
||||||
|
)
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
from datetime import timedelta
|
||||||
|
from lancedb.pydantic import LanceModel
|
||||||
|
from lancedb.common import DATA
|
||||||
|
from lancedb.embeddings import EmbeddingFunctionConfig
|
||||||
|
from ._lancedb import Session
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_pyarrow_type_to_json(arrow_type: pa.DataType) -> JsonArrowDataType:
|
||||||
|
"""Convert PyArrow DataType to JsonArrowDataType."""
|
||||||
|
if pa.types.is_null(arrow_type):
|
||||||
|
type_name = "null"
|
||||||
|
elif pa.types.is_boolean(arrow_type):
|
||||||
|
type_name = "bool"
|
||||||
|
elif pa.types.is_int8(arrow_type):
|
||||||
|
type_name = "int8"
|
||||||
|
elif pa.types.is_uint8(arrow_type):
|
||||||
|
type_name = "uint8"
|
||||||
|
elif pa.types.is_int16(arrow_type):
|
||||||
|
type_name = "int16"
|
||||||
|
elif pa.types.is_uint16(arrow_type):
|
||||||
|
type_name = "uint16"
|
||||||
|
elif pa.types.is_int32(arrow_type):
|
||||||
|
type_name = "int32"
|
||||||
|
elif pa.types.is_uint32(arrow_type):
|
||||||
|
type_name = "uint32"
|
||||||
|
elif pa.types.is_int64(arrow_type):
|
||||||
|
type_name = "int64"
|
||||||
|
elif pa.types.is_uint64(arrow_type):
|
||||||
|
type_name = "uint64"
|
||||||
|
elif pa.types.is_float32(arrow_type):
|
||||||
|
type_name = "float32"
|
||||||
|
elif pa.types.is_float64(arrow_type):
|
||||||
|
type_name = "float64"
|
||||||
|
elif pa.types.is_string(arrow_type):
|
||||||
|
type_name = "utf8"
|
||||||
|
elif pa.types.is_binary(arrow_type):
|
||||||
|
type_name = "binary"
|
||||||
|
elif pa.types.is_list(arrow_type):
|
||||||
|
# For list types, we need more complex handling
|
||||||
|
type_name = "list"
|
||||||
|
elif pa.types.is_fixed_size_list(arrow_type):
|
||||||
|
type_name = "fixed_size_list"
|
||||||
|
else:
|
||||||
|
# Default to string representation for unsupported types
|
||||||
|
type_name = str(arrow_type)
|
||||||
|
|
||||||
|
return JsonArrowDataType(type=type_name)
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_pyarrow_schema_to_json(schema: pa.Schema) -> JsonArrowSchema:
|
||||||
|
"""Convert PyArrow Schema to JsonArrowSchema."""
|
||||||
|
fields = []
|
||||||
|
for field in schema:
|
||||||
|
json_field = JsonArrowField(
|
||||||
|
name=field.name,
|
||||||
|
type=_convert_pyarrow_type_to_json(field.type),
|
||||||
|
nullable=field.nullable,
|
||||||
|
metadata=field.metadata,
|
||||||
|
)
|
||||||
|
fields.append(json_field)
|
||||||
|
|
||||||
|
return JsonArrowSchema(fields=fields, metadata=schema.metadata)
|
||||||
|
|
||||||
|
|
||||||
|
class LanceNamespaceDBConnection(DBConnection):
|
||||||
|
"""
|
||||||
|
A LanceDB connection that uses a namespace for table management.
|
||||||
|
|
||||||
|
This connection delegates table URI resolution to a lance_namespace instance,
|
||||||
|
while using the standard LanceTable for actual table operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
namespace: LanceNamespace,
|
||||||
|
*,
|
||||||
|
read_consistency_interval: Optional[timedelta] = None,
|
||||||
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
|
session: Optional[Session] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize a namespace-based LanceDB connection.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
namespace : LanceNamespace
|
||||||
|
The namespace instance to use for table management
|
||||||
|
read_consistency_interval : Optional[timedelta]
|
||||||
|
The interval at which to check for updates to the table from other
|
||||||
|
processes. If None, then consistency is not checked.
|
||||||
|
storage_options : Optional[Dict[str, str]]
|
||||||
|
Additional options for the storage backend
|
||||||
|
session : Optional[Session]
|
||||||
|
A session to use for this connection
|
||||||
|
"""
|
||||||
|
self._ns = namespace
|
||||||
|
self.read_consistency_interval = read_consistency_interval
|
||||||
|
self.storage_options = storage_options or {}
|
||||||
|
self.session = session
|
||||||
|
|
||||||
|
@override
|
||||||
|
def table_names(
|
||||||
|
self, page_token: Optional[str] = None, limit: int = 10
|
||||||
|
) -> Iterable[str]:
|
||||||
|
# Use namespace to list tables
|
||||||
|
request = ListTablesRequest(id=None, page_token=page_token, limit=limit)
|
||||||
|
response = self._ns.list_tables(request)
|
||||||
|
return response.tables if response.tables else []
|
||||||
|
|
||||||
|
@override
|
||||||
|
def create_table(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
data: Optional[DATA] = None,
|
||||||
|
schema: Optional[Union[pa.Schema, LanceModel]] = None,
|
||||||
|
mode: str = "create",
|
||||||
|
exist_ok: bool = False,
|
||||||
|
on_bad_vectors: str = "error",
|
||||||
|
fill_value: float = 0.0,
|
||||||
|
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||||
|
*,
|
||||||
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
|
data_storage_version: Optional[str] = None,
|
||||||
|
enable_v2_manifest_paths: Optional[bool] = None,
|
||||||
|
) -> Table:
|
||||||
|
if mode.lower() not in ["create", "overwrite"]:
|
||||||
|
raise ValueError("mode must be either 'create' or 'overwrite'")
|
||||||
|
validate_table_name(name)
|
||||||
|
|
||||||
|
# TODO: support passing data
|
||||||
|
if data is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"create_table currently only supports creating empty tables (data=None)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare schema
|
||||||
|
metadata = None
|
||||||
|
if embedding_functions is not None:
|
||||||
|
from lancedb.embeddings.registry import EmbeddingFunctionRegistry
|
||||||
|
|
||||||
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
metadata = registry.get_table_metadata(embedding_functions)
|
||||||
|
|
||||||
|
data, schema = sanitize_create_table(
|
||||||
|
data, schema, metadata, on_bad_vectors, fill_value
|
||||||
|
)
|
||||||
|
validate_schema(schema)
|
||||||
|
|
||||||
|
# Convert PyArrow schema to JsonArrowSchema
|
||||||
|
json_schema = _convert_pyarrow_schema_to_json(schema)
|
||||||
|
|
||||||
|
# Create table request
|
||||||
|
request = CreateTableRequest(id=[name], var_schema=json_schema)
|
||||||
|
|
||||||
|
# Create empty Arrow IPC stream bytes
|
||||||
|
import pyarrow.ipc as ipc
|
||||||
|
import io
|
||||||
|
|
||||||
|
empty_table = pa.Table.from_arrays(
|
||||||
|
[pa.array([], type=field.type) for field in schema], schema=schema
|
||||||
|
)
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
with ipc.new_stream(buffer, schema) as writer:
|
||||||
|
writer.write_table(empty_table)
|
||||||
|
request_data = buffer.getvalue()
|
||||||
|
|
||||||
|
self._ns.create_table(request, request_data)
|
||||||
|
return self.open_table(name, storage_options=storage_options)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def open_table(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
*,
|
||||||
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
|
index_cache_size: Optional[int] = None,
|
||||||
|
) -> Table:
|
||||||
|
request = DescribeTableRequest(id=[name])
|
||||||
|
response = self._ns.describe_table(request)
|
||||||
|
|
||||||
|
merged_storage_options = dict()
|
||||||
|
if storage_options:
|
||||||
|
merged_storage_options.update(storage_options)
|
||||||
|
if response.storage_options:
|
||||||
|
merged_storage_options.update(response.storage_options)
|
||||||
|
|
||||||
|
return self._lance_table_from_uri(
|
||||||
|
response.location,
|
||||||
|
storage_options=merged_storage_options,
|
||||||
|
index_cache_size=index_cache_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def drop_table(self, name: str):
|
||||||
|
# Use namespace drop_table directly
|
||||||
|
request = DropTableRequest(id=[name])
|
||||||
|
self._ns.drop_table(request)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def rename_table(self, cur_name: str, new_name: str):
|
||||||
|
raise NotImplementedError(
|
||||||
|
"rename_table is not supported for namespace connections"
|
||||||
|
)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def drop_database(self):
|
||||||
|
raise NotImplementedError(
|
||||||
|
"drop_database is deprecated, use drop_all_tables instead"
|
||||||
|
)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def drop_all_tables(self):
|
||||||
|
for table_name in self.table_names():
|
||||||
|
self.drop_table(table_name)
|
||||||
|
|
||||||
|
def _lance_table_from_uri(
|
||||||
|
self,
|
||||||
|
table_uri: str,
|
||||||
|
*,
|
||||||
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
|
index_cache_size: Optional[int] = None,
|
||||||
|
) -> LanceTable:
|
||||||
|
# Extract the base path and table name from the URI
|
||||||
|
if table_uri.endswith(".lance"):
|
||||||
|
base_path = os.path.dirname(table_uri)
|
||||||
|
table_name = os.path.basename(table_uri)[:-6] # Remove .lance
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid table URI: {table_uri}")
|
||||||
|
|
||||||
|
from lancedb.db import LanceDBConnection
|
||||||
|
|
||||||
|
temp_conn = LanceDBConnection(
|
||||||
|
base_path,
|
||||||
|
read_consistency_interval=self.read_consistency_interval,
|
||||||
|
storage_options={**self.storage_options, **(storage_options or {})},
|
||||||
|
session=self.session,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Open the table using the temporary connection
|
||||||
|
return LanceTable.open(
|
||||||
|
temp_conn,
|
||||||
|
table_name,
|
||||||
|
storage_options=storage_options,
|
||||||
|
index_cache_size=index_cache_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def connect_namespace(
|
||||||
|
impl: str,
|
||||||
|
properties: Dict[str, str],
|
||||||
|
*,
|
||||||
|
read_consistency_interval: Optional[timedelta] = None,
|
||||||
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
|
session: Optional[Session] = None,
|
||||||
|
) -> LanceNamespaceDBConnection:
|
||||||
|
"""
|
||||||
|
Connect to a LanceDB database through a namespace.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
impl : str
|
||||||
|
The namespace implementation to use. For examples:
|
||||||
|
- "dir" for DirectoryNamespace
|
||||||
|
- "rest" for REST-based namespace
|
||||||
|
- Full module path for custom implementations
|
||||||
|
properties : Dict[str, str]
|
||||||
|
Configuration properties for the namespace implementation.
|
||||||
|
Different namespace implemenation has different config properties.
|
||||||
|
For example, use DirectoryNamespace with {"root": "/path/to/directory"}
|
||||||
|
read_consistency_interval : Optional[timedelta]
|
||||||
|
The interval at which to check for updates to the table from other
|
||||||
|
processes. If None, then consistency is not checked.
|
||||||
|
storage_options : Optional[Dict[str, str]]
|
||||||
|
Additional options for the storage backend
|
||||||
|
session : Optional[Session]
|
||||||
|
A session to use for this connection
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
LanceNamespaceDBConnection
|
||||||
|
A namespace-based connection to LanceDB
|
||||||
|
"""
|
||||||
|
namespace = namespace_connect(impl, properties)
|
||||||
|
|
||||||
|
# Return the namespace-based connection
|
||||||
|
return LanceNamespaceDBConnection(
|
||||||
|
namespace,
|
||||||
|
read_consistency_interval=read_consistency_interval,
|
||||||
|
storage_options=storage_options,
|
||||||
|
session=session,
|
||||||
|
)
|
||||||
@@ -194,6 +194,8 @@ class RemoteTable(Table):
|
|||||||
wait_timeout: Optional[timedelta] = None,
|
wait_timeout: Optional[timedelta] = None,
|
||||||
*,
|
*,
|
||||||
num_bits: int = 8,
|
num_bits: int = 8,
|
||||||
|
name: Optional[str] = None,
|
||||||
|
train: bool = True,
|
||||||
):
|
):
|
||||||
"""Create an index on the table.
|
"""Create an index on the table.
|
||||||
Currently, the only parameters that matter are
|
Currently, the only parameters that matter are
|
||||||
@@ -270,7 +272,11 @@ class RemoteTable(Table):
|
|||||||
|
|
||||||
LOOP.run(
|
LOOP.run(
|
||||||
self._table.create_index(
|
self._table.create_index(
|
||||||
vector_column_name, config=config, wait_timeout=wait_timeout
|
vector_column_name,
|
||||||
|
config=config,
|
||||||
|
wait_timeout=wait_timeout,
|
||||||
|
name=name,
|
||||||
|
train=train,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -689,6 +689,8 @@ class Table(ABC):
|
|||||||
sample_rate: int = 256,
|
sample_rate: int = 256,
|
||||||
m: int = 20,
|
m: int = 20,
|
||||||
ef_construction: int = 300,
|
ef_construction: int = 300,
|
||||||
|
name: Optional[str] = None,
|
||||||
|
train: bool = True,
|
||||||
):
|
):
|
||||||
"""Create an index on the table.
|
"""Create an index on the table.
|
||||||
|
|
||||||
@@ -721,6 +723,11 @@ class Table(ABC):
|
|||||||
Only 4 and 8 are supported.
|
Only 4 and 8 are supported.
|
||||||
wait_timeout: timedelta, optional
|
wait_timeout: timedelta, optional
|
||||||
The timeout to wait if indexing is asynchronous.
|
The timeout to wait if indexing is asynchronous.
|
||||||
|
name: str, optional
|
||||||
|
The name of the index. If not provided, a default name will be generated.
|
||||||
|
train: bool, default True
|
||||||
|
Whether to train the index with existing data. Vector indices always train
|
||||||
|
with existing data.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@@ -1929,6 +1936,9 @@ class LanceTable(Table):
|
|||||||
sample_rate: int = 256,
|
sample_rate: int = 256,
|
||||||
m: int = 20,
|
m: int = 20,
|
||||||
ef_construction: int = 300,
|
ef_construction: int = 300,
|
||||||
|
*,
|
||||||
|
name: Optional[str] = None,
|
||||||
|
train: bool = True,
|
||||||
):
|
):
|
||||||
"""Create an index on the table."""
|
"""Create an index on the table."""
|
||||||
if accelerator is not None:
|
if accelerator is not None:
|
||||||
@@ -1992,6 +2002,8 @@ class LanceTable(Table):
|
|||||||
vector_column_name,
|
vector_column_name,
|
||||||
replace=replace,
|
replace=replace,
|
||||||
config=config,
|
config=config,
|
||||||
|
name=name,
|
||||||
|
train=train,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -3251,6 +3263,8 @@ class AsyncTable:
|
|||||||
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||||
] = None,
|
] = None,
|
||||||
wait_timeout: Optional[timedelta] = None,
|
wait_timeout: Optional[timedelta] = None,
|
||||||
|
name: Optional[str] = None,
|
||||||
|
train: bool = True,
|
||||||
):
|
):
|
||||||
"""Create an index to speed up queries
|
"""Create an index to speed up queries
|
||||||
|
|
||||||
@@ -3277,6 +3291,11 @@ class AsyncTable:
|
|||||||
creating an index object.
|
creating an index object.
|
||||||
wait_timeout: timedelta, optional
|
wait_timeout: timedelta, optional
|
||||||
The timeout to wait if indexing is asynchronous.
|
The timeout to wait if indexing is asynchronous.
|
||||||
|
name: str, optional
|
||||||
|
The name of the index. If not provided, a default name will be generated.
|
||||||
|
train: bool, default True
|
||||||
|
Whether to train the index with existing data. Vector indices always train
|
||||||
|
with existing data.
|
||||||
"""
|
"""
|
||||||
if config is not None:
|
if config is not None:
|
||||||
if not isinstance(
|
if not isinstance(
|
||||||
@@ -3288,7 +3307,12 @@ class AsyncTable:
|
|||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
await self._inner.create_index(
|
await self._inner.create_index(
|
||||||
column, index=config, replace=replace, wait_timeout=wait_timeout
|
column,
|
||||||
|
index=config,
|
||||||
|
replace=replace,
|
||||||
|
wait_timeout=wait_timeout,
|
||||||
|
name=name,
|
||||||
|
train=train,
|
||||||
)
|
)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
if "not support the requested language" in str(e):
|
if "not support the requested language" in str(e):
|
||||||
|
|||||||
414
python/python/tests/test_namespace.py
Normal file
414
python/python/tests/test_namespace.py
Normal file
@@ -0,0 +1,414 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
"""Tests for LanceDB namespace integration."""
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
from typing import Dict, Optional
|
||||||
|
import pytest
|
||||||
|
import pyarrow as pa
|
||||||
|
import lancedb
|
||||||
|
from lance_namespace.namespace import NATIVE_IMPLS, LanceNamespace
|
||||||
|
from lance_namespace_urllib3_client.models import (
|
||||||
|
ListTablesRequest,
|
||||||
|
ListTablesResponse,
|
||||||
|
DescribeTableRequest,
|
||||||
|
DescribeTableResponse,
|
||||||
|
RegisterTableRequest,
|
||||||
|
RegisterTableResponse,
|
||||||
|
DeregisterTableRequest,
|
||||||
|
DeregisterTableResponse,
|
||||||
|
CreateTableRequest,
|
||||||
|
CreateTableResponse,
|
||||||
|
DropTableRequest,
|
||||||
|
DropTableResponse,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TempNamespace(LanceNamespace):
|
||||||
|
"""A simple dictionary-backed namespace for testing."""
|
||||||
|
|
||||||
|
# Class-level storage to persist table registry across instances
|
||||||
|
_global_registry: Dict[str, Dict[str, str]] = {}
|
||||||
|
|
||||||
|
def __init__(self, **properties):
|
||||||
|
"""Initialize the test namespace.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
root: The root directory for tables (optional)
|
||||||
|
**properties: Additional configuration properties
|
||||||
|
"""
|
||||||
|
self.config = TempNamespaceConfig(properties)
|
||||||
|
# Use the root as a key to maintain separate registries per root
|
||||||
|
root = self.config.root
|
||||||
|
if root not in self._global_registry:
|
||||||
|
self._global_registry[root] = {}
|
||||||
|
self.tables = self._global_registry[root] # Reference to shared registry
|
||||||
|
|
||||||
|
def list_tables(self, request: ListTablesRequest) -> ListTablesResponse:
|
||||||
|
"""List all tables in the namespace."""
|
||||||
|
# For simplicity, ignore namespace ID validation
|
||||||
|
tables = list(self.tables.keys())
|
||||||
|
return ListTablesResponse(tables=tables)
|
||||||
|
|
||||||
|
def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
|
||||||
|
"""Describe a table by returning its location."""
|
||||||
|
if not request.id or len(request.id) != 1:
|
||||||
|
raise ValueError("Invalid table ID")
|
||||||
|
|
||||||
|
table_name = request.id[0]
|
||||||
|
if table_name not in self.tables:
|
||||||
|
raise RuntimeError(f"Table does not exist: {table_name}")
|
||||||
|
|
||||||
|
table_uri = self.tables[table_name]
|
||||||
|
return DescribeTableResponse(location=table_uri)
|
||||||
|
|
||||||
|
def create_table(
|
||||||
|
self, request: CreateTableRequest, request_data: bytes
|
||||||
|
) -> CreateTableResponse:
|
||||||
|
"""Create a table in the namespace."""
|
||||||
|
if not request.id or len(request.id) != 1:
|
||||||
|
raise ValueError("Invalid table ID")
|
||||||
|
|
||||||
|
table_name = request.id[0]
|
||||||
|
|
||||||
|
# Check if table already exists
|
||||||
|
if table_name in self.tables:
|
||||||
|
if request.mode == "overwrite":
|
||||||
|
# Drop existing table for overwrite mode
|
||||||
|
del self.tables[table_name]
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Table already exists: {table_name}")
|
||||||
|
|
||||||
|
# Generate table URI based on root directory
|
||||||
|
table_uri = f"{self.config.root}/{table_name}.lance"
|
||||||
|
|
||||||
|
# Parse the Arrow IPC stream to get the schema and create the actual table
|
||||||
|
import pyarrow.ipc as ipc
|
||||||
|
import io
|
||||||
|
import lance
|
||||||
|
|
||||||
|
# Read the IPC stream
|
||||||
|
reader = ipc.open_stream(io.BytesIO(request_data))
|
||||||
|
table = reader.read_all()
|
||||||
|
|
||||||
|
# Create the actual Lance table
|
||||||
|
lance.write_dataset(table, table_uri)
|
||||||
|
|
||||||
|
# Store the table mapping
|
||||||
|
self.tables[table_name] = table_uri
|
||||||
|
|
||||||
|
return CreateTableResponse(location=table_uri)
|
||||||
|
|
||||||
|
def drop_table(self, request: DropTableRequest) -> DropTableResponse:
|
||||||
|
"""Drop a table from the namespace."""
|
||||||
|
if not request.id or len(request.id) != 1:
|
||||||
|
raise ValueError("Invalid table ID")
|
||||||
|
|
||||||
|
table_name = request.id[0]
|
||||||
|
if table_name not in self.tables:
|
||||||
|
raise RuntimeError(f"Table does not exist: {table_name}")
|
||||||
|
|
||||||
|
# Get the table URI
|
||||||
|
table_uri = self.tables[table_name]
|
||||||
|
|
||||||
|
# Delete the actual table files
|
||||||
|
import shutil
|
||||||
|
import os
|
||||||
|
|
||||||
|
if os.path.exists(table_uri):
|
||||||
|
shutil.rmtree(table_uri, ignore_errors=True)
|
||||||
|
|
||||||
|
# Remove from registry
|
||||||
|
del self.tables[table_name]
|
||||||
|
|
||||||
|
return DropTableResponse()
|
||||||
|
|
||||||
|
def register_table(self, request: RegisterTableRequest) -> RegisterTableResponse:
|
||||||
|
"""Register a table with the namespace."""
|
||||||
|
if not request.id or len(request.id) != 1:
|
||||||
|
raise ValueError("Invalid table ID")
|
||||||
|
|
||||||
|
if not request.location:
|
||||||
|
raise ValueError("Table location is required")
|
||||||
|
|
||||||
|
table_name = request.id[0]
|
||||||
|
self.tables[table_name] = request.location
|
||||||
|
|
||||||
|
return RegisterTableResponse()
|
||||||
|
|
||||||
|
def deregister_table(
|
||||||
|
self, request: DeregisterTableRequest
|
||||||
|
) -> DeregisterTableResponse:
|
||||||
|
"""Deregister a table from the namespace."""
|
||||||
|
if not request.id or len(request.id) != 1:
|
||||||
|
raise ValueError("Invalid table ID")
|
||||||
|
|
||||||
|
table_name = request.id[0]
|
||||||
|
if table_name not in self.tables:
|
||||||
|
raise RuntimeError(f"Table does not exist: {table_name}")
|
||||||
|
|
||||||
|
del self.tables[table_name]
|
||||||
|
return DeregisterTableResponse()
|
||||||
|
|
||||||
|
|
||||||
|
class TempNamespaceConfig:
|
||||||
|
"""Configuration for TestNamespace."""
|
||||||
|
|
||||||
|
ROOT = "root"
|
||||||
|
|
||||||
|
def __init__(self, properties: Optional[Dict[str, str]] = None):
|
||||||
|
"""Initialize configuration from properties.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
properties: Dictionary of configuration properties
|
||||||
|
"""
|
||||||
|
if properties is None:
|
||||||
|
properties = {}
|
||||||
|
|
||||||
|
self._root = properties.get(self.ROOT, "/tmp")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def root(self) -> str:
|
||||||
|
"""Get the namespace root directory."""
|
||||||
|
return self._root
|
||||||
|
|
||||||
|
|
||||||
|
NATIVE_IMPLS["temp"] = f"{TempNamespace.__module__}.TempNamespace"
|
||||||
|
|
||||||
|
|
||||||
|
class TestNamespaceConnection:
|
||||||
|
"""Test namespace-based LanceDB connection."""
|
||||||
|
|
||||||
|
def setup_method(self):
|
||||||
|
"""Set up test fixtures."""
|
||||||
|
self.temp_dir = tempfile.mkdtemp()
|
||||||
|
# Clear the TestNamespace registry for this test
|
||||||
|
if self.temp_dir in TempNamespace._global_registry:
|
||||||
|
TempNamespace._global_registry[self.temp_dir].clear()
|
||||||
|
|
||||||
|
def teardown_method(self):
|
||||||
|
"""Clean up test fixtures."""
|
||||||
|
# Clear the TestNamespace registry
|
||||||
|
if self.temp_dir in TempNamespace._global_registry:
|
||||||
|
del TempNamespace._global_registry[self.temp_dir]
|
||||||
|
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def test_connect_namespace_test(self):
|
||||||
|
"""Test connecting to LanceDB through TestNamespace."""
|
||||||
|
# Connect using TestNamespace
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Should be a LanceNamespaceDBConnection
|
||||||
|
assert isinstance(db, lancedb.LanceNamespaceDBConnection)
|
||||||
|
|
||||||
|
# Initially no tables
|
||||||
|
assert len(list(db.table_names())) == 0
|
||||||
|
|
||||||
|
def test_create_table_through_namespace(self):
|
||||||
|
"""Test creating a table through namespace."""
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Define schema for empty table
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
pa.field("text", pa.string()),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create empty table
|
||||||
|
table = db.create_table("test_table", schema=schema)
|
||||||
|
assert table is not None
|
||||||
|
assert table.name == "test_table"
|
||||||
|
|
||||||
|
# Table should appear in namespace
|
||||||
|
table_names = list(db.table_names())
|
||||||
|
assert "test_table" in table_names
|
||||||
|
assert len(table_names) == 1
|
||||||
|
|
||||||
|
# Verify empty table
|
||||||
|
result = table.to_pandas()
|
||||||
|
assert len(result) == 0
|
||||||
|
assert list(result.columns) == ["id", "vector", "text"]
|
||||||
|
|
||||||
|
def test_open_table_through_namespace(self):
|
||||||
|
"""Test opening an existing table through namespace."""
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Create a table with schema
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
db.create_table("test_table", schema=schema)
|
||||||
|
|
||||||
|
# Open the table
|
||||||
|
table = db.open_table("test_table")
|
||||||
|
assert table is not None
|
||||||
|
assert table.name == "test_table"
|
||||||
|
|
||||||
|
# Verify empty table with correct schema
|
||||||
|
result = table.to_pandas()
|
||||||
|
assert len(result) == 0
|
||||||
|
assert list(result.columns) == ["id", "vector"]
|
||||||
|
|
||||||
|
def test_drop_table_through_namespace(self):
|
||||||
|
"""Test dropping a table through namespace."""
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Create tables
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
db.create_table("table1", schema=schema)
|
||||||
|
db.create_table("table2", schema=schema)
|
||||||
|
|
||||||
|
# Verify both tables exist
|
||||||
|
table_names = list(db.table_names())
|
||||||
|
assert "table1" in table_names
|
||||||
|
assert "table2" in table_names
|
||||||
|
assert len(table_names) == 2
|
||||||
|
|
||||||
|
# Drop one table
|
||||||
|
db.drop_table("table1")
|
||||||
|
|
||||||
|
# Verify only table2 remains
|
||||||
|
table_names = list(db.table_names())
|
||||||
|
assert "table1" not in table_names
|
||||||
|
assert "table2" in table_names
|
||||||
|
assert len(table_names) == 1
|
||||||
|
|
||||||
|
# Should not be able to open dropped table
|
||||||
|
with pytest.raises(RuntimeError):
|
||||||
|
db.open_table("table1")
|
||||||
|
|
||||||
|
def test_create_table_with_schema(self):
|
||||||
|
"""Test creating a table with explicit schema through namespace."""
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Define schema
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 3)),
|
||||||
|
pa.field("text", pa.string()),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create table with schema
|
||||||
|
table = db.create_table("test_table", schema=schema)
|
||||||
|
assert table is not None
|
||||||
|
|
||||||
|
# Verify schema
|
||||||
|
table_schema = table.schema
|
||||||
|
assert len(table_schema) == 3
|
||||||
|
assert table_schema.field("id").type == pa.int64()
|
||||||
|
assert table_schema.field("text").type == pa.string()
|
||||||
|
|
||||||
|
def test_rename_table_not_supported(self):
|
||||||
|
"""Test that rename_table raises NotImplementedError."""
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Create a table
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
db.create_table("old_name", schema=schema)
|
||||||
|
|
||||||
|
# Rename should raise NotImplementedError
|
||||||
|
with pytest.raises(NotImplementedError, match="rename_table is not supported"):
|
||||||
|
db.rename_table("old_name", "new_name")
|
||||||
|
|
||||||
|
def test_drop_all_tables(self):
|
||||||
|
"""Test dropping all tables through namespace."""
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Create multiple tables
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
for i in range(3):
|
||||||
|
db.create_table(f"table{i}", schema=schema)
|
||||||
|
|
||||||
|
# Verify tables exist
|
||||||
|
assert len(list(db.table_names())) == 3
|
||||||
|
|
||||||
|
# Drop all tables
|
||||||
|
db.drop_all_tables()
|
||||||
|
|
||||||
|
# Verify all tables are gone
|
||||||
|
assert len(list(db.table_names())) == 0
|
||||||
|
|
||||||
|
def test_table_operations(self):
|
||||||
|
"""Test various table operations through namespace."""
|
||||||
|
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
|
||||||
|
|
||||||
|
# Create a table with schema
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
pa.field("text", pa.string()),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
table = db.create_table("test_table", schema=schema)
|
||||||
|
|
||||||
|
# Verify empty table was created
|
||||||
|
result = table.to_pandas()
|
||||||
|
assert len(result) == 0
|
||||||
|
assert list(result.columns) == ["id", "vector", "text"]
|
||||||
|
|
||||||
|
# Test add data to the table
|
||||||
|
new_data = [
|
||||||
|
{"id": 1, "vector": [1.0, 2.0], "text": "item_1"},
|
||||||
|
{"id": 2, "vector": [2.0, 3.0], "text": "item_2"},
|
||||||
|
]
|
||||||
|
table.add(new_data)
|
||||||
|
result = table.to_pandas()
|
||||||
|
assert len(result) == 2
|
||||||
|
|
||||||
|
# Test delete
|
||||||
|
table.delete("id = 1")
|
||||||
|
result = table.to_pandas()
|
||||||
|
assert len(result) == 1
|
||||||
|
assert result["id"].values[0] == 2
|
||||||
|
|
||||||
|
# Test update
|
||||||
|
table.update(where="id = 2", values={"text": "updated"})
|
||||||
|
result = table.to_pandas()
|
||||||
|
assert result["text"].values[0] == "updated"
|
||||||
|
|
||||||
|
def test_storage_options(self):
|
||||||
|
"""Test passing storage options through namespace connection."""
|
||||||
|
# Connect with storage options
|
||||||
|
storage_opts = {"test_option": "test_value"}
|
||||||
|
db = lancedb.connect_namespace(
|
||||||
|
"temp", {"root": self.temp_dir}, storage_options=storage_opts
|
||||||
|
)
|
||||||
|
|
||||||
|
# Storage options should be preserved
|
||||||
|
assert db.storage_options == storage_opts
|
||||||
|
|
||||||
|
# Create table with additional storage options
|
||||||
|
table_opts = {"table_option": "table_value"}
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
db.create_table("test_table", schema=schema, storage_options=table_opts)
|
||||||
@@ -670,7 +670,9 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
|||||||
num_sub_vectors=96,
|
num_sub_vectors=96,
|
||||||
num_bits=4,
|
num_bits=4,
|
||||||
)
|
)
|
||||||
mock_create_index.assert_called_with("vector", replace=True, config=expected_config)
|
mock_create_index.assert_called_with(
|
||||||
|
"vector", replace=True, config=expected_config, name=None, train=True
|
||||||
|
)
|
||||||
|
|
||||||
table.create_index(
|
table.create_index(
|
||||||
vector_column_name="my_vector",
|
vector_column_name="my_vector",
|
||||||
@@ -680,7 +682,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
|||||||
)
|
)
|
||||||
expected_config = HnswPq(distance_type="dot")
|
expected_config = HnswPq(distance_type="dot")
|
||||||
mock_create_index.assert_called_with(
|
mock_create_index.assert_called_with(
|
||||||
"my_vector", replace=False, config=expected_config
|
"my_vector", replace=False, config=expected_config, name=None, train=True
|
||||||
)
|
)
|
||||||
|
|
||||||
table.create_index(
|
table.create_index(
|
||||||
@@ -695,7 +697,44 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
|||||||
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
|
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
|
||||||
)
|
)
|
||||||
mock_create_index.assert_called_with(
|
mock_create_index.assert_called_with(
|
||||||
"my_vector", replace=True, config=expected_config
|
"my_vector", replace=True, config=expected_config, name=None, train=True
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@patch("lancedb.table.AsyncTable.create_index")
|
||||||
|
def test_create_index_name_and_train_parameters(
|
||||||
|
mock_create_index, mem_db: DBConnection
|
||||||
|
):
|
||||||
|
"""Test that name and train parameters are passed correctly to AsyncTable"""
|
||||||
|
table = mem_db.create_table(
|
||||||
|
"test",
|
||||||
|
data=[
|
||||||
|
{"vector": [3.1, 4.1], "id": 1},
|
||||||
|
{"vector": [5.9, 26.5], "id": 2},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test with custom name
|
||||||
|
table.create_index(vector_column_name="vector", name="my_custom_index")
|
||||||
|
expected_config = IvfPq() # Default config
|
||||||
|
mock_create_index.assert_called_with(
|
||||||
|
"vector",
|
||||||
|
replace=True,
|
||||||
|
config=expected_config,
|
||||||
|
name="my_custom_index",
|
||||||
|
train=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test with train=False
|
||||||
|
table.create_index(vector_column_name="vector", train=False)
|
||||||
|
mock_create_index.assert_called_with(
|
||||||
|
"vector", replace=True, config=expected_config, name=None, train=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test with both name and train
|
||||||
|
table.create_index(vector_column_name="vector", name="my_index_name", train=True)
|
||||||
|
mock_create_index.assert_called_with(
|
||||||
|
"vector", replace=True, config=expected_config, name="my_index_name", train=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -341,13 +341,15 @@ impl Table {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None))]
|
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None, *, name=None, train=None))]
|
||||||
pub fn create_index<'a>(
|
pub fn create_index<'a>(
|
||||||
self_: PyRef<'a, Self>,
|
self_: PyRef<'a, Self>,
|
||||||
column: String,
|
column: String,
|
||||||
index: Option<Bound<'_, PyAny>>,
|
index: Option<Bound<'_, PyAny>>,
|
||||||
replace: Option<bool>,
|
replace: Option<bool>,
|
||||||
wait_timeout: Option<Bound<'_, PyAny>>,
|
wait_timeout: Option<Bound<'_, PyAny>>,
|
||||||
|
name: Option<String>,
|
||||||
|
train: Option<bool>,
|
||||||
) -> PyResult<Bound<'a, PyAny>> {
|
) -> PyResult<Bound<'a, PyAny>> {
|
||||||
let index = extract_index_params(&index)?;
|
let index = extract_index_params(&index)?;
|
||||||
let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
|
let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
|
||||||
@@ -357,6 +359,12 @@ impl Table {
|
|||||||
if let Some(replace) = replace {
|
if let Some(replace) = replace {
|
||||||
op = op.replace(replace);
|
op = op.replace(replace);
|
||||||
}
|
}
|
||||||
|
if let Some(name) = name {
|
||||||
|
op = op.name(name);
|
||||||
|
}
|
||||||
|
if let Some(train) = train {
|
||||||
|
op = op.train(train);
|
||||||
|
}
|
||||||
|
|
||||||
future_into_py(self_.py(), async move {
|
future_into_py(self_.py(), async move {
|
||||||
op.execute().await.infer_error()?;
|
op.execute().await.infer_error()?;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.21.2"
|
version = "0.21.4-beta.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -97,7 +97,12 @@ rstest = "0.23.0"
|
|||||||
|
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = ["aws", "gcs", "azure", "dynamodb", "oss"]
|
||||||
|
aws = ["lance/aws", "lance-io/aws"]
|
||||||
|
oss = ["lance/oss", "lance-io/oss"]
|
||||||
|
gcs = ["lance/gcp", "lance-io/gcp"]
|
||||||
|
azure = ["lance/azure", "lance-io/azure"]
|
||||||
|
dynamodb = ["lance/dynamodb", "aws"]
|
||||||
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
|
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
|
||||||
fp16kernels = ["lance-linalg/fp16kernels"]
|
fp16kernels = ["lance-linalg/fp16kernels"]
|
||||||
s3-test = []
|
s3-test = []
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ use std::sync::Arc;
|
|||||||
use arrow_array::RecordBatchReader;
|
use arrow_array::RecordBatchReader;
|
||||||
use arrow_schema::{Field, SchemaRef};
|
use arrow_schema::{Field, SchemaRef};
|
||||||
use lance::dataset::ReadParams;
|
use lance::dataset::ReadParams;
|
||||||
|
#[cfg(feature = "aws")]
|
||||||
use object_store::aws::AwsCredential;
|
use object_store::aws::AwsCredential;
|
||||||
|
|
||||||
use crate::arrow::{IntoArrow, IntoArrowStream, SendableRecordBatchStream};
|
use crate::arrow::{IntoArrow, IntoArrowStream, SendableRecordBatchStream};
|
||||||
@@ -749,6 +750,7 @@ impl ConnectBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// [`AwsCredential`] to use when connecting to S3.
|
/// [`AwsCredential`] to use when connecting to S3.
|
||||||
|
#[cfg(feature = "aws")]
|
||||||
#[deprecated(note = "Pass through storage_options instead")]
|
#[deprecated(note = "Pass through storage_options instead")]
|
||||||
pub fn aws_creds(mut self, aws_creds: AwsCredential) -> Self {
|
pub fn aws_creds(mut self, aws_creds: AwsCredential) -> Self {
|
||||||
self.request
|
self.request
|
||||||
|
|||||||
@@ -65,12 +65,94 @@ pub enum Index {
|
|||||||
/// Builder for the create_index operation
|
/// Builder for the create_index operation
|
||||||
///
|
///
|
||||||
/// The methods on this builder are used to specify options common to all indices.
|
/// The methods on this builder are used to specify options common to all indices.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// Creating a basic vector index:
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use lancedb::{connect, index::{Index, vector::IvfPqIndexBuilder}};
|
||||||
|
///
|
||||||
|
/// # async fn create_basic_vector_index() -> lancedb::Result<()> {
|
||||||
|
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||||
|
/// let table = db.open_table("my_table").execute().await?;
|
||||||
|
///
|
||||||
|
/// // Create a vector index with default settings
|
||||||
|
/// table
|
||||||
|
/// .create_index(&["vector"], Index::IvfPq(IvfPqIndexBuilder::default()))
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Creating an index with a custom name:
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use lancedb::{connect, index::{Index, vector::IvfPqIndexBuilder}};
|
||||||
|
///
|
||||||
|
/// # async fn create_named_index() -> lancedb::Result<()> {
|
||||||
|
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||||
|
/// let table = db.open_table("my_table").execute().await?;
|
||||||
|
///
|
||||||
|
/// // Create a vector index with a custom name
|
||||||
|
/// table
|
||||||
|
/// .create_index(&["embeddings"], Index::IvfPq(IvfPqIndexBuilder::default()))
|
||||||
|
/// .name("my_embeddings_index".to_string())
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Creating an untrained index (for scalar indices only):
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
|
||||||
|
///
|
||||||
|
/// # async fn create_untrained_index() -> lancedb::Result<()> {
|
||||||
|
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||||
|
/// let table = db.open_table("my_table").execute().await?;
|
||||||
|
///
|
||||||
|
/// // Create a BTree index without training (creates empty index)
|
||||||
|
/// table
|
||||||
|
/// .create_index(&["category"], Index::BTree(BTreeIndexBuilder::default()))
|
||||||
|
/// .train(false)
|
||||||
|
/// .name("category_index".to_string())
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Creating a scalar index with all options:
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use lancedb::{connect, index::{Index, scalar::BitmapIndexBuilder}};
|
||||||
|
///
|
||||||
|
/// # async fn create_full_options_index() -> lancedb::Result<()> {
|
||||||
|
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||||
|
/// let table = db.open_table("my_table").execute().await?;
|
||||||
|
///
|
||||||
|
/// // Create a bitmap index with full configuration
|
||||||
|
/// table
|
||||||
|
/// .create_index(&["status"], Index::Bitmap(BitmapIndexBuilder::default()))
|
||||||
|
/// .name("status_bitmap_index".to_string())
|
||||||
|
/// .train(true) // Train the index with existing data
|
||||||
|
/// .replace(false) // Don't replace if index already exists
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
pub struct IndexBuilder {
|
pub struct IndexBuilder {
|
||||||
parent: Arc<dyn BaseTable>,
|
parent: Arc<dyn BaseTable>,
|
||||||
pub(crate) index: Index,
|
pub(crate) index: Index,
|
||||||
pub(crate) columns: Vec<String>,
|
pub(crate) columns: Vec<String>,
|
||||||
pub(crate) replace: bool,
|
pub(crate) replace: bool,
|
||||||
pub(crate) wait_timeout: Option<Duration>,
|
pub(crate) wait_timeout: Option<Duration>,
|
||||||
|
pub(crate) train: bool,
|
||||||
|
pub(crate) name: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IndexBuilder {
|
impl IndexBuilder {
|
||||||
@@ -80,7 +162,9 @@ impl IndexBuilder {
|
|||||||
index,
|
index,
|
||||||
columns,
|
columns,
|
||||||
replace: true,
|
replace: true,
|
||||||
|
train: true,
|
||||||
wait_timeout: None,
|
wait_timeout: None,
|
||||||
|
name: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,6 +178,82 @@ impl IndexBuilder {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The name of the index. If not set, a default name will be generated.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
|
||||||
|
///
|
||||||
|
/// # async fn name_example() -> lancedb::Result<()> {
|
||||||
|
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||||
|
/// let table = db.open_table("my_table").execute().await?;
|
||||||
|
///
|
||||||
|
/// // Create an index with a custom name
|
||||||
|
/// table
|
||||||
|
/// .create_index(&["user_id"], Index::BTree(BTreeIndexBuilder::default()))
|
||||||
|
/// .name("user_id_btree_index".to_string())
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
pub fn name(mut self, v: String) -> Self {
|
||||||
|
self.name = Some(v);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to train the index, the default is `true`.
|
||||||
|
///
|
||||||
|
/// If this is false, the index will not be trained and just created empty.
|
||||||
|
///
|
||||||
|
/// This is not supported for vector indices yet.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// Creating an empty index that will be populated later:
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use lancedb::{connect, index::{Index, scalar::BitmapIndexBuilder}};
|
||||||
|
///
|
||||||
|
/// # async fn train_false_example() -> lancedb::Result<()> {
|
||||||
|
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||||
|
/// let table = db.open_table("my_table").execute().await?;
|
||||||
|
///
|
||||||
|
/// // Create an empty bitmap index (not trained with existing data)
|
||||||
|
/// table
|
||||||
|
/// .create_index(&["category"], Index::Bitmap(BitmapIndexBuilder::default()))
|
||||||
|
/// .train(false) // Create empty index
|
||||||
|
/// .name("category_bitmap".to_string())
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Creating a trained index (default behavior):
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
|
||||||
|
///
|
||||||
|
/// # async fn train_true_example() -> lancedb::Result<()> {
|
||||||
|
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||||
|
/// let table = db.open_table("my_table").execute().await?;
|
||||||
|
///
|
||||||
|
/// // Create a trained BTree index (includes existing data)
|
||||||
|
/// table
|
||||||
|
/// .create_index(&["timestamp"], Index::BTree(BTreeIndexBuilder::default()))
|
||||||
|
/// .train(true) // Train with existing data (this is the default)
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
pub fn train(mut self, v: bool) -> Self {
|
||||||
|
self.train = v;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
/// Duration of time to wait for asynchronous indexing to complete. If not set,
|
/// Duration of time to wait for asynchronous indexing to complete. If not set,
|
||||||
/// `create_index()` will not wait.
|
/// `create_index()` will not wait.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -999,6 +999,18 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
|||||||
"column": column
|
"column": column
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Add name parameter if provided (for backwards compatibility, only include if Some)
|
||||||
|
if let Some(ref name) = index.name {
|
||||||
|
body["name"] = serde_json::Value::String(name.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warn if train=false is specified since it's not meaningful
|
||||||
|
if !index.train {
|
||||||
|
log::warn!(
|
||||||
|
"train=false has no effect remote tables. The index will be created empty and automatically populated in the background."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
match index.index {
|
match index.index {
|
||||||
// TODO: Should we pass the actual index parameters? SaaS does not
|
// TODO: Should we pass the actual index parameters? SaaS does not
|
||||||
// yet support them.
|
// yet support them.
|
||||||
@@ -1084,8 +1096,8 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
|||||||
self.check_table_response(&request_id, response).await?;
|
self.check_table_response(&request_id, response).await?;
|
||||||
|
|
||||||
if let Some(wait_timeout) = index.wait_timeout {
|
if let Some(wait_timeout) = index.wait_timeout {
|
||||||
let name = format!("{}_idx", column);
|
let index_name = index.name.unwrap_or_else(|| format!("{}_idx", column));
|
||||||
self.wait_for_index(&[&name], wait_timeout).await?;
|
self.wait_for_index(&[&index_name], wait_timeout).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -28,9 +28,11 @@ use lance::dataset::{
|
|||||||
};
|
};
|
||||||
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
||||||
use lance::index::vector::utils::infer_vector_dim;
|
use lance::index::vector::utils::infer_vector_dim;
|
||||||
|
use lance::index::vector::VectorIndexParams;
|
||||||
use lance::io::WrappingObjectStore;
|
use lance::io::WrappingObjectStore;
|
||||||
use lance_datafusion::exec::{analyze_plan as lance_analyze_plan, execute_plan};
|
use lance_datafusion::exec::{analyze_plan as lance_analyze_plan, execute_plan};
|
||||||
use lance_datafusion::utils::StreamingWriteSource;
|
use lance_datafusion::utils::StreamingWriteSource;
|
||||||
|
use lance_index::scalar::{ScalarIndexParams, ScalarIndexType};
|
||||||
use lance_index::vector::hnsw::builder::HnswBuildParams;
|
use lance_index::vector::hnsw::builder::HnswBuildParams;
|
||||||
use lance_index::vector::ivf::IvfBuildParams;
|
use lance_index::vector::ivf::IvfBuildParams;
|
||||||
use lance_index::vector::pq::PQBuildParams;
|
use lance_index::vector::pq::PQBuildParams;
|
||||||
@@ -50,11 +52,7 @@ use crate::arrow::IntoArrow;
|
|||||||
use crate::connection::NoData;
|
use crate::connection::NoData;
|
||||||
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
|
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
|
||||||
use crate::error::{Error, Result};
|
use crate::error::{Error, Result};
|
||||||
use crate::index::scalar::FtsIndexBuilder;
|
use crate::index::vector::{suggested_num_partitions_for_hnsw, VectorIndex};
|
||||||
use crate::index::vector::{
|
|
||||||
suggested_num_partitions_for_hnsw, IvfFlatIndexBuilder, IvfHnswPqIndexBuilder,
|
|
||||||
IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
|
|
||||||
};
|
|
||||||
use crate::index::IndexStatistics;
|
use crate::index::IndexStatistics;
|
||||||
use crate::index::{
|
use crate::index::{
|
||||||
vector::{suggested_num_partitions, suggested_num_sub_vectors},
|
vector::{suggested_num_partitions, suggested_num_sub_vectors},
|
||||||
@@ -1698,345 +1696,211 @@ impl NativeTable {
|
|||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn create_ivf_flat_index(
|
// Helper to validate index type compatibility with field data type
|
||||||
&self,
|
fn validate_index_type(
|
||||||
index: IvfFlatIndexBuilder,
|
|
||||||
field: &Field,
|
field: &Field,
|
||||||
replace: bool,
|
index_name: &str,
|
||||||
|
supported_fn: impl Fn(&DataType) -> bool,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if !supported_vector_data_type(field.data_type()) {
|
if !supported_fn(field.data_type()) {
|
||||||
return Err(Error::InvalidInput {
|
return Err(Error::Schema {
|
||||||
message: format!(
|
message: format!(
|
||||||
"An IVF Flat index cannot be created on the column `{}` which has data type {}",
|
"A {} index cannot be created on the field `{}` which has data type {}",
|
||||||
|
index_name,
|
||||||
field.name(),
|
field.name(),
|
||||||
field.data_type()
|
field.data_type()
|
||||||
),
|
),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let num_partitions = if let Some(n) = index.num_partitions {
|
|
||||||
n
|
|
||||||
} else {
|
|
||||||
suggested_num_partitions(self.count_rows(None).await?)
|
|
||||||
};
|
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
|
||||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_flat(
|
|
||||||
num_partitions as usize,
|
|
||||||
index.distance_type.into(),
|
|
||||||
);
|
|
||||||
dataset
|
|
||||||
.create_index(
|
|
||||||
&[field.name()],
|
|
||||||
IndexType::Vector,
|
|
||||||
None,
|
|
||||||
&lance_idx_params,
|
|
||||||
replace,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn create_ivf_pq_index(
|
// Helper to get num_partitions with default calculation
|
||||||
|
async fn get_num_partitions(
|
||||||
&self,
|
&self,
|
||||||
index: IvfPqIndexBuilder,
|
provided: Option<u32>,
|
||||||
field: &Field,
|
for_hnsw: bool,
|
||||||
replace: bool,
|
dim: Option<u32>,
|
||||||
) -> Result<()> {
|
) -> Result<u32> {
|
||||||
if !supported_vector_data_type(field.data_type()) {
|
if let Some(n) = provided {
|
||||||
return Err(Error::InvalidInput {
|
Ok(n)
|
||||||
message: format!(
|
} else {
|
||||||
"An IVF PQ index cannot be created on the column `{}` which has data type {}",
|
let row_count = self.count_rows(None).await?;
|
||||||
field.name(),
|
if for_hnsw {
|
||||||
field.data_type()
|
Ok(suggested_num_partitions_for_hnsw(
|
||||||
),
|
row_count,
|
||||||
});
|
dim.ok_or_else(|| Error::InvalidInput {
|
||||||
|
message: "Vector dimension required for HNSW partitioning".to_string(),
|
||||||
|
})?,
|
||||||
|
))
|
||||||
|
} else {
|
||||||
|
Ok(suggested_num_partitions(row_count))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let num_partitions = if let Some(n) = index.num_partitions {
|
|
||||||
n
|
|
||||||
} else {
|
|
||||||
suggested_num_partitions(self.count_rows(None).await?)
|
|
||||||
};
|
|
||||||
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
|
|
||||||
n
|
|
||||||
} else {
|
|
||||||
let dim = infer_vector_dim(field.data_type())?;
|
|
||||||
suggested_num_sub_vectors(dim as u32)
|
|
||||||
};
|
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
|
||||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
|
|
||||||
num_partitions as usize,
|
|
||||||
/*num_bits=*/ 8,
|
|
||||||
num_sub_vectors as usize,
|
|
||||||
index.distance_type.into(),
|
|
||||||
index.max_iterations as usize,
|
|
||||||
);
|
|
||||||
dataset
|
|
||||||
.create_index(
|
|
||||||
&[field.name()],
|
|
||||||
IndexType::Vector,
|
|
||||||
None,
|
|
||||||
&lance_idx_params,
|
|
||||||
replace,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn create_ivf_hnsw_pq_index(
|
// Helper to get num_sub_vectors with default calculation
|
||||||
&self,
|
fn get_num_sub_vectors(provided: Option<u32>, dim: u32) -> u32 {
|
||||||
index: IvfHnswPqIndexBuilder,
|
provided.unwrap_or_else(|| suggested_num_sub_vectors(dim))
|
||||||
field: &Field,
|
}
|
||||||
replace: bool,
|
|
||||||
) -> Result<()> {
|
// Helper to extract vector dimension from field
|
||||||
if !supported_vector_data_type(field.data_type()) {
|
fn get_vector_dimension(field: &Field) -> Result<u32> {
|
||||||
return Err(Error::InvalidInput {
|
match field.data_type() {
|
||||||
message: format!(
|
arrow_schema::DataType::FixedSizeList(_, n) => Ok(*n as u32),
|
||||||
"An IVF HNSW PQ index cannot be created on the column `{}` which has data type {}",
|
_ => Ok(infer_vector_dim(field.data_type())? as u32),
|
||||||
field.name(),
|
|
||||||
field.data_type()
|
|
||||||
),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let num_partitions: u32 = if let Some(n) = index.num_partitions {
|
// Convert LanceDB Index to Lance IndexParams
|
||||||
n
|
async fn make_index_params(
|
||||||
} else {
|
&self,
|
||||||
match field.data_type() {
|
field: &Field,
|
||||||
arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
|
index_opts: Index,
|
||||||
suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
|
) -> Result<Box<dyn lance::index::IndexParams>> {
|
||||||
),
|
match index_opts {
|
||||||
_ => Err(Error::Schema {
|
Index::Auto => {
|
||||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
if supported_vector_data_type(field.data_type()) {
|
||||||
}),
|
// Use IvfPq as the default for auto vector indices
|
||||||
}?
|
let dim = Self::get_vector_dimension(field)?;
|
||||||
};
|
let num_partitions = self.get_num_partitions(None, false, None).await?;
|
||||||
|
let num_sub_vectors = Self::get_num_sub_vectors(None, dim);
|
||||||
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
|
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
|
||||||
n
|
num_partitions as usize,
|
||||||
} else {
|
/*num_bits=*/ 8,
|
||||||
match field.data_type() {
|
num_sub_vectors as usize,
|
||||||
arrow_schema::DataType::FixedSizeList(_, n) => {
|
lance_linalg::distance::MetricType::L2,
|
||||||
Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
|
/*max_iterations=*/ 50,
|
||||||
|
);
|
||||||
|
Ok(Box::new(lance_idx_params))
|
||||||
|
} else if supported_btree_data_type(field.data_type()) {
|
||||||
|
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::BTree)))
|
||||||
|
} else {
|
||||||
|
return Err(Error::InvalidInput {
|
||||||
|
message: format!(
|
||||||
|
"there are no indices supported for the field `{}` with the data type {}",
|
||||||
|
field.name(),
|
||||||
|
field.data_type()
|
||||||
|
),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
_ => Err(Error::Schema {
|
}
|
||||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
Index::BTree(_) => {
|
||||||
}),
|
Self::validate_index_type(field, "BTree", supported_btree_data_type)?;
|
||||||
}?
|
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::BTree)))
|
||||||
};
|
}
|
||||||
|
Index::Bitmap(_) => {
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
Self::validate_index_type(field, "Bitmap", supported_bitmap_data_type)?;
|
||||||
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::Bitmap)))
|
||||||
ivf_params.sample_rate = index.sample_rate as usize;
|
}
|
||||||
ivf_params.max_iters = index.max_iterations as usize;
|
Index::LabelList(_) => {
|
||||||
let hnsw_params = HnswBuildParams::default()
|
Self::validate_index_type(field, "LabelList", supported_label_list_data_type)?;
|
||||||
.num_edges(index.m as usize)
|
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::LabelList)))
|
||||||
.ef_construction(index.ef_construction as usize);
|
}
|
||||||
let pq_params = PQBuildParams {
|
Index::FTS(fts_opts) => {
|
||||||
num_sub_vectors: num_sub_vectors as usize,
|
Self::validate_index_type(field, "FTS", supported_fts_data_type)?;
|
||||||
..Default::default()
|
Ok(Box::new(fts_opts))
|
||||||
};
|
}
|
||||||
let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_pq_params(
|
Index::IvfFlat(index) => {
|
||||||
index.distance_type.into(),
|
Self::validate_index_type(field, "IVF Flat", supported_vector_data_type)?;
|
||||||
ivf_params,
|
let num_partitions = self
|
||||||
hnsw_params,
|
.get_num_partitions(index.num_partitions, false, None)
|
||||||
pq_params,
|
.await?;
|
||||||
);
|
let lance_idx_params = VectorIndexParams::ivf_flat(
|
||||||
dataset
|
num_partitions as usize,
|
||||||
.create_index(
|
index.distance_type.into(),
|
||||||
&[field.name()],
|
);
|
||||||
IndexType::Vector,
|
Ok(Box::new(lance_idx_params))
|
||||||
None,
|
}
|
||||||
&lance_idx_params,
|
Index::IvfPq(index) => {
|
||||||
replace,
|
Self::validate_index_type(field, "IVF PQ", supported_vector_data_type)?;
|
||||||
)
|
let dim = Self::get_vector_dimension(field)?;
|
||||||
.await?;
|
let num_partitions = self
|
||||||
Ok(())
|
.get_num_partitions(index.num_partitions, false, None)
|
||||||
}
|
.await?;
|
||||||
|
let num_sub_vectors = Self::get_num_sub_vectors(index.num_sub_vectors, dim);
|
||||||
async fn create_ivf_hnsw_sq_index(
|
let lance_idx_params = VectorIndexParams::ivf_pq(
|
||||||
&self,
|
num_partitions as usize,
|
||||||
index: IvfHnswSqIndexBuilder,
|
/*num_bits=*/ 8,
|
||||||
field: &Field,
|
num_sub_vectors as usize,
|
||||||
replace: bool,
|
index.distance_type.into(),
|
||||||
) -> Result<()> {
|
index.max_iterations as usize,
|
||||||
if !supported_vector_data_type(field.data_type()) {
|
);
|
||||||
return Err(Error::InvalidInput {
|
Ok(Box::new(lance_idx_params))
|
||||||
message: format!(
|
}
|
||||||
"An IVF HNSW SQ index cannot be created on the column `{}` which has data type {}",
|
Index::IvfHnswPq(index) => {
|
||||||
field.name(),
|
Self::validate_index_type(field, "IVF HNSW PQ", supported_vector_data_type)?;
|
||||||
field.data_type()
|
let dim = Self::get_vector_dimension(field)?;
|
||||||
),
|
let num_partitions = self
|
||||||
});
|
.get_num_partitions(index.num_partitions, true, Some(dim))
|
||||||
}
|
.await?;
|
||||||
|
let num_sub_vectors = Self::get_num_sub_vectors(index.num_sub_vectors, dim);
|
||||||
let num_partitions: u32 = if let Some(n) = index.num_partitions {
|
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
||||||
n
|
ivf_params.sample_rate = index.sample_rate as usize;
|
||||||
} else {
|
ivf_params.max_iters = index.max_iterations as usize;
|
||||||
match field.data_type() {
|
let hnsw_params = HnswBuildParams::default()
|
||||||
arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
|
.num_edges(index.m as usize)
|
||||||
suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
|
.ef_construction(index.ef_construction as usize);
|
||||||
),
|
let pq_params = PQBuildParams {
|
||||||
_ => Err(Error::Schema {
|
num_sub_vectors: num_sub_vectors as usize,
|
||||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
..Default::default()
|
||||||
}),
|
};
|
||||||
}?
|
let lance_idx_params = VectorIndexParams::with_ivf_hnsw_pq_params(
|
||||||
};
|
index.distance_type.into(),
|
||||||
|
ivf_params,
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
hnsw_params,
|
||||||
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
pq_params,
|
||||||
ivf_params.sample_rate = index.sample_rate as usize;
|
);
|
||||||
ivf_params.max_iters = index.max_iterations as usize;
|
Ok(Box::new(lance_idx_params))
|
||||||
let hnsw_params = HnswBuildParams::default()
|
}
|
||||||
.num_edges(index.m as usize)
|
Index::IvfHnswSq(index) => {
|
||||||
.ef_construction(index.ef_construction as usize);
|
Self::validate_index_type(field, "IVF HNSW SQ", supported_vector_data_type)?;
|
||||||
let sq_params = SQBuildParams {
|
let dim = Self::get_vector_dimension(field)?;
|
||||||
sample_rate: index.sample_rate as usize,
|
let num_partitions = self
|
||||||
..Default::default()
|
.get_num_partitions(index.num_partitions, true, Some(dim))
|
||||||
};
|
.await?;
|
||||||
let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_sq_params(
|
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
||||||
index.distance_type.into(),
|
ivf_params.sample_rate = index.sample_rate as usize;
|
||||||
ivf_params,
|
ivf_params.max_iters = index.max_iterations as usize;
|
||||||
hnsw_params,
|
let hnsw_params = HnswBuildParams::default()
|
||||||
sq_params,
|
.num_edges(index.m as usize)
|
||||||
);
|
.ef_construction(index.ef_construction as usize);
|
||||||
dataset
|
let sq_params = SQBuildParams {
|
||||||
.create_index(
|
sample_rate: index.sample_rate as usize,
|
||||||
&[field.name()],
|
..Default::default()
|
||||||
IndexType::Vector,
|
};
|
||||||
None,
|
let lance_idx_params = VectorIndexParams::with_ivf_hnsw_sq_params(
|
||||||
&lance_idx_params,
|
index.distance_type.into(),
|
||||||
replace,
|
ivf_params,
|
||||||
)
|
hnsw_params,
|
||||||
.await?;
|
sq_params,
|
||||||
Ok(())
|
);
|
||||||
}
|
Ok(Box::new(lance_idx_params))
|
||||||
|
}
|
||||||
async fn create_auto_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
|
||||||
if supported_vector_data_type(field.data_type()) {
|
|
||||||
self.create_ivf_pq_index(IvfPqIndexBuilder::default(), field, opts.replace)
|
|
||||||
.await
|
|
||||||
} else if supported_btree_data_type(field.data_type()) {
|
|
||||||
self.create_btree_index(field, opts).await
|
|
||||||
} else {
|
|
||||||
Err(Error::InvalidInput {
|
|
||||||
message: format!(
|
|
||||||
"there are no indices supported for the field `{}` with the data type {}",
|
|
||||||
field.name(),
|
|
||||||
field.data_type()
|
|
||||||
),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn create_btree_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
// Helper method to get the correct IndexType based on the Index variant and field data type
|
||||||
if !supported_btree_data_type(field.data_type()) {
|
fn get_index_type_for_field(&self, field: &Field, index: &Index) -> IndexType {
|
||||||
return Err(Error::Schema {
|
match index {
|
||||||
message: format!(
|
Index::Auto => {
|
||||||
"A BTree index cannot be created on the field `{}` which has data type {}",
|
if supported_vector_data_type(field.data_type()) {
|
||||||
field.name(),
|
IndexType::Vector
|
||||||
field.data_type()
|
} else if supported_btree_data_type(field.data_type()) {
|
||||||
),
|
IndexType::BTree
|
||||||
});
|
} else {
|
||||||
|
// This should not happen since make_index_params would have failed
|
||||||
|
IndexType::BTree
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Index::BTree(_) => IndexType::BTree,
|
||||||
|
Index::Bitmap(_) => IndexType::Bitmap,
|
||||||
|
Index::LabelList(_) => IndexType::LabelList,
|
||||||
|
Index::FTS(_) => IndexType::Inverted,
|
||||||
|
Index::IvfFlat(_) | Index::IvfPq(_) | Index::IvfHnswPq(_) | Index::IvfHnswSq(_) => {
|
||||||
|
IndexType::Vector
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
|
||||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
|
||||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::BTree),
|
|
||||||
};
|
|
||||||
dataset
|
|
||||||
.create_index(
|
|
||||||
&[field.name()],
|
|
||||||
IndexType::BTree,
|
|
||||||
None,
|
|
||||||
&lance_idx_params,
|
|
||||||
opts.replace,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn create_bitmap_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
|
||||||
if !supported_bitmap_data_type(field.data_type()) {
|
|
||||||
return Err(Error::Schema {
|
|
||||||
message: format!(
|
|
||||||
"A Bitmap index cannot be created on the field `{}` which has data type {}",
|
|
||||||
field.name(),
|
|
||||||
field.data_type()
|
|
||||||
),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
|
||||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
|
||||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::Bitmap),
|
|
||||||
};
|
|
||||||
dataset
|
|
||||||
.create_index(
|
|
||||||
&[field.name()],
|
|
||||||
IndexType::Bitmap,
|
|
||||||
None,
|
|
||||||
&lance_idx_params,
|
|
||||||
opts.replace,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn create_label_list_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
|
||||||
if !supported_label_list_data_type(field.data_type()) {
|
|
||||||
return Err(Error::Schema {
|
|
||||||
message: format!(
|
|
||||||
"A LabelList index cannot be created on the field `{}` which has data type {}",
|
|
||||||
field.name(),
|
|
||||||
field.data_type()
|
|
||||||
),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
|
||||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
|
||||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::LabelList),
|
|
||||||
};
|
|
||||||
dataset
|
|
||||||
.create_index(
|
|
||||||
&[field.name()],
|
|
||||||
IndexType::LabelList,
|
|
||||||
None,
|
|
||||||
&lance_idx_params,
|
|
||||||
opts.replace,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn create_fts_index(
|
|
||||||
&self,
|
|
||||||
field: &Field,
|
|
||||||
fts_opts: FtsIndexBuilder,
|
|
||||||
replace: bool,
|
|
||||||
) -> Result<()> {
|
|
||||||
if !supported_fts_data_type(field.data_type()) {
|
|
||||||
return Err(Error::Schema {
|
|
||||||
message: format!(
|
|
||||||
"A FTS index cannot be created on the field `{}` which has data type {}",
|
|
||||||
field.name(),
|
|
||||||
field.data_type()
|
|
||||||
),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
|
||||||
dataset
|
|
||||||
.create_index(
|
|
||||||
&[field.name()],
|
|
||||||
IndexType::Inverted,
|
|
||||||
None,
|
|
||||||
&fts_opts,
|
|
||||||
replace,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn generic_query(
|
async fn generic_query(
|
||||||
@@ -2251,26 +2115,20 @@ impl BaseTable for NativeTable {
|
|||||||
|
|
||||||
let field = schema.field_with_name(&opts.columns[0])?;
|
let field = schema.field_with_name(&opts.columns[0])?;
|
||||||
|
|
||||||
match opts.index {
|
let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?;
|
||||||
Index::Auto => self.create_auto_index(field, opts).await,
|
let index_type = self.get_index_type_for_field(field, &opts.index);
|
||||||
Index::BTree(_) => self.create_btree_index(field, opts).await,
|
let columns = [field.name().as_str()];
|
||||||
Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
|
let mut dataset = self.dataset.get_mut().await?;
|
||||||
Index::LabelList(_) => self.create_label_list_index(field, opts).await,
|
let mut builder = dataset
|
||||||
Index::FTS(fts_opts) => self.create_fts_index(field, fts_opts, opts.replace).await,
|
.create_index_builder(&columns, index_type, lance_idx_params.as_ref())
|
||||||
Index::IvfFlat(ivf_flat) => {
|
.train(opts.train)
|
||||||
self.create_ivf_flat_index(ivf_flat, field, opts.replace)
|
.replace(opts.replace);
|
||||||
.await
|
|
||||||
}
|
if let Some(name) = opts.name {
|
||||||
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
|
builder = builder.name(name);
|
||||||
Index::IvfHnswPq(ivf_hnsw_pq) => {
|
|
||||||
self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
Index::IvfHnswSq(ivf_hnsw_sq) => {
|
|
||||||
self.create_ivf_hnsw_sq_index(ivf_hnsw_sq, field, opts.replace)
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
builder.await?;
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn drop_index(&self, index_name: &str) -> Result<()> {
|
async fn drop_index(&self, index_name: &str) -> Result<()> {
|
||||||
@@ -2890,6 +2748,7 @@ mod tests {
|
|||||||
use crate::connect;
|
use crate::connect;
|
||||||
use crate::connection::ConnectBuilder;
|
use crate::connection::ConnectBuilder;
|
||||||
use crate::index::scalar::{BTreeIndexBuilder, BitmapIndexBuilder};
|
use crate::index::scalar::{BTreeIndexBuilder, BitmapIndexBuilder};
|
||||||
|
use crate::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder};
|
||||||
use crate::query::{ExecutableQuery, QueryBase};
|
use crate::query::{ExecutableQuery, QueryBase};
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
Reference in New Issue
Block a user