mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 13:59:58 +00:00
Compare commits
7 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e1f9b011f8 | ||
|
|
d664b8739f | ||
|
|
20bec61ecb | ||
|
|
45255be42c | ||
|
|
93c2cf2f59 | ||
|
|
9d29c83f81 | ||
|
|
2a6143b5bd |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.22.3-beta.2"
|
||||
current_version = "0.22.3-beta.3"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
101
AGENTS.md
Normal file
101
AGENTS.md
Normal file
@@ -0,0 +1,101 @@
|
||||
LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
|
||||
It is a wrapper around Lance. There are two backends: local (in-process like SQLite) and
|
||||
remote (against LanceDB Cloud).
|
||||
|
||||
The core of LanceDB is written in Rust. There are bindings in Python, Typescript, and Java.
|
||||
|
||||
Project layout:
|
||||
|
||||
* `rust/lancedb`: The LanceDB core Rust implementation.
|
||||
* `python`: The Python bindings, using PyO3.
|
||||
* `nodejs`: The Typescript bindings, using napi-rs
|
||||
* `java`: The Java bindings
|
||||
|
||||
Common commands:
|
||||
|
||||
* Check for compiler errors: `cargo check --quiet --features remote --tests --examples`
|
||||
* Run tests: `cargo test --quiet --features remote --tests`
|
||||
* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
|
||||
* Lint: `cargo clippy --quiet --features remote --tests --examples`
|
||||
* Format: `cargo fmt --all`
|
||||
|
||||
Before committing changes, run formatting.
|
||||
|
||||
## Coding tips
|
||||
|
||||
* When writing Rust doctests for things that require a connection or table reference,
|
||||
write them as a function instead of a fully executable test. This allows type checking
|
||||
to run but avoids needing a full test environment. For example:
|
||||
```rust
|
||||
/// ```
|
||||
/// use lance_index::scalar::FullTextSearchQuery;
|
||||
/// use lancedb::query::{QueryBase, ExecutableQuery};
|
||||
///
|
||||
/// # use lancedb::Table;
|
||||
/// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let results = table.query()
|
||||
/// .full_text_search(FullTextSearchQuery::new("hello world".into()))
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
```
|
||||
|
||||
## Example plan: adding a new method on Table
|
||||
|
||||
Adding a new method involves first adding it to the Rust core, then exposing it
|
||||
in the Python and TypeScript bindings. There are both local and remote tables.
|
||||
Remote tables are implemented via a HTTP API and require the `remote` cargo
|
||||
feature flag to be enabled. Python has both sync and async methods.
|
||||
|
||||
Rust core changes:
|
||||
|
||||
1. Add method on `Table` struct in `rust/lancedb/src/table.rs` (calls `BaseTable` trait).
|
||||
2. Add method to `BaseTable` trait in `rust/lancedb/src/table.rs`.
|
||||
3. Implement new trait method on `NativeTable` in `rust/lancedb/src/table.rs`.
|
||||
* Test with unit test in `rust/lancedb/src/table.rs`.
|
||||
4. Implement new trait method on `RemoteTable` in `rust/lancedb/src/remote/table.rs`.
|
||||
* Test with unit test in `rust/lancedb/src/remote/table.rs` against mocked endpoint.
|
||||
|
||||
Python bindings changes:
|
||||
|
||||
1. Add PyO3 method binding in `python/src/table.rs`. Run `make develop` to compile bindings.
|
||||
2. Add types for PyO3 method in `python/python/lancedb/_lancedb.pyi`.
|
||||
3. Add method to `AsyncTable` class in `python/python/lancedb/table.py`.
|
||||
4. Add abstract method to `Table` abstract base class in `python/python/lancedb/table.py`.
|
||||
5. Add concrete sync method to `LanceTable` class in `python/python/lancedb/table.py`.
|
||||
* Should use `LOOP.run()` to call the corresponding `AsyncTable` method.
|
||||
6. Add concrete sync method to `RemoteTable` class in `python/python/lancedb/remote/table.py`.
|
||||
7. Add unit test in `python/tests/test_table.py`.
|
||||
|
||||
TypeScript bindings changes:
|
||||
|
||||
1. Add napi-rs method binding on `Table` in `nodejs/src/table.rs`.
|
||||
2. Run `npm run build` to generate TypeScript definitions.
|
||||
3. Add typescript method on abstract class `Table` in `nodejs/src/table.ts`.
|
||||
4. Add concrete method on `LocalTable` class in `nodejs/src/native_table.ts`.
|
||||
* Note: despite the name, this class is also used for remote tables.
|
||||
5. Add test in `nodejs/__test__/table.test.ts`.
|
||||
6. Run `npm run docs` to generate TypeScript documentation.
|
||||
|
||||
## Review Guidelines
|
||||
|
||||
Please consider the following when reviewing code contributions.
|
||||
|
||||
### Rust API design
|
||||
* Design public APIs so they can be evolved easily in the future without breaking
|
||||
changes. Often this means using builder patterns or options structs instead of
|
||||
long argument lists.
|
||||
* For public APIs, prefer inputs that use `Into<T>` or `AsRef<T>` traits to allow
|
||||
more flexible inputs. For example, use `name: Into<String>` instead of `name: String`,
|
||||
so we don't have to write `func("my_string".to_string())`.
|
||||
|
||||
### Testing
|
||||
* Ensure all new public APIs have documentation and examples.
|
||||
* Ensure that all bugfixes and features have corresponding tests. **We do not merge
|
||||
code without tests.**
|
||||
|
||||
### Documentation
|
||||
* New features must include updates to the rust documentation comments. Link to
|
||||
relevant structs and methods to increase the value of documentation.
|
||||
80
CLAUDE.md
80
CLAUDE.md
@@ -1,80 +0,0 @@
|
||||
LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
|
||||
It is a wrapper around Lance. There are two backends: local (in-process like SQLite) and
|
||||
remote (against LanceDB Cloud).
|
||||
|
||||
The core of LanceDB is written in Rust. There are bindings in Python, Typescript, and Java.
|
||||
|
||||
Project layout:
|
||||
|
||||
* `rust/lancedb`: The LanceDB core Rust implementation.
|
||||
* `python`: The Python bindings, using PyO3.
|
||||
* `nodejs`: The Typescript bindings, using napi-rs
|
||||
* `java`: The Java bindings
|
||||
|
||||
Common commands:
|
||||
|
||||
* Check for compiler errors: `cargo check --quiet --features remote --tests --examples`
|
||||
* Run tests: `cargo test --quiet --features remote --tests`
|
||||
* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
|
||||
* Lint: `cargo clippy --quiet --features remote --tests --examples`
|
||||
* Format: `cargo fmt --all`
|
||||
|
||||
Before committing changes, run formatting.
|
||||
|
||||
## Coding tips
|
||||
|
||||
* When writing Rust doctests for things that require a connection or table reference,
|
||||
write them as a function instead of a fully executable test. This allows type checking
|
||||
to run but avoids needing a full test environment. For example:
|
||||
```rust
|
||||
/// ```
|
||||
/// use lance_index::scalar::FullTextSearchQuery;
|
||||
/// use lancedb::query::{QueryBase, ExecutableQuery};
|
||||
///
|
||||
/// # use lancedb::Table;
|
||||
/// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let results = table.query()
|
||||
/// .full_text_search(FullTextSearchQuery::new("hello world".into()))
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
```
|
||||
|
||||
## Example plan: adding a new method on Table
|
||||
|
||||
Adding a new method involves first adding it to the Rust core, then exposing it
|
||||
in the Python and TypeScript bindings. There are both local and remote tables.
|
||||
Remote tables are implemented via a HTTP API and require the `remote` cargo
|
||||
feature flag to be enabled. Python has both sync and async methods.
|
||||
|
||||
Rust core changes:
|
||||
|
||||
1. Add method on `Table` struct in `rust/lancedb/src/table.rs` (calls `BaseTable` trait).
|
||||
2. Add method to `BaseTable` trait in `rust/lancedb/src/table.rs`.
|
||||
3. Implement new trait method on `NativeTable` in `rust/lancedb/src/table.rs`.
|
||||
* Test with unit test in `rust/lancedb/src/table.rs`.
|
||||
4. Implement new trait method on `RemoteTable` in `rust/lancedb/src/remote/table.rs`.
|
||||
* Test with unit test in `rust/lancedb/src/remote/table.rs` against mocked endpoint.
|
||||
|
||||
Python bindings changes:
|
||||
|
||||
1. Add PyO3 method binding in `python/src/table.rs`. Run `make develop` to compile bindings.
|
||||
2. Add types for PyO3 method in `python/python/lancedb/_lancedb.pyi`.
|
||||
3. Add method to `AsyncTable` class in `python/python/lancedb/table.py`.
|
||||
4. Add abstract method to `Table` abstract base class in `python/python/lancedb/table.py`.
|
||||
5. Add concrete sync method to `LanceTable` class in `python/python/lancedb/table.py`.
|
||||
* Should use `LOOP.run()` to call the corresponding `AsyncTable` method.
|
||||
6. Add concrete sync method to `RemoteTable` class in `python/python/lancedb/remote/table.py`.
|
||||
7. Add unit test in `python/tests/test_table.py`.
|
||||
|
||||
TypeScript bindings changes:
|
||||
|
||||
1. Add napi-rs method binding on `Table` in `nodejs/src/table.rs`.
|
||||
2. Run `npm run build` to generate TypeScript definitions.
|
||||
3. Add typescript method on abstract class `Table` in `nodejs/src/table.ts`.
|
||||
4. Add concrete method on `LocalTable` class in `nodejs/src/native_table.ts`.
|
||||
* Note: despite the name, this class is also used for remote tables.
|
||||
5. Add test in `nodejs/__test__/table.test.ts`.
|
||||
6. Run `npm run docs` to generate TypeScript documentation.
|
||||
91
Cargo.lock
generated
91
Cargo.lock
generated
@@ -2933,18 +2933,6 @@ version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
|
||||
|
||||
[[package]]
|
||||
name = "fastbloom"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18c1ddb9231d8554c2d6bdf4cfaabf0c59251658c68b6c95cd52dd0c513a912a"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"libm",
|
||||
"rand 0.9.2",
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastdivide"
|
||||
version = "0.4.2"
|
||||
@@ -3044,8 +3032,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "fsst"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"rand 0.9.2",
|
||||
@@ -4229,8 +4217,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4293,8 +4281,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-arrow"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4312,8 +4300,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-bitpacking"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"paste",
|
||||
@@ -4322,8 +4310,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-core"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4359,8 +4347,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datafusion"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4389,8 +4377,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datagen"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4407,8 +4395,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-encoding"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -4445,8 +4433,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-file"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -4471,7 +4459,6 @@ dependencies = [
|
||||
"prost",
|
||||
"prost-build",
|
||||
"prost-types",
|
||||
"roaring",
|
||||
"snafu",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -4479,8 +4466,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-index"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4502,7 +4489,6 @@ dependencies = [
|
||||
"datafusion-sql",
|
||||
"deepsize",
|
||||
"dirs",
|
||||
"fastbloom",
|
||||
"fst",
|
||||
"futures",
|
||||
"half",
|
||||
@@ -4542,8 +4528,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-io"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4583,32 +4569,25 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-linalg"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
"arrow-ord",
|
||||
"arrow-schema",
|
||||
"bitvec",
|
||||
"cc",
|
||||
"deepsize",
|
||||
"futures",
|
||||
"half",
|
||||
"lance-arrow",
|
||||
"lance-core",
|
||||
"log",
|
||||
"num-traits",
|
||||
"rand 0.9.2",
|
||||
"rayon",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lance-namespace"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -4620,8 +4599,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-namespace-impls"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-ipc",
|
||||
@@ -4654,8 +4633,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-table"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4693,8 +4672,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-testing"
|
||||
version = "0.38.3-beta.11"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
||||
version = "0.38.3"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
@@ -4705,7 +4684,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb"
|
||||
version = "0.22.3-beta.2"
|
||||
version = "0.22.3-beta.3"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
@@ -4802,7 +4781,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-nodejs"
|
||||
version = "0.22.3-beta.2"
|
||||
version = "0.22.3-beta.3"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ipc",
|
||||
@@ -4822,7 +4801,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-python"
|
||||
version = "0.25.3-beta.2"
|
||||
version = "0.25.3-beta.3"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
|
||||
28
Cargo.toml
28
Cargo.toml
@@ -15,20 +15,20 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.38.3-beta.11", default-features = false, "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-core = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datagen = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-file = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.38.3-beta.11", default-features = false, "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-namespace = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=0.38.3-beta.11", "features" = ["dir-aws", "dir-gcp", "dir-azure", "dir-oss", "rest"], "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-arrow = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance = { "version" = "=0.38.3", default-features = false, "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-core = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datagen = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-file = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.38.3", default-features = false, "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-namespace = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=0.38.3", "features" = ["dir-aws", "dir-gcp", "dir-azure", "dir-oss", "rest"], "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-arrow = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "56.2", optional = false }
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
# VoyageAI Embeddings : Multimodal
|
||||
|
||||
VoyageAI embeddings can also be used to embed both text and image data, only some of the models support image data and you can check the list
|
||||
under [https://docs.voyageai.com/docs/multimodal-embeddings](https://docs.voyageai.com/docs/multimodal-embeddings)
|
||||
|
||||
Supported parameters (to be passed in `create` method) are:
|
||||
|
||||
| Parameter | Type | Default Value | Description |
|
||||
|---|---|-------------------------|-------------------------------------------|
|
||||
| `name` | `str` | `"voyage-multimodal-3"` | The model ID of the VoyageAI model to use |
|
||||
|
||||
Usage Example:
|
||||
|
||||
```python
|
||||
import base64
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
import requests
|
||||
import lancedb
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.embeddings import get_registry
|
||||
import pandas as pd
|
||||
|
||||
os.environ['VOYAGE_API_KEY'] = 'YOUR_VOYAGE_API_KEY'
|
||||
|
||||
db = lancedb.connect(".lancedb")
|
||||
func = get_registry().get("voyageai").create(name="voyage-multimodal-3")
|
||||
|
||||
|
||||
def image_to_base64(image_bytes: bytes):
|
||||
buffered = BytesIO(image_bytes)
|
||||
img_str = base64.b64encode(buffered.getvalue())
|
||||
return img_str.decode("utf-8")
|
||||
|
||||
|
||||
class Images(LanceModel):
|
||||
label: str
|
||||
image_uri: str = func.SourceField() # image uri as the source
|
||||
image_bytes: str = func.SourceField() # image bytes base64 encoded as the source
|
||||
vector: Vector(func.ndims()) = func.VectorField() # vector column
|
||||
vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column
|
||||
|
||||
|
||||
if "images" in db.table_names():
|
||||
db.drop_table("images")
|
||||
table = db.create_table("images", schema=Images)
|
||||
labels = ["cat", "cat", "dog", "dog", "horse", "horse"]
|
||||
uris = [
|
||||
"http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
|
||||
"http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
|
||||
"http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
|
||||
"http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
|
||||
"http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
|
||||
"http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
|
||||
]
|
||||
# get each uri as bytes
|
||||
images_bytes = [image_to_base64(requests.get(uri).content) for uri in uris]
|
||||
table.add(
|
||||
pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": images_bytes})
|
||||
)
|
||||
```
|
||||
Now we can search using text from both the default vector column and the custom vector column
|
||||
```python
|
||||
|
||||
# text search
|
||||
actual = table.search("man's best friend", "vec_from_bytes").limit(1).to_pydantic(Images)[0]
|
||||
print(actual.label) # prints "dog"
|
||||
|
||||
frombytes = (
|
||||
table.search("man's best friend", vector_column_name="vec_from_bytes")
|
||||
.limit(1)
|
||||
.to_pydantic(Images)[0]
|
||||
)
|
||||
print(frombytes.label)
|
||||
|
||||
```
|
||||
|
||||
Because we're using a multi-modal embedding function, we can also search using images
|
||||
|
||||
```python
|
||||
# image search
|
||||
query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg"
|
||||
image_bytes = requests.get(query_image_uri).content
|
||||
query_image = Image.open(BytesIO(image_bytes))
|
||||
actual = table.search(query_image, "vec_from_bytes").limit(1).to_pydantic(Images)[0]
|
||||
print(actual.label == "dog")
|
||||
|
||||
# image search using a custom vector column
|
||||
other = (
|
||||
table.search(query_image, vector_column_name="vec_from_bytes")
|
||||
.limit(1)
|
||||
.to_pydantic(Images)[0]
|
||||
)
|
||||
print(actual.label)
|
||||
|
||||
```
|
||||
@@ -397,117 +397,6 @@ For **read-only access**, LanceDB will need a policy such as:
|
||||
}
|
||||
```
|
||||
|
||||
#### DynamoDB Commit Store for concurrent writes
|
||||
|
||||
By default, S3 does not support concurrent writes. Having two or more processes
|
||||
writing to the same table at the same time can lead to data corruption. This is
|
||||
because S3, unlike other object stores, does not have any atomic put or copy
|
||||
operation.
|
||||
|
||||
To enable concurrent writes, you can configure LanceDB to use a DynamoDB table
|
||||
as a commit store. This table will be used to coordinate writes between
|
||||
different processes. To enable this feature, you must modify your connection
|
||||
URI to use the `s3+ddb` scheme and add a query parameter `ddbTableName` with the
|
||||
name of the table to use.
|
||||
|
||||
=== "Python"
|
||||
|
||||
=== "Sync API"
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
db = lancedb.connect(
|
||||
"s3+ddb://bucket/path?ddbTableName=my-dynamodb-table",
|
||||
)
|
||||
```
|
||||
=== "Async API"
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
async_db = await lancedb.connect_async(
|
||||
"s3+ddb://bucket/path?ddbTableName=my-dynamodb-table",
|
||||
)
|
||||
```
|
||||
|
||||
=== "JavaScript"
|
||||
|
||||
```javascript
|
||||
const lancedb = require("lancedb");
|
||||
|
||||
const db = await lancedb.connect(
|
||||
"s3+ddb://bucket/path?ddbTableName=my-dynamodb-table",
|
||||
);
|
||||
```
|
||||
|
||||
The DynamoDB table must be created with the following schema:
|
||||
|
||||
- Hash key: `base_uri` (string)
|
||||
- Range key: `version` (number)
|
||||
|
||||
You can create this programmatically with:
|
||||
|
||||
=== "Python"
|
||||
|
||||
<!-- skip-test -->
|
||||
```python
|
||||
import boto3
|
||||
|
||||
dynamodb = boto3.client("dynamodb")
|
||||
table = dynamodb.create_table(
|
||||
TableName=table_name,
|
||||
KeySchema=[
|
||||
{"AttributeName": "base_uri", "KeyType": "HASH"},
|
||||
{"AttributeName": "version", "KeyType": "RANGE"},
|
||||
],
|
||||
AttributeDefinitions=[
|
||||
{"AttributeName": "base_uri", "AttributeType": "S"},
|
||||
{"AttributeName": "version", "AttributeType": "N"},
|
||||
],
|
||||
ProvisionedThroughput={"ReadCapacityUnits": 1, "WriteCapacityUnits": 1},
|
||||
)
|
||||
```
|
||||
|
||||
=== "JavaScript"
|
||||
|
||||
<!-- skip-test -->
|
||||
```javascript
|
||||
import {
|
||||
CreateTableCommand,
|
||||
DynamoDBClient,
|
||||
} from "@aws-sdk/client-dynamodb";
|
||||
|
||||
const dynamodb = new DynamoDBClient({
|
||||
region: CONFIG.awsRegion,
|
||||
credentials: {
|
||||
accessKeyId: CONFIG.awsAccessKeyId,
|
||||
secretAccessKey: CONFIG.awsSecretAccessKey,
|
||||
},
|
||||
endpoint: CONFIG.awsEndpoint,
|
||||
});
|
||||
const command = new CreateTableCommand({
|
||||
TableName: table_name,
|
||||
AttributeDefinitions: [
|
||||
{
|
||||
AttributeName: "base_uri",
|
||||
AttributeType: "S",
|
||||
},
|
||||
{
|
||||
AttributeName: "version",
|
||||
AttributeType: "N",
|
||||
},
|
||||
],
|
||||
KeySchema: [
|
||||
{ AttributeName: "base_uri", KeyType: "HASH" },
|
||||
{ AttributeName: "version", KeyType: "RANGE" },
|
||||
],
|
||||
ProvisionedThroughput: {
|
||||
ReadCapacityUnits: 1,
|
||||
WriteCapacityUnits: 1,
|
||||
},
|
||||
});
|
||||
await client.send(command);
|
||||
```
|
||||
|
||||
|
||||
#### S3-compatible stores
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@ AnalyzeExec verbose=true, metrics=[]
|
||||
### execute()
|
||||
|
||||
```ts
|
||||
protected execute(options?): RecordBatchIterator
|
||||
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
```
|
||||
|
||||
Execute the query and return the results as an
|
||||
@@ -91,7 +91,7 @@ Execute the query and return the results as an
|
||||
|
||||
#### Returns
|
||||
|
||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
||||
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||
|
||||
#### See
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ AnalyzeExec verbose=true, metrics=[]
|
||||
### execute()
|
||||
|
||||
```ts
|
||||
protected execute(options?): RecordBatchIterator
|
||||
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
```
|
||||
|
||||
Execute the query and return the results as an
|
||||
@@ -92,7 +92,7 @@ Execute the query and return the results as an
|
||||
|
||||
#### Returns
|
||||
|
||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
||||
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||
|
||||
#### See
|
||||
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / RecordBatchIterator
|
||||
|
||||
# Class: RecordBatchIterator
|
||||
|
||||
## Implements
|
||||
|
||||
- `AsyncIterator`<`RecordBatch`>
|
||||
|
||||
## Constructors
|
||||
|
||||
### new RecordBatchIterator()
|
||||
|
||||
```ts
|
||||
new RecordBatchIterator(promise?): RecordBatchIterator
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **promise?**: `Promise`<`RecordBatchIterator`>
|
||||
|
||||
#### Returns
|
||||
|
||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### next()
|
||||
|
||||
```ts
|
||||
next(): Promise<IteratorResult<RecordBatch<any>, any>>
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`IteratorResult`<`RecordBatch`<`any`>, `any`>>
|
||||
|
||||
#### Implementation of
|
||||
|
||||
`AsyncIterator.next`
|
||||
@@ -76,7 +76,7 @@ AnalyzeExec verbose=true, metrics=[]
|
||||
### execute()
|
||||
|
||||
```ts
|
||||
protected execute(options?): RecordBatchIterator
|
||||
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
```
|
||||
|
||||
Execute the query and return the results as an
|
||||
@@ -87,7 +87,7 @@ Execute the query and return the results as an
|
||||
|
||||
#### Returns
|
||||
|
||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
||||
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||
|
||||
#### See
|
||||
|
||||
|
||||
@@ -221,7 +221,7 @@ also increase the latency of your query. The default value is 1.5*limit.
|
||||
### execute()
|
||||
|
||||
```ts
|
||||
protected execute(options?): RecordBatchIterator
|
||||
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
```
|
||||
|
||||
Execute the query and return the results as an
|
||||
@@ -232,7 +232,7 @@ Execute the query and return the results as an
|
||||
|
||||
#### Returns
|
||||
|
||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
||||
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||
|
||||
#### See
|
||||
|
||||
|
||||
19
docs/src/js/functions/RecordBatchIterator.md
Normal file
19
docs/src/js/functions/RecordBatchIterator.md
Normal file
@@ -0,0 +1,19 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / RecordBatchIterator
|
||||
|
||||
# Function: RecordBatchIterator()
|
||||
|
||||
```ts
|
||||
function RecordBatchIterator(promisedInner): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
* **promisedInner**: `Promise`<`RecordBatchIterator`>
|
||||
|
||||
## Returns
|
||||
|
||||
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||
@@ -32,7 +32,6 @@
|
||||
- [PhraseQuery](classes/PhraseQuery.md)
|
||||
- [Query](classes/Query.md)
|
||||
- [QueryBase](classes/QueryBase.md)
|
||||
- [RecordBatchIterator](classes/RecordBatchIterator.md)
|
||||
- [Session](classes/Session.md)
|
||||
- [StaticHeaderProvider](classes/StaticHeaderProvider.md)
|
||||
- [Table](classes/Table.md)
|
||||
@@ -105,6 +104,7 @@
|
||||
|
||||
## Functions
|
||||
|
||||
- [RecordBatchIterator](functions/RecordBatchIterator.md)
|
||||
- [connect](functions/connect.md)
|
||||
- [makeArrowTable](functions/makeArrowTable.md)
|
||||
- [packBits](functions/packBits.md)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.3-beta.2</version>
|
||||
<version>0.22.3-beta.3</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.3-beta.2</version>
|
||||
<version>0.22.3-beta.3</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.3-beta.2</version>
|
||||
<version>0.22.3-beta.3</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
|
||||
13
nodejs/AGENTS.md
Normal file
13
nodejs/AGENTS.md
Normal file
@@ -0,0 +1,13 @@
|
||||
These are the typescript bindings of LanceDB.
|
||||
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
||||
code is in the `src/` directory and the typescript bindings are in
|
||||
the `lancedb/` directory.
|
||||
|
||||
Whenever you change the Rust code, you will need to recompile: `npm run build`.
|
||||
|
||||
Common commands:
|
||||
* Build: `npm run build`
|
||||
* Lint: `npm run lint`
|
||||
* Fix lints: `npm run lint-fix`
|
||||
* Test: `npm test`
|
||||
* Run single test file: `npm test __test__/arrow.test.ts`
|
||||
@@ -1,13 +0,0 @@
|
||||
These are the typescript bindings of LanceDB.
|
||||
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
||||
code is in the `src/` directory and the typescript bindings are in
|
||||
the `lancedb/` directory.
|
||||
|
||||
Whenever you change the Rust code, you will need to recompile: `npm run build`.
|
||||
|
||||
Common commands:
|
||||
* Build: `npm run build`
|
||||
* Lint: `npm run lint`
|
||||
* Fix lints: `npm run lint-fix`
|
||||
* Test: `npm test`
|
||||
* Run single test file: `npm test __test__/arrow.test.ts`
|
||||
1
nodejs/CLAUDE.md
Symbolic link
1
nodejs/CLAUDE.md
Symbolic link
@@ -0,0 +1 @@
|
||||
AGENTS.md
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.22.3-beta.2"
|
||||
version = "0.22.3-beta.3"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -20,35 +20,25 @@ import {
|
||||
} from "./native";
|
||||
import { Reranker } from "./rerankers";
|
||||
|
||||
export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
|
||||
private promisedInner?: Promise<NativeBatchIterator>;
|
||||
private inner?: NativeBatchIterator;
|
||||
export async function* RecordBatchIterator(
|
||||
promisedInner: Promise<NativeBatchIterator>,
|
||||
) {
|
||||
const inner = await promisedInner;
|
||||
|
||||
constructor(promise?: Promise<NativeBatchIterator>) {
|
||||
// TODO: check promise reliably so we dont need to pass two arguments.
|
||||
this.promisedInner = promise;
|
||||
if (inner === undefined) {
|
||||
throw new Error("Invalid iterator state");
|
||||
}
|
||||
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
async next(): Promise<IteratorResult<RecordBatch<any>>> {
|
||||
if (this.inner === undefined) {
|
||||
this.inner = await this.promisedInner;
|
||||
}
|
||||
if (this.inner === undefined) {
|
||||
throw new Error("Invalid iterator state state");
|
||||
}
|
||||
const n = await this.inner.next();
|
||||
if (n == null) {
|
||||
return Promise.resolve({ done: true, value: null });
|
||||
}
|
||||
const tbl = tableFromIPC(n);
|
||||
if (tbl.batches.length != 1) {
|
||||
for (let buffer = await inner.next(); buffer; buffer = await inner.next()) {
|
||||
const { batches } = tableFromIPC(buffer);
|
||||
|
||||
if (batches.length !== 1) {
|
||||
throw new Error("Expected only one batch");
|
||||
}
|
||||
return Promise.resolve({ done: false, value: tbl.batches[0] });
|
||||
|
||||
yield batches[0];
|
||||
}
|
||||
}
|
||||
/* eslint-enable */
|
||||
|
||||
class RecordBatchIterable<
|
||||
NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
|
||||
@@ -64,7 +54,7 @@ class RecordBatchIterable<
|
||||
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>, any, undefined> {
|
||||
return new RecordBatchIterator(
|
||||
return RecordBatchIterator(
|
||||
this.inner.execute(this.options?.maxBatchLength, this.options?.timeoutMs),
|
||||
);
|
||||
}
|
||||
@@ -231,10 +221,8 @@ export class QueryBase<
|
||||
* single query)
|
||||
*
|
||||
*/
|
||||
protected execute(
|
||||
options?: Partial<QueryExecutionOptions>,
|
||||
): RecordBatchIterator {
|
||||
return new RecordBatchIterator(this.nativeExecute(options));
|
||||
protected execute(options?: Partial<QueryExecutionOptions>) {
|
||||
return RecordBatchIterator(this.nativeExecute(options));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -242,8 +230,7 @@ export class QueryBase<
|
||||
*/
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>> {
|
||||
const promise = this.nativeExecute();
|
||||
return new RecordBatchIterator(promise);
|
||||
return RecordBatchIterator(this.nativeExecute());
|
||||
}
|
||||
|
||||
/** Collect the results as an Arrow @see {@link ArrowTable}. */
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.22.3-beta.2",
|
||||
"version": "0.22.3-beta.3",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.25.3-beta.3"
|
||||
current_version = "0.25.3-beta.4"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
19
python/AGENTS.md
Normal file
19
python/AGENTS.md
Normal file
@@ -0,0 +1,19 @@
|
||||
These are the Python bindings of LanceDB.
|
||||
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
||||
code is in the `src/` directory and the Python bindings are in the `lancedb/` directory.
|
||||
|
||||
Common commands:
|
||||
|
||||
* Build: `make develop`
|
||||
* Format: `make format`
|
||||
* Lint: `make check`
|
||||
* Fix lints: `make fix`
|
||||
* Test: `make test`
|
||||
* Doc test: `make doctest`
|
||||
|
||||
Before committing changes, run lints and then formatting.
|
||||
|
||||
When you change the Rust code, you will need to recompile the Python bindings: `make develop`.
|
||||
|
||||
When you export new types from Rust to Python, you must manually update `python/lancedb/_lancedb.pyi`
|
||||
with the corresponding type hints. You can run `pyright` to check for type errors in the Python code.
|
||||
@@ -1,19 +0,0 @@
|
||||
These are the Python bindings of LanceDB.
|
||||
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
||||
code is in the `src/` directory and the Python bindings are in the `lancedb/` directory.
|
||||
|
||||
Common commands:
|
||||
|
||||
* Build: `make develop`
|
||||
* Format: `make format`
|
||||
* Lint: `make check`
|
||||
* Fix lints: `make fix`
|
||||
* Test: `make test`
|
||||
* Doc test: `make doctest`
|
||||
|
||||
Before committing changes, run lints and then formatting.
|
||||
|
||||
When you change the Rust code, you will need to recompile the Python bindings: `make develop`.
|
||||
|
||||
When you export new types from Rust to Python, you must manually update `python/lancedb/_lancedb.pyi`
|
||||
with the corresponding type hints. You can run `pyright` to check for type errors in the Python code.
|
||||
1
python/CLAUDE.md
Symbolic link
1
python/CLAUDE.md
Symbolic link
@@ -0,0 +1 @@
|
||||
AGENTS.md
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.25.3-beta.3"
|
||||
version = "0.25.3-beta.4"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
import base64
|
||||
import os
|
||||
from typing import ClassVar, TYPE_CHECKING, List, Union, Any
|
||||
from typing import ClassVar, TYPE_CHECKING, List, Union, Any, Generator
|
||||
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
@@ -19,6 +19,23 @@ from .utils import api_key_not_found_help, IMAGES, TEXT
|
||||
if TYPE_CHECKING:
|
||||
import PIL
|
||||
|
||||
# Token limits for different VoyageAI models
|
||||
VOYAGE_TOTAL_TOKEN_LIMITS = {
|
||||
"voyage-context-3": 32_000,
|
||||
"voyage-3.5-lite": 1_000_000,
|
||||
"voyage-3.5": 320_000,
|
||||
"voyage-3-lite": 120_000,
|
||||
"voyage-3": 120_000,
|
||||
"voyage-multimodal-3": 120_000,
|
||||
"voyage-finance-2": 120_000,
|
||||
"voyage-multilingual-2": 120_000,
|
||||
"voyage-law-2": 120_000,
|
||||
"voyage-code-2": 120_000,
|
||||
}
|
||||
|
||||
# Batch size for embedding requests (max number of items per batch)
|
||||
BATCH_SIZE = 1000
|
||||
|
||||
|
||||
def is_valid_url(text):
|
||||
try:
|
||||
@@ -120,6 +137,9 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
name: str
|
||||
The name of the model to use. List of acceptable models:
|
||||
|
||||
* voyage-context-3
|
||||
* voyage-3.5
|
||||
* voyage-3.5-lite
|
||||
* voyage-3
|
||||
* voyage-3-lite
|
||||
* voyage-multimodal-3
|
||||
@@ -157,25 +177,35 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
name: str
|
||||
client: ClassVar = None
|
||||
text_embedding_models: list = [
|
||||
"voyage-3.5",
|
||||
"voyage-3.5-lite",
|
||||
"voyage-3",
|
||||
"voyage-3-lite",
|
||||
"voyage-finance-2",
|
||||
"voyage-multilingual-2",
|
||||
"voyage-law-2",
|
||||
"voyage-code-2",
|
||||
]
|
||||
multimodal_embedding_models: list = ["voyage-multimodal-3"]
|
||||
contextual_embedding_models: list = ["voyage-context-3"]
|
||||
|
||||
def _is_multimodal_model(self, model_name: str):
|
||||
return (
|
||||
model_name in self.multimodal_embedding_models or "multimodal" in model_name
|
||||
)
|
||||
|
||||
def _is_contextual_model(self, model_name: str):
|
||||
return model_name in self.contextual_embedding_models or "context" in model_name
|
||||
|
||||
def ndims(self):
|
||||
if self.name == "voyage-3-lite":
|
||||
return 512
|
||||
elif self.name == "voyage-code-2":
|
||||
return 1536
|
||||
elif self.name in [
|
||||
"voyage-context-3",
|
||||
"voyage-3.5",
|
||||
"voyage-3.5-lite",
|
||||
"voyage-3",
|
||||
"voyage-multimodal-3",
|
||||
"voyage-finance-2",
|
||||
@@ -207,6 +237,11 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
result = client.multimodal_embed(
|
||||
inputs=[[query]], model=self.name, input_type="query", **kwargs
|
||||
)
|
||||
elif self._is_contextual_model(self.name):
|
||||
result = client.contextualized_embed(
|
||||
inputs=[[query]], model=self.name, input_type="query", **kwargs
|
||||
)
|
||||
result = result.results[0]
|
||||
else:
|
||||
result = client.embed(
|
||||
texts=[query], model=self.name, input_type="query", **kwargs
|
||||
@@ -231,18 +266,164 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
||||
List[np.array]: the list of embeddings
|
||||
"""
|
||||
client = VoyageAIEmbeddingFunction._get_client()
|
||||
|
||||
# For multimodal models, check if inputs contain images
|
||||
if self._is_multimodal_model(self.name):
|
||||
inputs = sanitize_multimodal_input(inputs)
|
||||
result = client.multimodal_embed(
|
||||
inputs=inputs, model=self.name, input_type="document", **kwargs
|
||||
sanitized = sanitize_multimodal_input(inputs)
|
||||
has_images = any(
|
||||
inp["content"][0].get("type") != "text" for inp in sanitized
|
||||
)
|
||||
if has_images:
|
||||
# Use non-batched API for images
|
||||
result = client.multimodal_embed(
|
||||
inputs=sanitized, model=self.name, input_type="document", **kwargs
|
||||
)
|
||||
return result.embeddings
|
||||
# Extract texts for batching
|
||||
inputs = [inp["content"][0]["text"] for inp in sanitized]
|
||||
else:
|
||||
inputs = sanitize_text_input(inputs)
|
||||
result = client.embed(
|
||||
texts=inputs, model=self.name, input_type="document", **kwargs
|
||||
)
|
||||
|
||||
return result.embeddings
|
||||
# Use batching for all text inputs
|
||||
return self._embed_with_batching(
|
||||
client, inputs, input_type="document", **kwargs
|
||||
)
|
||||
|
||||
def _build_batches(
|
||||
self, client, texts: List[str]
|
||||
) -> Generator[List[str], None, None]:
|
||||
"""
|
||||
Generate batches of texts based on token limits using a generator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
client : voyageai.Client
|
||||
The VoyageAI client instance.
|
||||
texts : List[str]
|
||||
List of texts to batch.
|
||||
|
||||
Yields
|
||||
------
|
||||
List[str]: Batches of texts.
|
||||
"""
|
||||
if not texts:
|
||||
return
|
||||
|
||||
max_tokens_per_batch = VOYAGE_TOTAL_TOKEN_LIMITS.get(self.name, 120_000)
|
||||
current_batch: List[str] = []
|
||||
current_batch_tokens = 0
|
||||
|
||||
# Tokenize all texts in one API call
|
||||
token_lists = client.tokenize(texts, model=self.name)
|
||||
token_counts = [len(token_list) for token_list in token_lists]
|
||||
|
||||
for i, text in enumerate(texts):
|
||||
n_tokens = token_counts[i]
|
||||
|
||||
# Check if adding this text would exceed limits
|
||||
if current_batch and (
|
||||
len(current_batch) >= BATCH_SIZE
|
||||
or (current_batch_tokens + n_tokens > max_tokens_per_batch)
|
||||
):
|
||||
# Yield the current batch and start a new one
|
||||
yield current_batch
|
||||
current_batch = []
|
||||
current_batch_tokens = 0
|
||||
|
||||
current_batch.append(text)
|
||||
current_batch_tokens += n_tokens
|
||||
|
||||
# Yield the last batch (always has at least one text)
|
||||
if current_batch:
|
||||
yield current_batch
|
||||
|
||||
def _get_embed_function(
|
||||
self, client, input_type: str = "document", **kwargs
|
||||
) -> callable:
|
||||
"""
|
||||
Get the appropriate embedding function based on model type.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
client : voyageai.Client
|
||||
The VoyageAI client instance.
|
||||
input_type : str
|
||||
Either "query" or "document"
|
||||
**kwargs
|
||||
Additional arguments to pass to the embedding API
|
||||
|
||||
Returns
|
||||
-------
|
||||
callable: A function that takes a batch of texts and returns embeddings.
|
||||
"""
|
||||
if self._is_multimodal_model(self.name):
|
||||
|
||||
def embed_batch(batch: List[str]) -> List[np.array]:
|
||||
batch_inputs = sanitize_multimodal_input(batch)
|
||||
result = client.multimodal_embed(
|
||||
inputs=batch_inputs,
|
||||
model=self.name,
|
||||
input_type=input_type,
|
||||
**kwargs,
|
||||
)
|
||||
return result.embeddings
|
||||
|
||||
return embed_batch
|
||||
|
||||
elif self._is_contextual_model(self.name):
|
||||
|
||||
def embed_batch(batch: List[str]) -> List[np.array]:
|
||||
result = client.contextualized_embed(
|
||||
inputs=[batch], model=self.name, input_type=input_type, **kwargs
|
||||
)
|
||||
return result.results[0].embeddings
|
||||
|
||||
return embed_batch
|
||||
|
||||
else:
|
||||
|
||||
def embed_batch(batch: List[str]) -> List[np.array]:
|
||||
result = client.embed(
|
||||
texts=batch, model=self.name, input_type=input_type, **kwargs
|
||||
)
|
||||
return result.embeddings
|
||||
|
||||
return embed_batch
|
||||
|
||||
def _embed_with_batching(
|
||||
self, client, texts: List[str], input_type: str = "document", **kwargs
|
||||
) -> List[np.array]:
|
||||
"""
|
||||
Embed texts with automatic batching based on token limits.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
client : voyageai.Client
|
||||
The VoyageAI client instance.
|
||||
texts : List[str]
|
||||
List of texts to embed.
|
||||
input_type : str
|
||||
Either "query" or "document"
|
||||
**kwargs
|
||||
Additional arguments to pass to the embedding API
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[np.array]: List of embeddings.
|
||||
"""
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
# Get the appropriate embedding function for this model type
|
||||
embed_fn = self._get_embed_function(client, input_type=input_type, **kwargs)
|
||||
|
||||
# Process each batch
|
||||
all_embeddings = []
|
||||
for batch in self._build_batches(client, texts):
|
||||
batch_embeddings = embed_fn(batch)
|
||||
all_embeddings.extend(batch_embeddings)
|
||||
|
||||
return all_embeddings
|
||||
|
||||
@staticmethod
|
||||
def _get_client():
|
||||
|
||||
@@ -21,6 +21,8 @@ class VoyageAIReranker(Reranker):
|
||||
----------
|
||||
model_name : str, default "rerank-english-v2.0"
|
||||
The name of the cross encoder model to use. Available voyageai models are:
|
||||
- rerank-2.5
|
||||
- rerank-2.5-lite
|
||||
- rerank-2
|
||||
- rerank-2-lite
|
||||
column : str, default "text"
|
||||
|
||||
@@ -532,6 +532,27 @@ def test_voyageai_embedding_function():
|
||||
assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||
)
|
||||
def test_voyageai_embedding_function_contextual_model():
|
||||
voyageai = (
|
||||
get_registry().get("voyageai").create(name="voyage-context-3", max_retries=0)
|
||||
)
|
||||
|
||||
class TextModel(LanceModel):
|
||||
text: str = voyageai.SourceField()
|
||||
vector: Vector(voyageai.ndims()) = voyageai.VectorField()
|
||||
|
||||
df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
|
||||
db = lancedb.connect("~/lancedb")
|
||||
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||
|
||||
tbl.add(df)
|
||||
assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||
|
||||
@@ -484,7 +484,7 @@ def test_jina_reranker(tmp_path, use_tantivy):
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_voyageai_reranker(tmp_path, use_tantivy):
|
||||
pytest.importorskip("voyageai")
|
||||
reranker = VoyageAIReranker(model_name="rerank-2")
|
||||
reranker = VoyageAIReranker(model_name="rerank-2.5")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.22.3-beta.2"
|
||||
version = "0.22.3-beta.3"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
Reference in New Issue
Block a user