mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 15:12:53 +00:00
Compare commits
7 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e1f9b011f8 | ||
|
|
d664b8739f | ||
|
|
20bec61ecb | ||
|
|
45255be42c | ||
|
|
93c2cf2f59 | ||
|
|
9d29c83f81 | ||
|
|
2a6143b5bd |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.22.3-beta.2"
|
current_version = "0.22.3-beta.3"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
101
AGENTS.md
Normal file
101
AGENTS.md
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
|
||||||
|
It is a wrapper around Lance. There are two backends: local (in-process like SQLite) and
|
||||||
|
remote (against LanceDB Cloud).
|
||||||
|
|
||||||
|
The core of LanceDB is written in Rust. There are bindings in Python, TypeScript, and Java.
|
||||||
|
|
||||||
|
Project layout:
|
||||||
|
|
||||||
|
* `rust/lancedb`: The LanceDB core Rust implementation.
|
||||||
|
* `python`: The Python bindings, using PyO3.
|
||||||
|
* `nodejs`: The TypeScript bindings, using napi-rs.
|
||||||
|
* `java`: The Java bindings
|
||||||
|
|
||||||
|
Common commands:
|
||||||
|
|
||||||
|
* Check for compiler errors: `cargo check --quiet --features remote --tests --examples`
|
||||||
|
* Run tests: `cargo test --quiet --features remote --tests`
|
||||||
|
* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
|
||||||
|
* Lint: `cargo clippy --quiet --features remote --tests --examples`
|
||||||
|
* Format: `cargo fmt --all`
|
||||||
|
|
||||||
|
Before committing changes, run formatting.
|
||||||
|
|
||||||
|
## Coding tips
|
||||||
|
|
||||||
|
* When writing Rust doctests for things that require a connection or table reference,
|
||||||
|
write them as a function instead of a fully executable test. This allows type checking
|
||||||
|
to run but avoids needing a full test environment. For example:
|
||||||
|
```rust
|
||||||
|
/// ```
|
||||||
|
/// use lance_index::scalar::FullTextSearchQuery;
|
||||||
|
/// use lancedb::query::{QueryBase, ExecutableQuery};
|
||||||
|
///
|
||||||
|
/// # use lancedb::Table;
|
||||||
|
/// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
/// let results = table.query()
|
||||||
|
/// .full_text_search(FullTextSearchQuery::new("hello world".into()))
|
||||||
|
/// .execute()
|
||||||
|
/// .await?;
|
||||||
|
/// # Ok(())
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example plan: adding a new method on Table
|
||||||
|
|
||||||
|
Adding a new method involves first adding it to the Rust core, then exposing it
|
||||||
|
in the Python and TypeScript bindings. There are both local and remote tables.
|
||||||
|
Remote tables are implemented via an HTTP API and require the `remote` cargo
|
||||||
|
feature flag to be enabled. Python has both sync and async methods.
|
||||||
|
|
||||||
|
Rust core changes:
|
||||||
|
|
||||||
|
1. Add method on `Table` struct in `rust/lancedb/src/table.rs` (calls `BaseTable` trait).
|
||||||
|
2. Add method to `BaseTable` trait in `rust/lancedb/src/table.rs`.
|
||||||
|
3. Implement new trait method on `NativeTable` in `rust/lancedb/src/table.rs`.
|
||||||
|
* Test with unit test in `rust/lancedb/src/table.rs`.
|
||||||
|
4. Implement new trait method on `RemoteTable` in `rust/lancedb/src/remote/table.rs`.
|
||||||
|
* Test with unit test in `rust/lancedb/src/remote/table.rs` against mocked endpoint.
|
||||||
|
|
||||||
|
Python bindings changes:
|
||||||
|
|
||||||
|
1. Add PyO3 method binding in `python/src/table.rs`. Run `make develop` to compile bindings.
|
||||||
|
2. Add types for PyO3 method in `python/python/lancedb/_lancedb.pyi`.
|
||||||
|
3. Add method to `AsyncTable` class in `python/python/lancedb/table.py`.
|
||||||
|
4. Add abstract method to `Table` abstract base class in `python/python/lancedb/table.py`.
|
||||||
|
5. Add concrete sync method to `LanceTable` class in `python/python/lancedb/table.py`.
|
||||||
|
* Should use `LOOP.run()` to call the corresponding `AsyncTable` method.
|
||||||
|
6. Add concrete sync method to `RemoteTable` class in `python/python/lancedb/remote/table.py`.
|
||||||
|
7. Add unit test in `python/tests/test_table.py`.
|
||||||
|
|
||||||
|
TypeScript bindings changes:
|
||||||
|
|
||||||
|
1. Add napi-rs method binding on `Table` in `nodejs/src/table.rs`.
|
||||||
|
2. Run `npm run build` to generate TypeScript definitions.
|
||||||
|
3. Add TypeScript method on abstract class `Table` in `nodejs/src/table.ts`.
|
||||||
|
4. Add concrete method on `LocalTable` class in `nodejs/src/native_table.ts`.
|
||||||
|
* Note: despite the name, this class is also used for remote tables.
|
||||||
|
5. Add test in `nodejs/__test__/table.test.ts`.
|
||||||
|
6. Run `npm run docs` to generate TypeScript documentation.
|
||||||
|
|
||||||
|
## Review Guidelines
|
||||||
|
|
||||||
|
Please consider the following when reviewing code contributions.
|
||||||
|
|
||||||
|
### Rust API design
|
||||||
|
* Design public APIs so they can be evolved easily in the future without breaking
|
||||||
|
changes. Often this means using builder patterns or options structs instead of
|
||||||
|
long argument lists.
|
||||||
|
* For public APIs, prefer inputs that use `Into<T>` or `AsRef<T>` traits to allow
|
||||||
|
more flexible inputs. For example, use `name: impl Into<String>` instead of `name: String`,
|
||||||
|
so we don't have to write `func("my_string".to_string())`.
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
* Ensure all new public APIs have documentation and examples.
|
||||||
|
* Ensure that all bugfixes and features have corresponding tests. **We do not merge
|
||||||
|
code without tests.**
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
* New features must include updates to the rust documentation comments. Link to
|
||||||
|
relevant structs and methods to increase the value of documentation.
|
||||||
80
CLAUDE.md
80
CLAUDE.md
@@ -1,80 +0,0 @@
|
|||||||
LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
|
|
||||||
It is a wrapper around Lance. There are two backends: local (in-process like SQLite) and
|
|
||||||
remote (against LanceDB Cloud).
|
|
||||||
|
|
||||||
The core of LanceDB is written in Rust. There are bindings in Python, Typescript, and Java.
|
|
||||||
|
|
||||||
Project layout:
|
|
||||||
|
|
||||||
* `rust/lancedb`: The LanceDB core Rust implementation.
|
|
||||||
* `python`: The Python bindings, using PyO3.
|
|
||||||
* `nodejs`: The Typescript bindings, using napi-rs
|
|
||||||
* `java`: The Java bindings
|
|
||||||
|
|
||||||
Common commands:
|
|
||||||
|
|
||||||
* Check for compiler errors: `cargo check --quiet --features remote --tests --examples`
|
|
||||||
* Run tests: `cargo test --quiet --features remote --tests`
|
|
||||||
* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
|
|
||||||
* Lint: `cargo clippy --quiet --features remote --tests --examples`
|
|
||||||
* Format: `cargo fmt --all`
|
|
||||||
|
|
||||||
Before committing changes, run formatting.
|
|
||||||
|
|
||||||
## Coding tips
|
|
||||||
|
|
||||||
* When writing Rust doctests for things that require a connection or table reference,
|
|
||||||
write them as a function instead of a fully executable test. This allows type checking
|
|
||||||
to run but avoids needing a full test environment. For example:
|
|
||||||
```rust
|
|
||||||
/// ```
|
|
||||||
/// use lance_index::scalar::FullTextSearchQuery;
|
|
||||||
/// use lancedb::query::{QueryBase, ExecutableQuery};
|
|
||||||
///
|
|
||||||
/// # use lancedb::Table;
|
|
||||||
/// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
|
|
||||||
/// let results = table.query()
|
|
||||||
/// .full_text_search(FullTextSearchQuery::new("hello world".into()))
|
|
||||||
/// .execute()
|
|
||||||
/// .await?;
|
|
||||||
/// # Ok(())
|
|
||||||
/// # }
|
|
||||||
/// ```
|
|
||||||
```
|
|
||||||
|
|
||||||
## Example plan: adding a new method on Table
|
|
||||||
|
|
||||||
Adding a new method involves first adding it to the Rust core, then exposing it
|
|
||||||
in the Python and TypeScript bindings. There are both local and remote tables.
|
|
||||||
Remote tables are implemented via a HTTP API and require the `remote` cargo
|
|
||||||
feature flag to be enabled. Python has both sync and async methods.
|
|
||||||
|
|
||||||
Rust core changes:
|
|
||||||
|
|
||||||
1. Add method on `Table` struct in `rust/lancedb/src/table.rs` (calls `BaseTable` trait).
|
|
||||||
2. Add method to `BaseTable` trait in `rust/lancedb/src/table.rs`.
|
|
||||||
3. Implement new trait method on `NativeTable` in `rust/lancedb/src/table.rs`.
|
|
||||||
* Test with unit test in `rust/lancedb/src/table.rs`.
|
|
||||||
4. Implement new trait method on `RemoteTable` in `rust/lancedb/src/remote/table.rs`.
|
|
||||||
* Test with unit test in `rust/lancedb/src/remote/table.rs` against mocked endpoint.
|
|
||||||
|
|
||||||
Python bindings changes:
|
|
||||||
|
|
||||||
1. Add PyO3 method binding in `python/src/table.rs`. Run `make develop` to compile bindings.
|
|
||||||
2. Add types for PyO3 method in `python/python/lancedb/_lancedb.pyi`.
|
|
||||||
3. Add method to `AsyncTable` class in `python/python/lancedb/table.py`.
|
|
||||||
4. Add abstract method to `Table` abstract base class in `python/python/lancedb/table.py`.
|
|
||||||
5. Add concrete sync method to `LanceTable` class in `python/python/lancedb/table.py`.
|
|
||||||
* Should use `LOOP.run()` to call the corresponding `AsyncTable` method.
|
|
||||||
6. Add concrete sync method to `RemoteTable` class in `python/python/lancedb/remote/table.py`.
|
|
||||||
7. Add unit test in `python/tests/test_table.py`.
|
|
||||||
|
|
||||||
TypeScript bindings changes:
|
|
||||||
|
|
||||||
1. Add napi-rs method binding on `Table` in `nodejs/src/table.rs`.
|
|
||||||
2. Run `npm run build` to generate TypeScript definitions.
|
|
||||||
3. Add typescript method on abstract class `Table` in `nodejs/src/table.ts`.
|
|
||||||
4. Add concrete method on `LocalTable` class in `nodejs/src/native_table.ts`.
|
|
||||||
* Note: despite the name, this class is also used for remote tables.
|
|
||||||
5. Add test in `nodejs/__test__/table.test.ts`.
|
|
||||||
6. Run `npm run docs` to generate TypeScript documentation.
|
|
||||||
91
Cargo.lock
generated
91
Cargo.lock
generated
@@ -2933,18 +2933,6 @@ version = "0.2.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
|
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fastbloom"
|
|
||||||
version = "0.14.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "18c1ddb9231d8554c2d6bdf4cfaabf0c59251658c68b6c95cd52dd0c513a912a"
|
|
||||||
dependencies = [
|
|
||||||
"getrandom 0.3.3",
|
|
||||||
"libm",
|
|
||||||
"rand 0.9.2",
|
|
||||||
"siphasher",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fastdivide"
|
name = "fastdivide"
|
||||||
version = "0.4.2"
|
version = "0.4.2"
|
||||||
@@ -3044,8 +3032,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fsst"
|
name = "fsst"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"rand 0.9.2",
|
"rand 0.9.2",
|
||||||
@@ -4229,8 +4217,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance"
|
name = "lance"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
@@ -4293,8 +4281,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-arrow"
|
name = "lance-arrow"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-buffer",
|
"arrow-buffer",
|
||||||
@@ -4312,8 +4300,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-bitpacking"
|
name = "lance-bitpacking"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrayref",
|
"arrayref",
|
||||||
"paste",
|
"paste",
|
||||||
@@ -4322,8 +4310,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-core"
|
name = "lance-core"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-buffer",
|
"arrow-buffer",
|
||||||
@@ -4359,8 +4347,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-datafusion"
|
name = "lance-datafusion"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4389,8 +4377,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-datagen"
|
name = "lance-datagen"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4407,8 +4395,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-encoding"
|
name = "lance-encoding"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4445,8 +4433,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-file"
|
name = "lance-file"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4471,7 +4459,6 @@ dependencies = [
|
|||||||
"prost",
|
"prost",
|
||||||
"prost-build",
|
"prost-build",
|
||||||
"prost-types",
|
"prost-types",
|
||||||
"roaring",
|
|
||||||
"snafu",
|
"snafu",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
@@ -4479,8 +4466,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-index"
|
name = "lance-index"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
@@ -4502,7 +4489,6 @@ dependencies = [
|
|||||||
"datafusion-sql",
|
"datafusion-sql",
|
||||||
"deepsize",
|
"deepsize",
|
||||||
"dirs",
|
"dirs",
|
||||||
"fastbloom",
|
|
||||||
"fst",
|
"fst",
|
||||||
"futures",
|
"futures",
|
||||||
"half",
|
"half",
|
||||||
@@ -4542,8 +4528,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-io"
|
name = "lance-io"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-arith",
|
"arrow-arith",
|
||||||
@@ -4583,32 +4569,25 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-linalg"
|
name = "lance-linalg"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-buffer",
|
"arrow-buffer",
|
||||||
"arrow-ord",
|
|
||||||
"arrow-schema",
|
"arrow-schema",
|
||||||
"bitvec",
|
|
||||||
"cc",
|
"cc",
|
||||||
"deepsize",
|
"deepsize",
|
||||||
"futures",
|
|
||||||
"half",
|
"half",
|
||||||
"lance-arrow",
|
"lance-arrow",
|
||||||
"lance-core",
|
"lance-core",
|
||||||
"log",
|
|
||||||
"num-traits",
|
"num-traits",
|
||||||
"rand 0.9.2",
|
"rand 0.9.2",
|
||||||
"rayon",
|
|
||||||
"tokio",
|
|
||||||
"tracing",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-namespace"
|
name = "lance-namespace"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
@@ -4620,8 +4599,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-namespace-impls"
|
name = "lance-namespace-impls"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-ipc",
|
"arrow-ipc",
|
||||||
@@ -4654,8 +4633,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-table"
|
name = "lance-table"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
@@ -4693,8 +4672,8 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance-testing"
|
name = "lance-testing"
|
||||||
version = "0.38.3-beta.11"
|
version = "0.38.3"
|
||||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3-beta.11#db497cb2373156679aa0d6a1f2087880a8579bc6"
|
source = "git+https://github.com/lancedb/lance.git?tag=v0.38.3#afc0f9832cf11d0bf74381c2b63fd37de1c5f415"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-schema",
|
"arrow-schema",
|
||||||
@@ -4705,7 +4684,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.22.3-beta.2"
|
version = "0.22.3-beta.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ahash",
|
"ahash",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
@@ -4802,7 +4781,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
version = "0.22.3-beta.2"
|
version = "0.22.3-beta.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
"arrow-ipc",
|
"arrow-ipc",
|
||||||
@@ -4822,7 +4801,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.25.3-beta.2"
|
version = "0.25.3-beta.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
|
|||||||
28
Cargo.toml
28
Cargo.toml
@@ -15,20 +15,20 @@ categories = ["database-implementations"]
|
|||||||
rust-version = "1.78.0"
|
rust-version = "1.78.0"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.38.3-beta.11", default-features = false, "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance = { "version" = "=0.38.3", default-features = false, "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-core = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-core = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-datagen = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-datagen = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-file = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-file = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-io = { "version" = "=0.38.3-beta.11", default-features = false, "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-io = { "version" = "=0.38.3", default-features = false, "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-index = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-index = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-linalg = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-linalg = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-namespace = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-namespace = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-namespace-impls = { "version" = "=0.38.3-beta.11", "features" = ["dir-aws", "dir-gcp", "dir-azure", "dir-oss", "rest"], "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-namespace-impls = { "version" = "=0.38.3", "features" = ["dir-aws", "dir-gcp", "dir-azure", "dir-oss", "rest"], "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-table = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-table = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-testing = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-testing = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-datafusion = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-datafusion = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-encoding = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-encoding = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
lance-arrow = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
lance-arrow = { "version" = "=0.38.3", "tag" = "v0.38.3", "git" = "https://github.com/lancedb/lance.git" }
|
||||||
ahash = "0.8"
|
ahash = "0.8"
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "56.2", optional = false }
|
arrow = { version = "56.2", optional = false }
|
||||||
|
|||||||
@@ -0,0 +1,97 @@
|
|||||||
|
# VoyageAI Embeddings : Multimodal
|
||||||
|
|
||||||
|
VoyageAI embeddings can also be used to embed both text and image data. Only some of the models support image data; you can check the list
|
||||||
|
under [https://docs.voyageai.com/docs/multimodal-embeddings](https://docs.voyageai.com/docs/multimodal-embeddings)
|
||||||
|
|
||||||
|
Supported parameters (to be passed in `create` method) are:
|
||||||
|
|
||||||
|
| Parameter | Type | Default Value | Description |
|
||||||
|
|---|---|-------------------------|-------------------------------------------|
|
||||||
|
| `name` | `str` | `"voyage-multimodal-3"` | The model ID of the VoyageAI model to use |
|
||||||
|
|
||||||
|
Usage Example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import lancedb
|
||||||
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
|
from lancedb.embeddings import get_registry
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
os.environ['VOYAGE_API_KEY'] = 'YOUR_VOYAGE_API_KEY'
|
||||||
|
|
||||||
|
db = lancedb.connect(".lancedb")
|
||||||
|
func = get_registry().get("voyageai").create(name="voyage-multimodal-3")
|
||||||
|
|
||||||
|
|
||||||
|
def image_to_base64(image_bytes: bytes):
|
||||||
|
buffered = BytesIO(image_bytes)
|
||||||
|
img_str = base64.b64encode(buffered.getvalue())
|
||||||
|
return img_str.decode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
class Images(LanceModel):
|
||||||
|
label: str
|
||||||
|
image_uri: str = func.SourceField() # image uri as the source
|
||||||
|
image_bytes: str = func.SourceField() # image bytes base64 encoded as the source
|
||||||
|
vector: Vector(func.ndims()) = func.VectorField() # vector column
|
||||||
|
vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column
|
||||||
|
|
||||||
|
|
||||||
|
if "images" in db.table_names():
|
||||||
|
db.drop_table("images")
|
||||||
|
table = db.create_table("images", schema=Images)
|
||||||
|
labels = ["cat", "cat", "dog", "dog", "horse", "horse"]
|
||||||
|
uris = [
|
||||||
|
"http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
|
||||||
|
"http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
|
||||||
|
"http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
|
||||||
|
"http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
|
||||||
|
"http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
|
||||||
|
"http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
|
||||||
|
]
|
||||||
|
# get each uri as bytes
|
||||||
|
images_bytes = [image_to_base64(requests.get(uri).content) for uri in uris]
|
||||||
|
table.add(
|
||||||
|
pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": images_bytes})
|
||||||
|
)
|
||||||
|
```
|
||||||
|
Now we can search using text from both the default vector column and the custom vector column
|
||||||
|
```python
|
||||||
|
|
||||||
|
# text search
|
||||||
|
actual = table.search("man's best friend", "vec_from_bytes").limit(1).to_pydantic(Images)[0]
|
||||||
|
print(actual.label) # prints "dog"
|
||||||
|
|
||||||
|
frombytes = (
|
||||||
|
table.search("man's best friend", vector_column_name="vec_from_bytes")
|
||||||
|
.limit(1)
|
||||||
|
.to_pydantic(Images)[0]
|
||||||
|
)
|
||||||
|
print(frombytes.label)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Because we're using a multi-modal embedding function, we can also search using images:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# image search
|
||||||
|
query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg"
|
||||||
|
image_bytes = requests.get(query_image_uri).content
|
||||||
|
query_image = Image.open(BytesIO(image_bytes))
|
||||||
|
actual = table.search(query_image, "vec_from_bytes").limit(1).to_pydantic(Images)[0]
|
||||||
|
print(actual.label == "dog")
|
||||||
|
|
||||||
|
# image search using a custom vector column
|
||||||
|
other = (
|
||||||
|
table.search(query_image, vector_column_name="vec_from_bytes")
|
||||||
|
.limit(1)
|
||||||
|
.to_pydantic(Images)[0]
|
||||||
|
)
|
||||||
|
print(actual.label)
|
||||||
|
|
||||||
|
```
|
||||||
@@ -397,117 +397,6 @@ For **read-only access**, LanceDB will need a policy such as:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
#### DynamoDB Commit Store for concurrent writes
|
|
||||||
|
|
||||||
By default, S3 does not support concurrent writes. Having two or more processes
|
|
||||||
writing to the same table at the same time can lead to data corruption. This is
|
|
||||||
because S3, unlike other object stores, does not have any atomic put or copy
|
|
||||||
operation.
|
|
||||||
|
|
||||||
To enable concurrent writes, you can configure LanceDB to use a DynamoDB table
|
|
||||||
as a commit store. This table will be used to coordinate writes between
|
|
||||||
different processes. To enable this feature, you must modify your connection
|
|
||||||
URI to use the `s3+ddb` scheme and add a query parameter `ddbTableName` with the
|
|
||||||
name of the table to use.
|
|
||||||
|
|
||||||
=== "Python"
|
|
||||||
|
|
||||||
=== "Sync API"
|
|
||||||
|
|
||||||
```python
|
|
||||||
import lancedb
|
|
||||||
db = lancedb.connect(
|
|
||||||
"s3+ddb://bucket/path?ddbTableName=my-dynamodb-table",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
=== "Async API"
|
|
||||||
|
|
||||||
```python
|
|
||||||
import lancedb
|
|
||||||
async_db = await lancedb.connect_async(
|
|
||||||
"s3+ddb://bucket/path?ddbTableName=my-dynamodb-table",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
=== "JavaScript"
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
const lancedb = require("lancedb");
|
|
||||||
|
|
||||||
const db = await lancedb.connect(
|
|
||||||
"s3+ddb://bucket/path?ddbTableName=my-dynamodb-table",
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
The DynamoDB table must be created with the following schema:
|
|
||||||
|
|
||||||
- Hash key: `base_uri` (string)
|
|
||||||
- Range key: `version` (number)
|
|
||||||
|
|
||||||
You can create this programmatically with:
|
|
||||||
|
|
||||||
=== "Python"
|
|
||||||
|
|
||||||
<!-- skip-test -->
|
|
||||||
```python
|
|
||||||
import boto3
|
|
||||||
|
|
||||||
dynamodb = boto3.client("dynamodb")
|
|
||||||
table = dynamodb.create_table(
|
|
||||||
TableName=table_name,
|
|
||||||
KeySchema=[
|
|
||||||
{"AttributeName": "base_uri", "KeyType": "HASH"},
|
|
||||||
{"AttributeName": "version", "KeyType": "RANGE"},
|
|
||||||
],
|
|
||||||
AttributeDefinitions=[
|
|
||||||
{"AttributeName": "base_uri", "AttributeType": "S"},
|
|
||||||
{"AttributeName": "version", "AttributeType": "N"},
|
|
||||||
],
|
|
||||||
ProvisionedThroughput={"ReadCapacityUnits": 1, "WriteCapacityUnits": 1},
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
=== "JavaScript"
|
|
||||||
|
|
||||||
<!-- skip-test -->
|
|
||||||
```javascript
|
|
||||||
import {
|
|
||||||
CreateTableCommand,
|
|
||||||
DynamoDBClient,
|
|
||||||
} from "@aws-sdk/client-dynamodb";
|
|
||||||
|
|
||||||
const dynamodb = new DynamoDBClient({
|
|
||||||
region: CONFIG.awsRegion,
|
|
||||||
credentials: {
|
|
||||||
accessKeyId: CONFIG.awsAccessKeyId,
|
|
||||||
secretAccessKey: CONFIG.awsSecretAccessKey,
|
|
||||||
},
|
|
||||||
endpoint: CONFIG.awsEndpoint,
|
|
||||||
});
|
|
||||||
const command = new CreateTableCommand({
|
|
||||||
TableName: table_name,
|
|
||||||
AttributeDefinitions: [
|
|
||||||
{
|
|
||||||
AttributeName: "base_uri",
|
|
||||||
AttributeType: "S",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
AttributeName: "version",
|
|
||||||
AttributeType: "N",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
KeySchema: [
|
|
||||||
{ AttributeName: "base_uri", KeyType: "HASH" },
|
|
||||||
{ AttributeName: "version", KeyType: "RANGE" },
|
|
||||||
],
|
|
||||||
ProvisionedThroughput: {
|
|
||||||
ReadCapacityUnits: 1,
|
|
||||||
WriteCapacityUnits: 1,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
await client.send(command);
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
#### S3-compatible stores
|
#### S3-compatible stores
|
||||||
|
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ AnalyzeExec verbose=true, metrics=[]
|
|||||||
### execute()
|
### execute()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
protected execute(options?): RecordBatchIterator
|
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||||
```
|
```
|
||||||
|
|
||||||
Execute the query and return the results as an
|
Execute the query and return the results as an
|
||||||
@@ -91,7 +91,7 @@ Execute the query and return the results as an
|
|||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||||
|
|
||||||
#### See
|
#### See
|
||||||
|
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ AnalyzeExec verbose=true, metrics=[]
|
|||||||
### execute()
|
### execute()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
protected execute(options?): RecordBatchIterator
|
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||||
```
|
```
|
||||||
|
|
||||||
Execute the query and return the results as an
|
Execute the query and return the results as an
|
||||||
@@ -92,7 +92,7 @@ Execute the query and return the results as an
|
|||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||||
|
|
||||||
#### See
|
#### See
|
||||||
|
|
||||||
|
|||||||
@@ -1,43 +0,0 @@
|
|||||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
|
||||||
|
|
||||||
***
|
|
||||||
|
|
||||||
[@lancedb/lancedb](../globals.md) / RecordBatchIterator
|
|
||||||
|
|
||||||
# Class: RecordBatchIterator
|
|
||||||
|
|
||||||
## Implements
|
|
||||||
|
|
||||||
- `AsyncIterator`<`RecordBatch`>
|
|
||||||
|
|
||||||
## Constructors
|
|
||||||
|
|
||||||
### new RecordBatchIterator()
|
|
||||||
|
|
||||||
```ts
|
|
||||||
new RecordBatchIterator(promise?): RecordBatchIterator
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Parameters
|
|
||||||
|
|
||||||
* **promise?**: `Promise`<`RecordBatchIterator`>
|
|
||||||
|
|
||||||
#### Returns
|
|
||||||
|
|
||||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
|
||||||
|
|
||||||
## Methods
|
|
||||||
|
|
||||||
### next()
|
|
||||||
|
|
||||||
```ts
|
|
||||||
next(): Promise<IteratorResult<RecordBatch<any>, any>>
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Returns
|
|
||||||
|
|
||||||
`Promise`<`IteratorResult`<`RecordBatch`<`any`>, `any`>>
|
|
||||||
|
|
||||||
#### Implementation of
|
|
||||||
|
|
||||||
`AsyncIterator.next`
|
|
||||||
@@ -76,7 +76,7 @@ AnalyzeExec verbose=true, metrics=[]
|
|||||||
### execute()
|
### execute()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
protected execute(options?): RecordBatchIterator
|
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||||
```
|
```
|
||||||
|
|
||||||
Execute the query and return the results as an
|
Execute the query and return the results as an
|
||||||
@@ -87,7 +87,7 @@ Execute the query and return the results as an
|
|||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||||
|
|
||||||
#### See
|
#### See
|
||||||
|
|
||||||
|
|||||||
@@ -221,7 +221,7 @@ also increase the latency of your query. The default value is 1.5*limit.
|
|||||||
### execute()
|
### execute()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
protected execute(options?): RecordBatchIterator
|
protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||||
```
|
```
|
||||||
|
|
||||||
Execute the query and return the results as an
|
Execute the query and return the results as an
|
||||||
@@ -232,7 +232,7 @@ Execute the query and return the results as an
|
|||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
[`RecordBatchIterator`](RecordBatchIterator.md)
|
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||||
|
|
||||||
#### See
|
#### See
|
||||||
|
|
||||||
|
|||||||
19
docs/src/js/functions/RecordBatchIterator.md
Normal file
19
docs/src/js/functions/RecordBatchIterator.md
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
[@lancedb/lancedb](../globals.md) / RecordBatchIterator
|
||||||
|
|
||||||
|
# Function: RecordBatchIterator()
|
||||||
|
|
||||||
|
```ts
|
||||||
|
function RecordBatchIterator(promisedInner): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parameters
|
||||||
|
|
||||||
|
* **promisedInner**: `Promise`<`RecordBatchIterator`>
|
||||||
|
|
||||||
|
## Returns
|
||||||
|
|
||||||
|
`AsyncGenerator`<`RecordBatch`<`any`>, `void`, `unknown`>
|
||||||
@@ -32,7 +32,6 @@
|
|||||||
- [PhraseQuery](classes/PhraseQuery.md)
|
- [PhraseQuery](classes/PhraseQuery.md)
|
||||||
- [Query](classes/Query.md)
|
- [Query](classes/Query.md)
|
||||||
- [QueryBase](classes/QueryBase.md)
|
- [QueryBase](classes/QueryBase.md)
|
||||||
- [RecordBatchIterator](classes/RecordBatchIterator.md)
|
|
||||||
- [Session](classes/Session.md)
|
- [Session](classes/Session.md)
|
||||||
- [StaticHeaderProvider](classes/StaticHeaderProvider.md)
|
- [StaticHeaderProvider](classes/StaticHeaderProvider.md)
|
||||||
- [Table](classes/Table.md)
|
- [Table](classes/Table.md)
|
||||||
@@ -105,6 +104,7 @@
|
|||||||
|
|
||||||
## Functions
|
## Functions
|
||||||
|
|
||||||
|
- [RecordBatchIterator](functions/RecordBatchIterator.md)
|
||||||
- [connect](functions/connect.md)
|
- [connect](functions/connect.md)
|
||||||
- [makeArrowTable](functions/makeArrowTable.md)
|
- [makeArrowTable](functions/makeArrowTable.md)
|
||||||
- [packBits](functions/packBits.md)
|
- [packBits](functions/packBits.md)
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.22.3-beta.2</version>
|
<version>0.22.3-beta.3</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.22.3-beta.2</version>
|
<version>0.22.3-beta.3</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.22.3-beta.2</version>
|
<version>0.22.3-beta.3</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<name>${project.artifactId}</name>
|
<name>${project.artifactId}</name>
|
||||||
<description>LanceDB Java SDK Parent POM</description>
|
<description>LanceDB Java SDK Parent POM</description>
|
||||||
|
|||||||
13
nodejs/AGENTS.md
Normal file
13
nodejs/AGENTS.md
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
These are the typescript bindings of LanceDB.
|
||||||
|
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
||||||
|
code is in the `src/` directory and the typescript bindings are in
|
||||||
|
the `lancedb/` directory.
|
||||||
|
|
||||||
|
Whenever you change the Rust code, you will need to recompile: `npm run build`.
|
||||||
|
|
||||||
|
Common commands:
|
||||||
|
* Build: `npm run build`
|
||||||
|
* Lint: `npm run lint`
|
||||||
|
* Fix lints: `npm run lint-fix`
|
||||||
|
* Test: `npm test`
|
||||||
|
* Run single test file: `npm test __test__/arrow.test.ts`
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
These are the typescript bindings of LanceDB.
|
|
||||||
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
|
||||||
code is in the `src/` directory and the typescript bindings are in
|
|
||||||
the `lancedb/` directory.
|
|
||||||
|
|
||||||
Whenever you change the Rust code, you will need to recompile: `npm run build`.
|
|
||||||
|
|
||||||
Common commands:
|
|
||||||
* Build: `npm run build`
|
|
||||||
* Lint: `npm run lint`
|
|
||||||
* Fix lints: `npm run lint-fix`
|
|
||||||
* Test: `npm test`
|
|
||||||
* Run single test file: `npm test __test__/arrow.test.ts`
|
|
||||||
1
nodejs/CLAUDE.md
Symbolic link
1
nodejs/CLAUDE.md
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
AGENTS.md
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.22.3-beta.2"
|
version = "0.22.3-beta.3"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
|
|||||||
@@ -20,35 +20,25 @@ import {
|
|||||||
} from "./native";
|
} from "./native";
|
||||||
import { Reranker } from "./rerankers";
|
import { Reranker } from "./rerankers";
|
||||||
|
|
||||||
export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
|
export async function* RecordBatchIterator(
|
||||||
private promisedInner?: Promise<NativeBatchIterator>;
|
promisedInner: Promise<NativeBatchIterator>,
|
||||||
private inner?: NativeBatchIterator;
|
) {
|
||||||
|
const inner = await promisedInner;
|
||||||
|
|
||||||
constructor(promise?: Promise<NativeBatchIterator>) {
|
if (inner === undefined) {
|
||||||
// TODO: check promise reliably so we dont need to pass two arguments.
|
throw new Error("Invalid iterator state");
|
||||||
this.promisedInner = promise;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
for (let buffer = await inner.next(); buffer; buffer = await inner.next()) {
|
||||||
async next(): Promise<IteratorResult<RecordBatch<any>>> {
|
const { batches } = tableFromIPC(buffer);
|
||||||
if (this.inner === undefined) {
|
|
||||||
this.inner = await this.promisedInner;
|
if (batches.length !== 1) {
|
||||||
}
|
|
||||||
if (this.inner === undefined) {
|
|
||||||
throw new Error("Invalid iterator state state");
|
|
||||||
}
|
|
||||||
const n = await this.inner.next();
|
|
||||||
if (n == null) {
|
|
||||||
return Promise.resolve({ done: true, value: null });
|
|
||||||
}
|
|
||||||
const tbl = tableFromIPC(n);
|
|
||||||
if (tbl.batches.length != 1) {
|
|
||||||
throw new Error("Expected only one batch");
|
throw new Error("Expected only one batch");
|
||||||
}
|
}
|
||||||
return Promise.resolve({ done: false, value: tbl.batches[0] });
|
|
||||||
|
yield batches[0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* eslint-enable */
|
|
||||||
|
|
||||||
class RecordBatchIterable<
|
class RecordBatchIterable<
|
||||||
NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
|
NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
|
||||||
@@ -64,7 +54,7 @@ class RecordBatchIterable<
|
|||||||
|
|
||||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||||
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>, any, undefined> {
|
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>, any, undefined> {
|
||||||
return new RecordBatchIterator(
|
return RecordBatchIterator(
|
||||||
this.inner.execute(this.options?.maxBatchLength, this.options?.timeoutMs),
|
this.inner.execute(this.options?.maxBatchLength, this.options?.timeoutMs),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -231,10 +221,8 @@ export class QueryBase<
|
|||||||
* single query)
|
* single query)
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
protected execute(
|
protected execute(options?: Partial<QueryExecutionOptions>) {
|
||||||
options?: Partial<QueryExecutionOptions>,
|
return RecordBatchIterator(this.nativeExecute(options));
|
||||||
): RecordBatchIterator {
|
|
||||||
return new RecordBatchIterator(this.nativeExecute(options));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -242,8 +230,7 @@ export class QueryBase<
|
|||||||
*/
|
*/
|
||||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||||
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>> {
|
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>> {
|
||||||
const promise = this.nativeExecute();
|
return RecordBatchIterator(this.nativeExecute());
|
||||||
return new RecordBatchIterator(promise);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Collect the results as an Arrow @see {@link ArrowTable}. */
|
/** Collect the results as an Arrow @see {@link ArrowTable}. */
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-musl.node",
|
"main": "lancedb.linux-arm64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-musl.node",
|
"main": "lancedb.linux-x64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"private": false,
|
"private": false,
|
||||||
"version": "0.22.3-beta.2",
|
"version": "0.22.3-beta.3",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.25.3-beta.3"
|
current_version = "0.25.3-beta.4"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
19
python/AGENTS.md
Normal file
19
python/AGENTS.md
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
These are the Python bindings of LanceDB.
|
||||||
|
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
||||||
|
code is in the `src/` directory and the Python bindings are in the `lancedb/` directory.
|
||||||
|
|
||||||
|
Common commands:
|
||||||
|
|
||||||
|
* Build: `make develop`
|
||||||
|
* Format: `make format`
|
||||||
|
* Lint: `make check`
|
||||||
|
* Fix lints: `make fix`
|
||||||
|
* Test: `make test`
|
||||||
|
* Doc test: `make doctest`
|
||||||
|
|
||||||
|
Before committing changes, run lints and then formatting.
|
||||||
|
|
||||||
|
When you change the Rust code, you will need to recompile the Python bindings: `make develop`.
|
||||||
|
|
||||||
|
When you export new types from Rust to Python, you must manually update `python/lancedb/_lancedb.pyi`
|
||||||
|
with the corresponding type hints. You can run `pyright` to check for type errors in the Python code.
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
These are the Python bindings of LanceDB.
|
|
||||||
The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
|
||||||
code is in the `src/` directory and the Python bindings are in the `lancedb/` directory.
|
|
||||||
|
|
||||||
Common commands:
|
|
||||||
|
|
||||||
* Build: `make develop`
|
|
||||||
* Format: `make format`
|
|
||||||
* Lint: `make check`
|
|
||||||
* Fix lints: `make fix`
|
|
||||||
* Test: `make test`
|
|
||||||
* Doc test: `make doctest`
|
|
||||||
|
|
||||||
Before committing changes, run lints and then formatting.
|
|
||||||
|
|
||||||
When you change the Rust code, you will need to recompile the Python bindings: `make develop`.
|
|
||||||
|
|
||||||
When you export new types from Rust to Python, you must manually update `python/lancedb/_lancedb.pyi`
|
|
||||||
with the corresponding type hints. You can run `pyright` to check for type errors in the Python code.
|
|
||||||
1
python/CLAUDE.md
Symbolic link
1
python/CLAUDE.md
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
AGENTS.md
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.25.3-beta.3"
|
version = "0.25.3-beta.4"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
import base64
|
import base64
|
||||||
import os
|
import os
|
||||||
from typing import ClassVar, TYPE_CHECKING, List, Union, Any
|
from typing import ClassVar, TYPE_CHECKING, List, Union, Any, Generator
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@@ -19,6 +19,23 @@ from .utils import api_key_not_found_help, IMAGES, TEXT
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import PIL
|
import PIL
|
||||||
|
|
||||||
|
# Token limits for different VoyageAI models
|
||||||
|
VOYAGE_TOTAL_TOKEN_LIMITS = {
|
||||||
|
"voyage-context-3": 32_000,
|
||||||
|
"voyage-3.5-lite": 1_000_000,
|
||||||
|
"voyage-3.5": 320_000,
|
||||||
|
"voyage-3-lite": 120_000,
|
||||||
|
"voyage-3": 120_000,
|
||||||
|
"voyage-multimodal-3": 120_000,
|
||||||
|
"voyage-finance-2": 120_000,
|
||||||
|
"voyage-multilingual-2": 120_000,
|
||||||
|
"voyage-law-2": 120_000,
|
||||||
|
"voyage-code-2": 120_000,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Batch size for embedding requests (max number of items per batch)
|
||||||
|
BATCH_SIZE = 1000
|
||||||
|
|
||||||
|
|
||||||
def is_valid_url(text):
|
def is_valid_url(text):
|
||||||
try:
|
try:
|
||||||
@@ -120,6 +137,9 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
|||||||
name: str
|
name: str
|
||||||
The name of the model to use. List of acceptable models:
|
The name of the model to use. List of acceptable models:
|
||||||
|
|
||||||
|
* voyage-context-3
|
||||||
|
* voyage-3.5
|
||||||
|
* voyage-3.5-lite
|
||||||
* voyage-3
|
* voyage-3
|
||||||
* voyage-3-lite
|
* voyage-3-lite
|
||||||
* voyage-multimodal-3
|
* voyage-multimodal-3
|
||||||
@@ -157,25 +177,35 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
|||||||
name: str
|
name: str
|
||||||
client: ClassVar = None
|
client: ClassVar = None
|
||||||
text_embedding_models: list = [
|
text_embedding_models: list = [
|
||||||
|
"voyage-3.5",
|
||||||
|
"voyage-3.5-lite",
|
||||||
"voyage-3",
|
"voyage-3",
|
||||||
"voyage-3-lite",
|
"voyage-3-lite",
|
||||||
"voyage-finance-2",
|
"voyage-finance-2",
|
||||||
|
"voyage-multilingual-2",
|
||||||
"voyage-law-2",
|
"voyage-law-2",
|
||||||
"voyage-code-2",
|
"voyage-code-2",
|
||||||
]
|
]
|
||||||
multimodal_embedding_models: list = ["voyage-multimodal-3"]
|
multimodal_embedding_models: list = ["voyage-multimodal-3"]
|
||||||
|
contextual_embedding_models: list = ["voyage-context-3"]
|
||||||
|
|
||||||
def _is_multimodal_model(self, model_name: str):
|
def _is_multimodal_model(self, model_name: str):
|
||||||
return (
|
return (
|
||||||
model_name in self.multimodal_embedding_models or "multimodal" in model_name
|
model_name in self.multimodal_embedding_models or "multimodal" in model_name
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _is_contextual_model(self, model_name: str):
|
||||||
|
return model_name in self.contextual_embedding_models or "context" in model_name
|
||||||
|
|
||||||
def ndims(self):
|
def ndims(self):
|
||||||
if self.name == "voyage-3-lite":
|
if self.name == "voyage-3-lite":
|
||||||
return 512
|
return 512
|
||||||
elif self.name == "voyage-code-2":
|
elif self.name == "voyage-code-2":
|
||||||
return 1536
|
return 1536
|
||||||
elif self.name in [
|
elif self.name in [
|
||||||
|
"voyage-context-3",
|
||||||
|
"voyage-3.5",
|
||||||
|
"voyage-3.5-lite",
|
||||||
"voyage-3",
|
"voyage-3",
|
||||||
"voyage-multimodal-3",
|
"voyage-multimodal-3",
|
||||||
"voyage-finance-2",
|
"voyage-finance-2",
|
||||||
@@ -207,6 +237,11 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
|||||||
result = client.multimodal_embed(
|
result = client.multimodal_embed(
|
||||||
inputs=[[query]], model=self.name, input_type="query", **kwargs
|
inputs=[[query]], model=self.name, input_type="query", **kwargs
|
||||||
)
|
)
|
||||||
|
elif self._is_contextual_model(self.name):
|
||||||
|
result = client.contextualized_embed(
|
||||||
|
inputs=[[query]], model=self.name, input_type="query", **kwargs
|
||||||
|
)
|
||||||
|
result = result.results[0]
|
||||||
else:
|
else:
|
||||||
result = client.embed(
|
result = client.embed(
|
||||||
texts=[query], model=self.name, input_type="query", **kwargs
|
texts=[query], model=self.name, input_type="query", **kwargs
|
||||||
@@ -231,18 +266,164 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
|
|||||||
List[np.array]: the list of embeddings
|
List[np.array]: the list of embeddings
|
||||||
"""
|
"""
|
||||||
client = VoyageAIEmbeddingFunction._get_client()
|
client = VoyageAIEmbeddingFunction._get_client()
|
||||||
|
|
||||||
|
# For multimodal models, check if inputs contain images
|
||||||
if self._is_multimodal_model(self.name):
|
if self._is_multimodal_model(self.name):
|
||||||
inputs = sanitize_multimodal_input(inputs)
|
sanitized = sanitize_multimodal_input(inputs)
|
||||||
result = client.multimodal_embed(
|
has_images = any(
|
||||||
inputs=inputs, model=self.name, input_type="document", **kwargs
|
inp["content"][0].get("type") != "text" for inp in sanitized
|
||||||
)
|
)
|
||||||
|
if has_images:
|
||||||
|
# Use non-batched API for images
|
||||||
|
result = client.multimodal_embed(
|
||||||
|
inputs=sanitized, model=self.name, input_type="document", **kwargs
|
||||||
|
)
|
||||||
|
return result.embeddings
|
||||||
|
# Extract texts for batching
|
||||||
|
inputs = [inp["content"][0]["text"] for inp in sanitized]
|
||||||
else:
|
else:
|
||||||
inputs = sanitize_text_input(inputs)
|
inputs = sanitize_text_input(inputs)
|
||||||
result = client.embed(
|
|
||||||
texts=inputs, model=self.name, input_type="document", **kwargs
|
|
||||||
)
|
|
||||||
|
|
||||||
return result.embeddings
|
# Use batching for all text inputs
|
||||||
|
return self._embed_with_batching(
|
||||||
|
client, inputs, input_type="document", **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def _build_batches(
|
||||||
|
self, client, texts: List[str]
|
||||||
|
) -> Generator[List[str], None, None]:
|
||||||
|
"""
|
||||||
|
Generate batches of texts based on token limits using a generator.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
client : voyageai.Client
|
||||||
|
The VoyageAI client instance.
|
||||||
|
texts : List[str]
|
||||||
|
List of texts to batch.
|
||||||
|
|
||||||
|
Yields
|
||||||
|
------
|
||||||
|
List[str]: Batches of texts.
|
||||||
|
"""
|
||||||
|
if not texts:
|
||||||
|
return
|
||||||
|
|
||||||
|
max_tokens_per_batch = VOYAGE_TOTAL_TOKEN_LIMITS.get(self.name, 120_000)
|
||||||
|
current_batch: List[str] = []
|
||||||
|
current_batch_tokens = 0
|
||||||
|
|
||||||
|
# Tokenize all texts in one API call
|
||||||
|
token_lists = client.tokenize(texts, model=self.name)
|
||||||
|
token_counts = [len(token_list) for token_list in token_lists]
|
||||||
|
|
||||||
|
for i, text in enumerate(texts):
|
||||||
|
n_tokens = token_counts[i]
|
||||||
|
|
||||||
|
# Check if adding this text would exceed limits
|
||||||
|
if current_batch and (
|
||||||
|
len(current_batch) >= BATCH_SIZE
|
||||||
|
or (current_batch_tokens + n_tokens > max_tokens_per_batch)
|
||||||
|
):
|
||||||
|
# Yield the current batch and start a new one
|
||||||
|
yield current_batch
|
||||||
|
current_batch = []
|
||||||
|
current_batch_tokens = 0
|
||||||
|
|
||||||
|
current_batch.append(text)
|
||||||
|
current_batch_tokens += n_tokens
|
||||||
|
|
||||||
|
# Yield the last batch (always has at least one text)
|
||||||
|
if current_batch:
|
||||||
|
yield current_batch
|
||||||
|
|
||||||
|
def _get_embed_function(
|
||||||
|
self, client, input_type: str = "document", **kwargs
|
||||||
|
) -> callable:
|
||||||
|
"""
|
||||||
|
Get the appropriate embedding function based on model type.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
client : voyageai.Client
|
||||||
|
The VoyageAI client instance.
|
||||||
|
input_type : str
|
||||||
|
Either "query" or "document"
|
||||||
|
**kwargs
|
||||||
|
Additional arguments to pass to the embedding API
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
callable: A function that takes a batch of texts and returns embeddings.
|
||||||
|
"""
|
||||||
|
if self._is_multimodal_model(self.name):
|
||||||
|
|
||||||
|
def embed_batch(batch: List[str]) -> List[np.array]:
|
||||||
|
batch_inputs = sanitize_multimodal_input(batch)
|
||||||
|
result = client.multimodal_embed(
|
||||||
|
inputs=batch_inputs,
|
||||||
|
model=self.name,
|
||||||
|
input_type=input_type,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
return result.embeddings
|
||||||
|
|
||||||
|
return embed_batch
|
||||||
|
|
||||||
|
elif self._is_contextual_model(self.name):
|
||||||
|
|
||||||
|
def embed_batch(batch: List[str]) -> List[np.array]:
|
||||||
|
result = client.contextualized_embed(
|
||||||
|
inputs=[batch], model=self.name, input_type=input_type, **kwargs
|
||||||
|
)
|
||||||
|
return result.results[0].embeddings
|
||||||
|
|
||||||
|
return embed_batch
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
def embed_batch(batch: List[str]) -> List[np.array]:
|
||||||
|
result = client.embed(
|
||||||
|
texts=batch, model=self.name, input_type=input_type, **kwargs
|
||||||
|
)
|
||||||
|
return result.embeddings
|
||||||
|
|
||||||
|
return embed_batch
|
||||||
|
|
||||||
|
def _embed_with_batching(
|
||||||
|
self, client, texts: List[str], input_type: str = "document", **kwargs
|
||||||
|
) -> List[np.array]:
|
||||||
|
"""
|
||||||
|
Embed texts with automatic batching based on token limits.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
client : voyageai.Client
|
||||||
|
The VoyageAI client instance.
|
||||||
|
texts : List[str]
|
||||||
|
List of texts to embed.
|
||||||
|
input_type : str
|
||||||
|
Either "query" or "document"
|
||||||
|
**kwargs
|
||||||
|
Additional arguments to pass to the embedding API
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
List[np.array]: List of embeddings.
|
||||||
|
"""
|
||||||
|
if not texts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Get the appropriate embedding function for this model type
|
||||||
|
embed_fn = self._get_embed_function(client, input_type=input_type, **kwargs)
|
||||||
|
|
||||||
|
# Process each batch
|
||||||
|
all_embeddings = []
|
||||||
|
for batch in self._build_batches(client, texts):
|
||||||
|
batch_embeddings = embed_fn(batch)
|
||||||
|
all_embeddings.extend(batch_embeddings)
|
||||||
|
|
||||||
|
return all_embeddings
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_client():
|
def _get_client():
|
||||||
|
|||||||
@@ -21,6 +21,8 @@ class VoyageAIReranker(Reranker):
|
|||||||
----------
|
----------
|
||||||
model_name : str, default "rerank-english-v2.0"
|
model_name : str, default "rerank-english-v2.0"
|
||||||
The name of the cross encoder model to use. Available voyageai models are:
|
The name of the cross encoder model to use. Available voyageai models are:
|
||||||
|
- rerank-2.5
|
||||||
|
- rerank-2.5-lite
|
||||||
- rerank-2
|
- rerank-2
|
||||||
- rerank-2-lite
|
- rerank-2-lite
|
||||||
column : str, default "text"
|
column : str, default "text"
|
||||||
|
|||||||
@@ -532,6 +532,27 @@ def test_voyageai_embedding_function():
|
|||||||
assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
|
assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ, reason="VOYAGE_API_KEY not set"
)
def test_voyageai_embedding_function_contextual_model():
    """Add two texts embedded with ``voyage-context-3`` and check vector width.

    The first stored vector must have exactly ``ndims()`` entries as reported
    by the embedding function created from the "voyageai" registry entry.
    """
    embedder = (
        get_registry().get("voyageai").create(name="voyage-context-3", max_retries=0)
    )

    class TextModel(LanceModel):
        text: str = embedder.SourceField()
        vector: Vector(embedder.ndims()) = embedder.VectorField()

    frame = pd.DataFrame({"text": ["hello world", "goodbye world"]})
    table = lancedb.connect("~/lancedb").create_table(
        "test", schema=TextModel, mode="overwrite"
    )

    table.add(frame)
    first_vector = table.to_pandas()["vector"][0]
    assert len(first_vector) == embedder.ndims()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||||
|
|||||||
@@ -484,7 +484,7 @@ def test_jina_reranker(tmp_path, use_tantivy):
|
|||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||||
def test_voyageai_reranker(tmp_path, use_tantivy):
|
def test_voyageai_reranker(tmp_path, use_tantivy):
|
||||||
pytest.importorskip("voyageai")
|
pytest.importorskip("voyageai")
|
||||||
reranker = VoyageAIReranker(model_name="rerank-2")
|
reranker = VoyageAIReranker(model_name="rerank-2.5")
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.22.3-beta.2"
|
version = "0.22.3-beta.3"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
Reference in New Issue
Block a user