Compare commits

..

8 Commits

Author SHA1 Message Date
Lance Release
11bc674548 Bump version: 0.31.0-beta.1 → 0.31.0-beta.2 2026-04-11 07:05:36 +00:00
LanceDB Robot
5593460823 chore: update lance dependency to v5.1.0-beta.2 (#3263)
## Summary
- Bump Lance Rust workspace dependencies from `5.0.0-beta.5` to
`5.1.0-beta.2` using `ci/set_lance_version.py`.
- Update Java `lance-core.version` in `java/pom.xml` to `5.1.0-beta.2`.
- Refresh `Cargo.lock` to match the new Lance tag.

## Verification
- `cargo clippy --workspace --tests --all-features -- -D warnings`
(passes)
- `cargo fmt --all` (passes)

## Triggering Tag
- https://github.com/lance-format/lance/releases/tag/v5.1.0-beta.2
2026-04-11 00:04:43 -07:00
Will Jones
2807ad6854 chore: bump Rust toolchain from 1.91.0 to 1.94.0 (#3257)
Bumps the Rust toolchain to 1.94.0 (latest installed) to unblock CI, which
was failing because of the AWS SDK's MSRV requirement. No lint fixes were
needed.

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 07:57:47 -07:00
Dhruv Garg
4761fa9bcb fix(python): migrate gemini-text provider to google-genai sdk (#3250)
## Summary
- migrate gemini-text embedding provider from deprecated
google.generativeai to google.genai
- update Python embedding extra dependency to google-genai
- update default model name to gemini-embedding-001
- adapt embed calls to Client().models.embed_content(...)
- apply lint fixes from CI

## Related
- Closes #3191
2026-04-09 15:28:34 -07:00
lennylxx
4c2939d66e fix(python): guard against None before .decode() on split_names metadata key (#3229)
`.get(b"split_names", None).decode()` was called unconditionally in both
Permutations.__init__ and Permutation.from_tables(), crashing with
AttributeError when schema metadata existed but lacked the split_names
key. Guard the decode behind a None check and add regression tests.
2026-04-08 16:04:13 -07:00
yaommen
a813ce2f71 fix(python): sanitize bad vectors before Arrow cast (#3158)
## Problem

`on_bad_vectors="drop"` is supposed to remove invalid vector rows before
write, but for some schema-defined vector columns it can still fail
later during Arrow cast instead of dropping the bad row.

Repro:
```python
class MySchema(LanceModel):
    text: str
    embedding: Vector(16)

table = db.create_table("test", schema=MySchema)
table.add(
    [
        {"text": "hello", "embedding": []},
        {"text": "bar", "embedding": [0.1] * 16},
    ],
    on_bad_vectors="drop",
)
```
Before:
```
RuntimeError
Arrow error: C Data interface error: Invalid: ListType can only be casted to FixedSizeListType if the lists are all the expected size.
```
After:
```
rows 1
texts ['bar']
```
## Solution

Make bad-vector sanitization use schema dimensions before cast, while
keeping the handling scoped to vector columns identified by schema
metadata or existing vector-name heuristics.

This also preserves existing integer vector inputs and avoids applying
`on_bad_vectors` to unrelated fixed-size float columns.


Fixes #1670

Signed-off-by: yaommen <myanstu@163.com>
2026-04-08 09:09:41 -07:00
Jack Ye
a898dc81c2 feat: add user_id field to ClientConfig for user identification (#3240)
## Summary

- Add a `user_id` field to `ClientConfig` that allows users to identify
themselves to LanceDB Cloud/Enterprise
- The user_id is sent as the `x-lancedb-user-id` HTTP header in all
requests
- Supports three configuration methods:
  - Direct assignment via `ClientConfig.user_id`
  - Environment variable `LANCEDB_USER_ID`
  - Indirect env var lookup via `LANCEDB_USER_ID_ENV_KEY`

Closes #3230

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-04-06 11:20:10 -07:00
Lance Release
de3f8097e7 Bump version: 0.28.0-beta.0 → 0.28.0-beta.1 2026-04-05 02:51:18 +00:00
23 changed files with 943 additions and 145 deletions

View File

@@ -8,6 +8,7 @@ on:
paths:
- Cargo.toml
- Cargo.lock
- rust-toolchain.toml
- nodejs/**
- rust/**
- docs/src/js/**

View File

@@ -8,6 +8,7 @@ on:
paths:
- Cargo.toml
- Cargo.lock
- rust-toolchain.toml
- python/**
- rust/**
- .github/workflows/python.yml

View File

@@ -8,6 +8,7 @@ on:
paths:
- Cargo.toml
- Cargo.lock
- rust-toolchain.toml
- rust/**
- .github/workflows/rust.yml

73
Cargo.lock generated
View File

@@ -3072,8 +3072,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow-array",
"rand 0.9.2",
@@ -4134,13 +4134,14 @@ dependencies = [
[[package]]
name = "lance"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"arrow-arith",
"arrow-array",
"arrow-buffer",
"arrow-cast",
"arrow-ipc",
"arrow-ord",
"arrow-row",
@@ -4201,13 +4202,14 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow-array",
"arrow-buffer",
"arrow-cast",
"arrow-data",
"arrow-ipc",
"arrow-ord",
"arrow-schema",
"arrow-select",
@@ -4222,8 +4224,8 @@ dependencies = [
[[package]]
name = "lance-bitpacking"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrayref",
"paste",
@@ -4232,8 +4234,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4270,12 +4272,13 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"arrow-array",
"arrow-buffer",
"arrow-cast",
"arrow-ord",
"arrow-schema",
"arrow-select",
@@ -4301,8 +4304,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"arrow-array",
@@ -4320,8 +4323,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4358,8 +4361,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4391,8 +4394,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"arrow-arith",
@@ -4456,8 +4459,8 @@ dependencies = [
[[package]]
name = "lance-io"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"arrow-arith",
@@ -4501,8 +4504,8 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4518,8 +4521,8 @@ dependencies = [
[[package]]
name = "lance-namespace"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"async-trait",
@@ -4532,8 +4535,8 @@ dependencies = [
[[package]]
name = "lance-namespace-impls"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"arrow-ipc",
@@ -4578,8 +4581,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow",
"arrow-array",
@@ -4618,8 +4621,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "5.0.0-beta.5"
source = "git+https://github.com/lance-format/lance.git?tag=v5.0.0-beta.5#d630106da5a238b3adfb8c5dea3b3921f3519945"
version = "5.1.0-beta.2"
source = "git+https://github.com/lance-format/lance.git?tag=v5.1.0-beta.2#df61d95cac9ab579e4bc4ff41d1bd749b24af7f0"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -4630,7 +4633,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.28.0-beta.0"
version = "0.28.0-beta.1"
dependencies = [
"ahash",
"anyhow",
@@ -4712,7 +4715,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.28.0-beta.0"
version = "0.28.0-beta.1"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4734,7 +4737,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.31.0-beta.0"
version = "0.31.0-beta.1"
dependencies = [
"arrow",
"async-trait",

View File

@@ -15,20 +15,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=5.0.0-beta.5", default-features = false, "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=5.0.0-beta.5", default-features = false, "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=5.0.0-beta.5", default-features = false, "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=5.0.0-beta.5", "tag" = "v5.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=5.1.0-beta.2", default-features = false, "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=5.1.0-beta.2", default-features = false, "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=5.1.0-beta.2", default-features = false, "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=5.1.0-beta.2", "tag" = "v5.1.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "57.2", optional = false }

View File

@@ -53,3 +53,18 @@ optional tlsConfig: TlsConfig;
```ts
optional userAgent: string;
```
***
### userId?
```ts
optional userId: string;
```
User identifier for tracking purposes.
This is sent as the `x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
It can be set directly, or via the `LANCEDB_USER_ID` environment variable.
Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another environment
variable that contains the user ID value.

View File

@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>5.0.0-beta.5</lance-core.version>
<lance-core.version>5.1.0-beta.2</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.28.0-beta.0",
"version": "0.28.0-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.28.0-beta.0",
"version": "0.28.0-beta.1",
"cpu": [
"x64",
"arm64"

View File

@@ -92,6 +92,13 @@ pub struct ClientConfig {
pub extra_headers: Option<HashMap<String, String>>,
pub id_delimiter: Option<String>,
pub tls_config: Option<TlsConfig>,
/// User identifier for tracking purposes.
///
/// This is sent as the `x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
/// It can be set directly, or via the `LANCEDB_USER_ID` environment variable.
/// Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another environment
/// variable that contains the user ID value.
pub user_id: Option<String>,
}
impl From<TimeoutConfig> for lancedb::remote::TimeoutConfig {
@@ -145,6 +152,7 @@ impl From<ClientConfig> for lancedb::remote::ClientConfig {
id_delimiter: config.id_delimiter,
tls_config: config.tls_config.map(Into::into),
header_provider: None, // the header provider is set separately later
user_id: config.user_id,
}
}
}

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.31.0-beta.1"
current_version = "0.31.0-beta.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.31.0-beta.1"
version = "0.31.0-beta.2"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -83,7 +83,7 @@ embeddings = [
"colpali-engine>=0.3.10",
"huggingface_hub>=0.19.0",
"InstructorEmbedding>=1.0.1",
"google.generativeai>=0.3.0",
"google-genai>=1.0.0",
"boto3>=1.28.57",
"awscli>=1.44.38",
"botocore>=1.31.57",

View File

@@ -19,10 +19,10 @@ from .utils import TEXT, api_key_not_found_help
@register("gemini-text")
class GeminiText(TextEmbeddingFunction):
"""
An embedding function that uses the Google's Gemini API. Requires GOOGLE_API_KEY to
An embedding function that uses Google's Gemini API. Requires GOOGLE_API_KEY to
be set.
https://ai.google.dev/docs/embeddings_guide
https://ai.google.dev/gemini-api/docs/embeddings
Supports various task types:
| Task Type | Description |
@@ -46,9 +46,12 @@ class GeminiText(TextEmbeddingFunction):
Parameters
----------
name: str, default "models/embedding-001"
The name of the model to use. See the Gemini documentation for a list of
available models.
name: str, default "gemini-embedding-001"
The name of the model to use. Supported models include:
- "gemini-embedding-001" (768 dimensions)
Note: The legacy "models/embedding-001" format is also supported but
"gemini-embedding-001" is recommended.
query_task_type: str, default "retrieval_query"
Sets the task type for the queries.
@@ -77,7 +80,7 @@ class GeminiText(TextEmbeddingFunction):
"""
name: str = "models/embedding-001"
name: str = "gemini-embedding-001"
query_task_type: str = "retrieval_query"
source_task_type: str = "retrieval_document"
@@ -114,23 +117,48 @@ class GeminiText(TextEmbeddingFunction):
texts: list[str] or np.ndarray (of str)
The texts to embed
"""
if (
kwargs.get("task_type") == "retrieval_document"
): # Provide a title to use existing API design
title = "Embedding of a document"
kwargs["title"] = title
from google.genai import types
return [
self.client.embed_content(model=self.name, content=text, **kwargs)[
"embedding"
]
for text in texts
]
task_type = kwargs.get("task_type")
# Build content objects for embed_content
contents = []
for text in texts:
if task_type == "retrieval_document":
# Provide a title for retrieval_document task
contents.append(
{"parts": [{"text": "Embedding of a document"}, {"text": text}]}
)
else:
contents.append({"parts": [{"text": text}]})
# Build config
config_kwargs = {}
if task_type:
config_kwargs["task_type"] = task_type.upper() # API expects uppercase
# Call embed_content for each content
embeddings = []
for content in contents:
config = (
types.EmbedContentConfig(**config_kwargs) if config_kwargs else None
)
response = self.client.models.embed_content(
model=self.name,
contents=content,
config=config,
)
embeddings.append(response.embeddings[0].values)
return embeddings
@cached_property
def client(self):
genai = attempt_import_or_raise("google.generativeai", "google.generativeai")
attempt_import_or_raise("google.genai", "google-genai")
if not os.environ.get("GOOGLE_API_KEY"):
api_key_not_found_help("google")
return genai
from google import genai as genai_module
return genai_module.Client(api_key=os.environ.get("GOOGLE_API_KEY"))

View File

@@ -284,9 +284,8 @@ class Permutations:
self.permutation_table = permutation_table
if permutation_table.schema.metadata is not None:
split_names = permutation_table.schema.metadata.get(
b"split_names", None
).decode("utf-8")
raw = permutation_table.schema.metadata.get(b"split_names")
split_names = raw.decode("utf-8") if raw is not None else None
if split_names is not None:
self.split_names = json.loads(split_names)
self.split_dict = {
@@ -460,9 +459,8 @@ class Permutation:
f"Cannot create a permutation on split `{split}`"
" because no split names are defined in the permutation table"
)
split_names = permutation_table.schema.metadata.get(
b"split_names", None
).decode("utf-8")
raw = permutation_table.schema.metadata.get(b"split_names")
split_names = raw.decode("utf-8") if raw is not None else None
if split_names is None:
raise ValueError(
f"Cannot create a permutation on split `{split}`"

View File

@@ -145,6 +145,33 @@ class TlsConfig:
@dataclass
class ClientConfig:
"""Configuration for the LanceDB Cloud HTTP client.
Attributes
----------
user_agent: str
User agent string sent with requests.
retry_config: RetryConfig
Configuration for retrying failed requests.
timeout_config: Optional[TimeoutConfig]
Configuration for request timeouts.
extra_headers: Optional[dict]
Additional headers to include in requests.
id_delimiter: Optional[str]
The delimiter to use when constructing object identifiers.
tls_config: Optional[TlsConfig]
TLS/mTLS configuration for secure connections.
header_provider: Optional[HeaderProvider]
Provider for dynamic headers to be added to each request.
user_id: Optional[str]
User identifier for tracking purposes. This is sent as the
`x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
This can also be set via the `LANCEDB_USER_ID` environment variable.
Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another
environment variable that contains the user ID value.
"""
user_agent: str = f"LanceDB-Python-Client/{__version__}"
retry_config: RetryConfig = field(default_factory=RetryConfig)
timeout_config: Optional[TimeoutConfig] = field(default_factory=TimeoutConfig)
@@ -152,6 +179,7 @@ class ClientConfig:
id_delimiter: Optional[str] = None
tls_config: Optional[TlsConfig] = None
header_provider: Optional["HeaderProvider"] = None
user_id: Optional[str] = None
def __post_init__(self):
if isinstance(self.retry_config, dict):

View File

@@ -270,15 +270,17 @@ def _sanitize_data(
reader,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
target_schema=target_schema,
metadata=metadata,
)
if target_schema is None:
target_schema, reader = _infer_target_schema(reader)
if metadata:
new_metadata = target_schema.metadata or {}
new_metadata.update(metadata)
target_schema = target_schema.with_metadata(new_metadata)
target_schema = target_schema.with_metadata(
_merge_metadata(target_schema.metadata, metadata)
)
_validate_schema(target_schema)
reader = _cast_to_target_schema(reader, target_schema, allow_subschema)
@@ -294,7 +296,7 @@ def _cast_to_target_schema(
# pa.Table.cast expects field order not to be changed.
# Lance doesn't care about field order, so we don't need to rearrange fields
# to match the target schema. We just need to correctly cast the fields.
if reader.schema == target_schema:
if reader.schema.equals(target_schema, check_metadata=True):
# Fast path when the schemas are already the same
return reader
@@ -314,7 +316,13 @@ def _cast_to_target_schema(
def gen():
for batch in reader:
# Table but not RecordBatch has cast.
yield pa.Table.from_batches([batch]).cast(reordered_schema).to_batches()[0]
cast_batches = (
pa.Table.from_batches([batch]).cast(reordered_schema).to_batches()
)
if cast_batches:
yield pa.RecordBatch.from_arrays(
cast_batches[0].columns, schema=reordered_schema
)
return pa.RecordBatchReader.from_batches(reordered_schema, gen())
@@ -332,37 +340,51 @@ def _align_field_types(
if target_field is None:
raise ValueError(f"Field '{field.name}' not found in target schema")
if pa.types.is_struct(target_field.type):
new_type = pa.struct(
_align_field_types(
field.type.fields,
target_field.type.fields,
if pa.types.is_struct(field.type):
new_type = pa.struct(
_align_field_types(
field.type.fields,
target_field.type.fields,
)
)
)
else:
new_type = target_field.type
elif pa.types.is_list(target_field.type):
new_type = pa.list_(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0]
)
if _is_list_like(field.type):
new_type = pa.list_(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0]
)
else:
new_type = target_field.type
elif pa.types.is_large_list(target_field.type):
new_type = pa.large_list(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0]
)
if _is_list_like(field.type):
new_type = pa.large_list(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0]
)
else:
new_type = target_field.type
elif pa.types.is_fixed_size_list(target_field.type):
new_type = pa.list_(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0],
target_field.type.list_size,
)
if _is_list_like(field.type):
new_type = pa.list_(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0],
target_field.type.list_size,
)
else:
new_type = target_field.type
else:
new_type = target_field.type
new_fields.append(pa.field(field.name, new_type, field.nullable))
new_fields.append(
pa.field(field.name, new_type, field.nullable, target_field.metadata)
)
return new_fields
@@ -440,6 +462,7 @@ def sanitize_create_table(
schema = data.schema
if metadata:
metadata = _merge_metadata(schema.metadata, metadata)
schema = schema.with_metadata(metadata)
# Need to apply metadata to the data as well
if isinstance(data, pa.Table):
@@ -492,9 +515,9 @@ def _append_vector_columns(
vector columns to the table.
"""
if schema is None:
metadata = metadata or {}
metadata = _merge_metadata(metadata)
else:
metadata = schema.metadata or metadata or {}
metadata = _merge_metadata(schema.metadata, metadata)
functions = EmbeddingFunctionRegistry.get_instance().parse_functions(metadata)
if not functions:
@@ -3211,43 +3234,157 @@ def _handle_bad_vectors(
reader: pa.RecordBatchReader,
on_bad_vectors: Literal["error", "drop", "fill", "null"] = "error",
fill_value: float = 0.0,
target_schema: Optional[pa.Schema] = None,
metadata: Optional[dict] = None,
) -> pa.RecordBatchReader:
vector_columns = []
vector_columns = _find_vector_columns(reader.schema, target_schema, metadata)
if not vector_columns:
return reader
for field in reader.schema:
# They can provide a 'vector' column that isn't yet a FSL
named_vector_col = (
(
pa.types.is_list(field.type)
or pa.types.is_large_list(field.type)
or pa.types.is_fixed_size_list(field.type)
)
and pa.types.is_floating(field.type.value_type)
and field.name == VECTOR_COLUMN_NAME
)
# TODO: we're making an assumption that fixed size list of 10 or more
# is a vector column. This is definitely a bit hacky.
likely_vector_col = (
pa.types.is_fixed_size_list(field.type)
and pa.types.is_floating(field.type.value_type)
and (field.type.list_size >= 10)
)
if named_vector_col or likely_vector_col:
vector_columns.append(field.name)
output_schema = _vector_output_schema(reader.schema, vector_columns)
def gen():
for batch in reader:
for name in vector_columns:
pending_dims = []
for vector_column in vector_columns:
dim = vector_column["expected_dim"]
if target_schema is not None and dim is None:
dim = _infer_vector_dim(batch[vector_column["name"]])
pending_dims.append(vector_column)
batch = _handle_bad_vector_column(
batch,
vector_column_name=name,
vector_column_name=vector_column["name"],
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
expected_dim=dim,
expected_value_type=vector_column["expected_value_type"],
)
yield batch
for vector_column in pending_dims:
if vector_column["expected_dim"] is None:
vector_column["expected_dim"] = _infer_vector_dim(
batch[vector_column["name"]]
)
if batch.schema.equals(output_schema, check_metadata=True):
yield batch
continue
return pa.RecordBatchReader.from_batches(reader.schema, gen())
cast_batches = (
pa.Table.from_batches([batch]).cast(output_schema).to_batches()
)
if cast_batches:
yield pa.RecordBatch.from_arrays(
cast_batches[0].columns,
schema=output_schema,
)
return pa.RecordBatchReader.from_batches(output_schema, gen())
def _find_vector_columns(
reader_schema: pa.Schema,
target_schema: Optional[pa.Schema],
metadata: Optional[dict],
) -> List[dict]:
if target_schema is None:
vector_columns = []
for field in reader_schema:
named_vector_col = (
_is_list_like(field.type)
and pa.types.is_floating(field.type.value_type)
and field.name == VECTOR_COLUMN_NAME
)
likely_vector_col = (
pa.types.is_fixed_size_list(field.type)
and pa.types.is_floating(field.type.value_type)
and (field.type.list_size >= 10)
)
if named_vector_col or likely_vector_col:
vector_columns.append(
{
"name": field.name,
"expected_dim": None,
"expected_value_type": None,
}
)
return vector_columns
reader_column_names = set(reader_schema.names)
active_metadata = _merge_metadata(target_schema.metadata, metadata)
embedding_function_columns = set(
EmbeddingFunctionRegistry.get_instance().parse_functions(active_metadata).keys()
)
vector_columns = []
for field in target_schema:
if field.name not in reader_column_names:
continue
if not _is_list_like(field.type) or not pa.types.is_floating(
field.type.value_type
):
continue
reader_field = reader_schema.field(field.name)
named_vector_col = (
field.name in embedding_function_columns
or field.name == VECTOR_COLUMN_NAME
or (field.name == "embedding" and pa.types.is_fixed_size_list(field.type))
)
typed_fixed_vector_col = (
pa.types.is_fixed_size_list(reader_field.type)
and pa.types.is_floating(reader_field.type.value_type)
and reader_field.type.list_size >= 10
)
if named_vector_col or typed_fixed_vector_col:
vector_columns.append(
{
"name": field.name,
"expected_dim": (
field.type.list_size
if pa.types.is_fixed_size_list(field.type)
else None
),
"expected_value_type": field.type.value_type,
}
)
return vector_columns
def _vector_output_schema(
reader_schema: pa.Schema,
vector_columns: List[dict],
) -> pa.Schema:
columns_by_name = {column["name"]: column for column in vector_columns}
fields = []
for field in reader_schema:
column = columns_by_name.get(field.name)
if column is None:
output_type = field.type
else:
output_type = _vector_output_type(field, column)
fields.append(pa.field(field.name, output_type, field.nullable, field.metadata))
return pa.schema(fields, metadata=reader_schema.metadata)
def _vector_output_type(field: pa.Field, vector_column: dict) -> pa.DataType:
if not _is_list_like(field.type):
return field.type
if vector_column["expected_value_type"] is not None and (
pa.types.is_null(field.type.value_type)
or pa.types.is_integer(field.type.value_type)
or pa.types.is_unsigned_integer(field.type.value_type)
):
return pa.list_(vector_column["expected_value_type"])
if (
vector_column["expected_dim"] is not None
and pa.types.is_fixed_size_list(field.type)
and field.type.list_size != vector_column["expected_dim"]
):
return pa.list_(field.type.value_type)
return field.type
def _handle_bad_vector_column(
@@ -3255,6 +3392,8 @@ def _handle_bad_vector_column(
vector_column_name: str,
on_bad_vectors: str = "error",
fill_value: float = 0.0,
expected_dim: Optional[int] = None,
expected_value_type: Optional[pa.DataType] = None,
) -> pa.RecordBatch:
"""
Ensure that the vector column exists and has type fixed_size_list(float)
@@ -3271,14 +3410,39 @@ def _handle_bad_vector_column(
fill_value: float, default 0.0
The value to use when filling vectors. Only used if on_bad_vectors="fill".
"""
position = data.column_names.index(vector_column_name)
vec_arr = data[vector_column_name]
if not _is_list_like(vec_arr.type):
return data
has_nan = has_nan_values(vec_arr)
if (
expected_dim is not None
and pa.types.is_fixed_size_list(vec_arr.type)
and vec_arr.type.list_size != expected_dim
):
vec_arr = pa.array(vec_arr.to_pylist(), type=pa.list_(vec_arr.type.value_type))
data = data.set_column(position, vector_column_name, vec_arr)
if pa.types.is_fixed_size_list(vec_arr.type):
if expected_value_type is not None and (
pa.types.is_integer(vec_arr.type.value_type)
or pa.types.is_unsigned_integer(vec_arr.type.value_type)
):
vec_arr = pa.array(vec_arr.to_pylist(), type=pa.list_(expected_value_type))
data = data.set_column(position, vector_column_name, vec_arr)
if pa.types.is_floating(vec_arr.type.value_type):
has_nan = has_nan_values(vec_arr)
else:
has_nan = pa.array([False] * len(vec_arr))
if expected_dim is not None:
dim = expected_dim
elif pa.types.is_fixed_size_list(vec_arr.type):
dim = vec_arr.type.list_size
else:
dim = _modal_list_size(vec_arr)
dim = _infer_vector_dim(vec_arr)
if dim is None:
return data
has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()
@@ -3316,13 +3480,12 @@ def _handle_bad_vector_column(
)
vec_arr = pc.if_else(
is_bad,
pa.scalar([fill_value] * dim),
pa.scalar([fill_value] * dim, type=vec_arr.type),
vec_arr,
)
else:
raise ValueError(f"Invalid value for on_bad_vectors: {on_bad_vectors}")
position = data.column_names.index(vector_column_name)
return data.set_column(position, vector_column_name, vec_arr)
@@ -3343,6 +3506,28 @@ def has_nan_values(arr: Union[pa.ListArray, pa.ChunkedArray]) -> pa.BooleanArray
return pc.is_in(indices, has_nan_indices)
def _is_list_like(data_type: pa.DataType) -> bool:
    """Return True if ``data_type`` is a list, large list, or fixed-size list."""
    list_predicates = (
        pa.types.is_list,
        pa.types.is_large_list,
        pa.types.is_fixed_size_list,
    )
    return any(predicate(data_type) for predicate in list_predicates)
def _merge_metadata(*metadata_dicts: Optional[dict]) -> dict:
merged = {}
for metadata in metadata_dicts:
if metadata is None:
continue
for key, value in metadata.items():
if isinstance(key, str):
key = key.encode("utf-8")
if isinstance(value, str):
value = value.encode("utf-8")
merged[key] = value
return merged
def _name_suggests_vector_column(field_name: str) -> bool:
"""Check if a field name indicates a vector column."""
name_lower = field_name.lower()
@@ -3410,6 +3595,16 @@ def _modal_list_size(arr: Union[pa.ListArray, pa.ChunkedArray]) -> int:
return pc.mode(pc.list_value_length(arr))[0].as_py()["mode"]
def _infer_vector_dim(arr: Union[pa.Array, pa.ChunkedArray]) -> Optional[int]:
    """Infer a vector dimension from the list lengths found in ``arr``.

    Returns the mode of the list lengths that are greater than zero, or
    ``None`` when ``arr`` is not list-like or no such length remains after
    filtering.
    """
    if _is_list_like(arr.type):
        row_lengths = pc.list_value_length(arr)
        positive_lengths = pc.filter(row_lengths, pc.greater(row_lengths, 0))
        if len(positive_lengths) != 0:
            return pc.mode(positive_lengths)[0].as_py()["mode"]
    return None
def _validate_schema(schema: pa.Schema):
"""
Make sure the metadata is valid utf8

View File

@@ -522,6 +522,50 @@ def test_no_split_names(some_table: Table):
assert permutations[1].num_rows == 500
def test_permutations_metadata_without_split_names_key(mem_db: DBConnection):
    """Regression: metadata that lacks the split_names key must not crash.

    `.get(b"split_names", None).decode()` used to be called unconditionally,
    so a permutation table whose metadata had other keys but no split_names
    raised AttributeError: 'NoneType' has no attribute 'decode'.
    """
    base_table = mem_db.create_table("base_nosplit", pa.table({"x": range(10)}))

    # Permutation-shaped table carrying metadata that does NOT include split_names.
    perm_columns = {
        "row_id": pa.array(range(10), type=pa.uint64()),
        "split_id": pa.array([0] * 10, type=pa.uint32()),
    }
    unrelated_metadata = {b"other_key": b"other_value"}
    perm_data = pa.table(perm_columns).replace_schema_metadata(unrelated_metadata)
    perm_table = mem_db.create_table("perm_nosplit", perm_data)

    result = Permutations(base_table, perm_table)

    assert result.split_names == []
    assert result.split_dict == {}
def test_from_tables_string_split_missing_names_key(mem_db: DBConnection):
    """Regression: a string split without split names raises ValueError.

    `.get(b"split_names", None).decode()` used to crash with AttributeError
    when the metadata dict existed but had no split_names key; from_tables()
    must report the problem as a ValueError instead.
    """
    base_table = mem_db.create_table("base_strsplit", pa.table({"x": range(10)}))

    perm_columns = {
        "row_id": pa.array(range(10), type=pa.uint64()),
        "split_id": pa.array([0] * 10, type=pa.uint32()),
    }
    unrelated_metadata = {b"other_key": b"other_value"}
    perm_data = pa.table(perm_columns).replace_schema_metadata(unrelated_metadata)
    perm_table = mem_db.create_table("perm_strsplit", perm_data)

    with pytest.raises(ValueError, match="no split names are defined"):
        Permutation.from_tables(base_table, perm_table, split="train")
@pytest.fixture
def some_perm_table(some_table: Table) -> Table:
return (

View File

@@ -1049,6 +1049,231 @@ def test_add_with_nans(mem_db: DBConnection):
assert np.allclose(v, np.array([0.0, 0.0]))
def test_add_with_empty_fixed_size_list_drops_bad_rows(mem_db: DBConnection):
    """A row with an empty embedding is dropped; the 16-dim row is kept."""

    class Schema(LanceModel):
        text: str
        embedding: Vector(16)

    rows = [
        {"text": "hello", "embedding": []},
        {"text": "bar", "embedding": [0.1] * 16},
    ]
    table = mem_db.create_table("test_empty_embeddings", schema=Schema)
    table.add(rows, on_bad_vectors="drop")

    stored = table.to_arrow()
    assert stored["text"].to_pylist() == ["bar"]
    first_embedding = stored["embedding"].to_pylist()[0]
    assert np.allclose(first_embedding, np.array([0.1] * 16))
def test_add_with_integer_embeddings_preserves_casting(mem_db: DBConnection):
    """Integer embedding values are stored as floats when added with drop."""

    class Schema(LanceModel):
        text: str
        embedding: Vector(4)

    table = mem_db.create_table("test_integer_embeddings", schema=Schema)
    integer_row = {"text": "foo", "embedding": [1, 2, 3, 4]}
    table.add([integer_row], on_bad_vectors="drop")

    stored_embeddings = table.to_arrow()["embedding"].to_pylist()
    assert stored_embeddings == [[1.0, 2.0, 3.0, 4.0]]
def test_on_bad_vectors_does_not_handle_non_vector_fixed_size_lists(
    mem_db: DBConnection,
):
    """A wrong-length ``bbox`` value is not dropped.

    The add is expected to fail with a RuntimeError mentioning
    FixedSizeListType.
    """
    fixed_float4 = pa.list_(pa.float32(), 4)
    schema = pa.schema(
        [
            pa.field("vector", fixed_float4),
            pa.field("bbox", pa.list_(pa.float32(), 4)),
        ]
    )
    table = mem_db.create_table("test_bbox_schema", schema=schema)

    short_bbox_row = {"vector": [1.0, 2.0, 3.0, 4.0], "bbox": [0.0, 1.0]}
    with pytest.raises(RuntimeError, match="FixedSizeListType"):
        table.add([short_bbox_row], on_bad_vectors="drop")
def test_on_bad_vectors_does_not_handle_custom_named_fixed_size_lists(
    mem_db: DBConnection,
):
    """An empty value in a ``features`` fixed-size list column is not dropped.

    The add is expected to fail with a RuntimeError mentioning
    FixedSizeListType.
    """
    features_field = pa.field("features", pa.list_(pa.float32(), 16))
    table = mem_db.create_table(
        "test_custom_named_fixed_size_vector", schema=pa.schema([features_field])
    )

    rows = [
        {"features": []},
        {"features": [0.1] * 16},
    ]
    with pytest.raises(RuntimeError, match="FixedSizeListType"):
        table.add(rows, on_bad_vectors="drop")
def test_on_bad_vectors_with_schema_list_vector_still_sanitizes(mem_db: DBConnection):
    """With a variable-length ``vector`` column, the odd-length row is dropped."""
    vector_field = pa.field("vector", pa.list_(pa.float32()))
    table = mem_db.create_table(
        "test_schema_list_vector", schema=pa.schema([vector_field])
    )

    rows = [
        {"vector": [1.0, 2.0]},
        {"vector": [3.0]},
        {"vector": [4.0, 5.0]},
    ]
    table.add(rows, on_bad_vectors="drop")

    stored_vectors = table.to_arrow()["vector"].to_pylist()
    assert stored_vectors == [[1.0, 2.0], [4.0, 5.0]]
def test_on_bad_vectors_handles_typed_custom_fixed_vectors_for_list_schema(
    mem_db: DBConnection,
):
    """A NaN row in fixed-size-list input for a ``vec`` list column is dropped."""
    vec_field = pa.field("vec", pa.list_(pa.float32()))
    table = mem_db.create_table(
        "test_typed_custom_fixed_vector", schema=pa.schema([vec_field])
    )

    # Input is typed as fixed_size_list<float32>[16]; the first row is all NaN.
    typed_vectors = pa.array(
        [[float("nan")] * 16, [1.0] * 16],
        type=pa.list_(pa.float32(), 16),
    )
    table.add(pa.table({"vec": typed_vectors}), on_bad_vectors="drop")

    stored_vectors = table.to_arrow()["vec"].to_pylist()
    assert stored_vectors == [[1.0] * 16]
def test_on_bad_vectors_fill_preserves_arrow_nested_vector_type(mem_db: DBConnection):
    """Filling a NaN row of fixed-size-list input yields a row of fill values."""
    vector_field = pa.field("vector", pa.list_(pa.float32()))
    table = mem_db.create_table(
        "test_fill_arrow_nested_type", schema=pa.schema([vector_field])
    )

    # Second row contains a NaN and should be replaced by [0.0, 0.0].
    typed_vectors = pa.array(
        [[1.0, 2.0], [float("nan"), 3.0]],
        type=pa.list_(pa.float32(), 2),
    )
    table.add(
        pa.table({"vector": typed_vectors}),
        on_bad_vectors="fill",
        fill_value=0.0,
    )

    stored_vectors = table.to_arrow()["vector"].to_pylist()
    assert stored_vectors == [[1.0, 2.0], [0.0, 0.0]]
@pytest.mark.parametrize(
    ("table_name", "batch1", "expected"),
    [
        (
            "test_schema_list_vector_empty_prefix",
            pa.record_batch({"vector": [[], []]}),
            [[], [], [1.0, 2.0], [3.0, 4.0]],
        ),
        (
            "test_schema_list_vector_all_bad_prefix",
            pa.record_batch({"vector": [[float("nan")] * 3, [float("nan")] * 3]}),
            [[1.0, 2.0], [3.0, 4.0]],
        ),
    ],
)
def test_on_bad_vectors_with_schema_list_vector_ignores_invalid_prefix_batches(
    mem_db: DBConnection,
    table_name: str,
    batch1: pa.RecordBatch,
    expected: list,
):
    """Stream a leading batch of empty or all-NaN vectors before a good batch.

    The reader yields ``batch1`` followed by a batch of two 2-dim vectors; the
    stored ``vector`` column must equal ``expected`` for each parametrization.
    """
    vector_field = pa.field("vector", pa.list_(pa.float32()))
    table = mem_db.create_table(table_name, schema=pa.schema([vector_field]))

    good_batch = pa.record_batch({"vector": [[1.0, 2.0], [3.0, 4.0]]})
    batches = [batch1, good_batch]
    reader = pa.RecordBatchReader.from_batches(batch1.schema, batches)
    table.add(reader, on_bad_vectors="drop")

    stored_vectors = table.to_arrow()["vector"].to_pylist()
    assert stored_vectors == expected
def test_on_bad_vectors_with_multiple_vectors_locks_dim_after_final_drop(
    mem_db: DBConnection,
):
    """Two embedding-function vector columns streamed over two batches.

    The first row of the first batch has a 3-element ``vec1`` and a NaN in
    ``vec2``; only the remaining three rows are expected in the table.
    """
    registry = EmbeddingFunctionRegistry.get_instance()
    func = MockTextEmbeddingFunction.create()
    configs = [
        EmbeddingFunctionConfig(
            source_column="text1", vector_column="vec1", function=func
        ),
        EmbeddingFunctionConfig(
            source_column="text2", vector_column="vec2", function=func
        ),
    ]
    metadata = registry.get_table_metadata(configs)

    float_list = pa.list_(pa.float32())
    schema = pa.schema(
        [
            pa.field("vec1", float_list),
            pa.field("vec2", pa.list_(pa.float32())),
        ],
        metadata=metadata,
    )
    table = mem_db.create_table("test_multi_vector_dim_lock", schema=schema)

    first_batch = pa.record_batch(
        {
            "vec1": [[1.0, 2.0, 3.0], [10.0, 11.0]],
            "vec2": [[float("nan"), 0.0], [5.0, 6.0]],
        }
    )
    second_batch = pa.record_batch(
        {
            "vec1": [[20.0, 21.0], [30.0, 31.0]],
            "vec2": [[7.0, 8.0], [9.0, 10.0]],
        }
    )
    reader = pa.RecordBatchReader.from_batches(
        first_batch.schema, [first_batch, second_batch]
    )
    table.add(reader, on_bad_vectors="drop")

    stored = table.to_arrow()
    expected_vec1 = [[10.0, 11.0], [20.0, 21.0], [30.0, 31.0]]
    expected_vec2 = [[5.0, 6.0], [7.0, 8.0], [9.0, 10.0]]
    assert stored["vec1"].to_pylist() == expected_vec1
    assert stored["vec2"].to_pylist() == expected_vec2
def test_on_bad_vectors_does_not_handle_non_vector_list_columns(mem_db: DBConnection):
    """Rows of differing length in ``embedding_history`` are all kept."""
    history_field = pa.field("embedding_history", pa.list_(pa.float32()))
    table = mem_db.create_table(
        "test_non_vector_list_schema", schema=pa.schema([history_field])
    )

    rows = [
        {"embedding_history": [1.0, 2.0]},
        {"embedding_history": [3.0]},
    ]
    table.add(rows, on_bad_vectors="drop")

    stored_history = table.to_arrow()["embedding_history"].to_pylist()
    assert stored_history == [
        [1.0, 2.0],
        [3.0],
    ]
def test_on_bad_vectors_all_null_schema_vector_batches_do_not_crash(
    mem_db: DBConnection,
):
    """Adding a single null vector to a nullable column stores the null row."""
    nullable_vector = pa.field("vector", pa.list_(pa.float32(), 2), nullable=True)
    table = mem_db.create_table(
        "test_all_null_vector_batch", schema=pa.schema([nullable_vector])
    )

    null_row = {"vector": None}
    table.add([null_row], on_bad_vectors="drop")

    stored_vectors = table.to_arrow()["vector"].to_pylist()
    assert stored_vectors == [None]
def test_restore(mem_db: DBConnection):
table = mem_db.create_table(
"my_table",

View File

@@ -15,8 +15,10 @@ from lancedb.table import (
_cast_to_target_schema,
_handle_bad_vectors,
_into_pyarrow_reader,
_sanitize_data,
_infer_target_schema,
_merge_metadata,
_sanitize_data,
sanitize_create_table,
)
import pyarrow as pa
import pandas as pd
@@ -304,6 +306,117 @@ def test_handle_bad_vectors_noop():
assert output["vector"] == vector
def test_handle_bad_vectors_updates_reader_schema_for_target_schema():
    """Integer input for a float32 fixed-size target is exposed as list<float32>.

    Checks both the schema advertised by the returned reader and the values
    it produces.
    """
    integer_vectors = pa.table({"vector": [[1, 2, 3, 4]]})
    fixed_float_field = pa.field("vector", pa.list_(pa.float32(), 4))

    output = _handle_bad_vectors(
        integer_vectors.to_reader(),
        on_bad_vectors="drop",
        target_schema=pa.schema([fixed_float_field]),
    )

    expected_schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])
    assert output.schema == expected_schema
    assert output.read_all()["vector"].to_pylist() == [[1.0, 2.0, 3.0, 4.0]]
def test_sanitize_data_keeps_target_field_metadata():
    """Field metadata in the output comes from the target schema, not the input."""
    vector_type = pa.list_(pa.float32(), 2)
    source_field = pa.field(
        "vector",
        vector_type,
        metadata={b"source": b"drop-me"},
    )
    target_field = pa.field(
        "vector",
        pa.list_(pa.float32(), 2),
        metadata={b"target": b"keep-me"},
    )
    vectors = pa.array([[1.0, 2.0]], type=pa.list_(pa.float32(), 2))
    data = pa.table({"vector": vectors}, schema=pa.schema([source_field]))

    reader = _sanitize_data(
        data,
        target_schema=pa.schema([target_field]),
        on_bad_vectors="drop",
    )
    output = reader.read_all()

    vector_metadata = output.schema.field("vector").metadata
    assert vector_metadata == {b"target": b"keep-me"}
def test_sanitize_data_uses_separate_embedding_metadata_for_bad_vectors():
    """Embedding metadata passed via ``metadata=`` drives bad-vector handling.

    The target schema's own metadata only carries a ``note`` entry; the
    embedding function config for ``custom_vector`` is supplied separately.
    The 9-element row must be dropped and both metadata sources must appear
    in the output schema.
    """
    registry = EmbeddingFunctionRegistry.get_instance()
    conf = EmbeddingFunctionConfig(
        source_column="text",
        vector_column="custom_vector",
        function=MockTextEmbeddingFunction.create(),
    )
    metadata = registry.get_table_metadata([conf])

    schema_fields = {
        "text": pa.string(),
        "custom_vector": pa.list_(pa.float32(), 10),
    }
    schema = pa.schema(schema_fields, metadata={b"note": b"keep-me"})

    columns = {
        "text": ["bad", "good"],
        "custom_vector": [[1.0] * 9, [2.0] * 10],
    }
    data = pa.table(columns)

    reader = _sanitize_data(
        data,
        target_schema=schema,
        metadata=metadata,
        on_bad_vectors="drop",
    )
    output = reader.read_all()

    assert output["text"].to_pylist() == ["good"]
    output_metadata = output.schema.metadata
    assert output_metadata[b"note"] == b"keep-me"
    assert b"embedding_functions" in output_metadata
def test_sanitize_create_table_merges_and_overrides_embedding_metadata():
    """Explicit embedding metadata replaces the schema's, other keys survive.

    The schema starts with a ``note`` entry plus embedding metadata for
    ``old_vector``; the ``metadata=`` argument describes ``custom_vector``.
    Afterwards only ``custom_vector`` may be registered, and ``note`` must
    still be present on both the returned schema and the returned data.
    """
    registry = EmbeddingFunctionRegistry.get_instance()
    old_conf = EmbeddingFunctionConfig(
        source_column="text",
        vector_column="old_vector",
        function=MockTextEmbeddingFunction.create(),
    )
    new_conf = EmbeddingFunctionConfig(
        source_column="text",
        vector_column="custom_vector",
        function=MockTextEmbeddingFunction.create(),
    )
    metadata = registry.get_table_metadata([new_conf])

    schema_fields = {
        "text": pa.string(),
        "custom_vector": pa.list_(pa.float32(), 10),
    }
    initial_metadata = _merge_metadata(
        {b"note": b"keep-me"},
        registry.get_table_metadata([old_conf]),
    )
    schema = pa.schema(schema_fields, metadata=initial_metadata)

    data, schema = sanitize_create_table(
        pa.table({"text": ["good"]}),
        schema,
        metadata=metadata,
        on_bad_vectors="drop",
    )

    assert schema.metadata[b"note"] == b"keep-me"
    assert b"embedding_functions" in schema.metadata
    assert data.schema.metadata[b"note"] == b"keep-me"

    funcs = EmbeddingFunctionRegistry.get_instance().parse_functions(schema.metadata)
    registered_columns = set(funcs.keys())
    assert registered_columns == {"custom_vector"}
class TestModel(lancedb.pydantic.LanceModel):
a: Optional[int]
b: Optional[int]

View File

@@ -547,6 +547,7 @@ pub struct PyClientConfig {
id_delimiter: Option<String>,
tls_config: Option<PyClientTlsConfig>,
header_provider: Option<Py<PyAny>>,
user_id: Option<String>,
}
#[derive(FromPyObject)]
@@ -631,6 +632,7 @@ impl From<PyClientConfig> for lancedb::remote::ClientConfig {
id_delimiter: value.id_delimiter,
tls_config: value.tls_config.map(Into::into),
header_provider,
user_id: value.user_id,
}
}
}

View File

@@ -1,2 +1,2 @@
[toolchain]
channel = "1.91.0"
channel = "1.94.0"

View File

@@ -177,6 +177,7 @@ impl BedrockEmbeddingFunction {
))
.send()
.await
.map_err(Box::new)
})
})
.unwrap();

View File

@@ -52,6 +52,13 @@ pub struct ClientConfig {
pub tls_config: Option<TlsConfig>,
/// Provider for custom headers to be added to each request
pub header_provider: Option<Arc<dyn HeaderProvider>>,
/// User identifier for tracking purposes.
///
/// This is sent as the `x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
/// It can be set directly, or via the `LANCEDB_USER_ID` environment variable.
/// Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another environment
/// variable that contains the user ID value.
pub user_id: Option<String>,
}
impl std::fmt::Debug for ClientConfig {
@@ -67,6 +74,7 @@ impl std::fmt::Debug for ClientConfig {
"header_provider",
&self.header_provider.as_ref().map(|_| "Some(...)"),
)
.field("user_id", &self.user_id)
.finish()
}
}
@@ -81,10 +89,41 @@ impl Default for ClientConfig {
id_delimiter: None,
tls_config: None,
header_provider: None,
user_id: None,
}
}
}
impl ClientConfig {
/// Resolve the user ID from the config or environment variables.
///
/// Resolution order:
/// 1. If `user_id` is set in the config, use that value
/// 2. If `LANCEDB_USER_ID` environment variable is set, use that value
/// 3. If `LANCEDB_USER_ID_ENV_KEY` is set, read the env var it points to
/// 4. Otherwise, return None
pub fn resolve_user_id(&self) -> Option<String> {
if self.user_id.is_some() {
return self.user_id.clone();
}
if let Ok(user_id) = std::env::var("LANCEDB_USER_ID")
&& !user_id.is_empty()
{
return Some(user_id);
}
if let Ok(env_key) = std::env::var("LANCEDB_USER_ID_ENV_KEY")
&& let Ok(user_id) = std::env::var(&env_key)
&& !user_id.is_empty()
{
return Some(user_id);
}
None
}
}
/// How to handle timeouts for HTTP requests.
#[derive(Clone, Default, Debug)]
pub struct TimeoutConfig {
@@ -464,6 +503,15 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
);
}
if let Some(user_id) = config.resolve_user_id() {
headers.insert(
HeaderName::from_static("x-lancedb-user-id"),
HeaderValue::from_str(&user_id).map_err(|_| Error::InvalidInput {
message: format!("non-ascii user_id '{}' provided", user_id),
})?,
);
}
Ok(headers)
}
@@ -1072,4 +1120,91 @@ mod tests {
_ => panic!("Expected Runtime error"),
}
}
/// Serializes the tests below that modify the user-ID environment variables.
///
/// The test harness runs tests on multiple threads, while environment
/// variables are process-wide. Without this lock one test can observe
/// variables set or removed by another and fail intermittently.
static USER_ID_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Applies environment overrides for the duration of a test.
///
/// While the guard is alive it holds [`USER_ID_ENV_LOCK`]. On drop, including
/// during a panic unwind from a failed assertion, every touched variable is
/// restored to the value it had before the guard was created.
struct ScopedEnv {
    previous: Vec<(&'static str, Option<std::ffi::OsString>)>,
    // Declared last so the lock is released only after `Drop::drop` has restored
    // the environment.
    _serialized: std::sync::MutexGuard<'static, ()>,
}

impl ScopedEnv {
    /// Sets (`Some`) or removes (`None`) each named variable after taking the lock.
    fn apply(overrides: &[(&'static str, Option<&str>)]) -> Self {
        // A poisoned lock only means another test failed; the guarded data is `()`.
        let serialized = USER_ID_ENV_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let previous = overrides
            .iter()
            .map(|&(name, _)| (name, std::env::var_os(name)))
            .collect();
        for &(name, value) in overrides {
            // SAFETY: `USER_ID_ENV_LOCK` is held, so no other test in this group
            // modifies the environment concurrently. This assumes the remaining
            // tests in the binary do not write to the environment.
            unsafe {
                match value {
                    Some(value) => std::env::set_var(name, value),
                    None => std::env::remove_var(name),
                }
            }
        }
        Self {
            previous,
            _serialized: serialized,
        }
    }
}

impl Drop for ScopedEnv {
    fn drop(&mut self) {
        for (name, value) in self.previous.drain(..) {
            // SAFETY: the lock taken in `apply` is still held at this point; see
            // the note there for the assumption about other tests.
            unsafe {
                match value {
                    Some(value) => std::env::set_var(name, value),
                    None => std::env::remove_var(name),
                }
            }
        }
    }
}

#[test]
fn test_resolve_user_id_direct_value() {
    // A configured value is returned without consulting the environment.
    let config = ClientConfig {
        user_id: Some("direct-user-id".to_string()),
        ..Default::default()
    };
    assert_eq!(config.resolve_user_id(), Some("direct-user-id".to_string()));
}

#[test]
fn test_resolve_user_id_none() {
    let _env = ScopedEnv::apply(&[
        ("LANCEDB_USER_ID", None),
        ("LANCEDB_USER_ID_ENV_KEY", None),
    ]);
    let config = ClientConfig::default();
    assert_eq!(config.resolve_user_id(), None);
}

#[test]
fn test_resolve_user_id_from_env() {
    let _env = ScopedEnv::apply(&[("LANCEDB_USER_ID", Some("env-user-id"))]);
    let config = ClientConfig::default();
    assert_eq!(config.resolve_user_id(), Some("env-user-id".to_string()));
}

#[test]
fn test_resolve_user_id_from_env_key() {
    let _env = ScopedEnv::apply(&[
        ("LANCEDB_USER_ID", None),
        ("LANCEDB_USER_ID_ENV_KEY", Some("MY_CUSTOM_USER_ID")),
        ("MY_CUSTOM_USER_ID", Some("custom-env-user-id")),
    ]);
    let config = ClientConfig::default();
    assert_eq!(
        config.resolve_user_id(),
        Some("custom-env-user-id".to_string())
    );
}

#[test]
fn test_resolve_user_id_direct_takes_precedence() {
    let _env = ScopedEnv::apply(&[("LANCEDB_USER_ID", Some("env-user-id"))]);
    let config = ClientConfig {
        user_id: Some("direct-user-id".to_string()),
        ..Default::default()
    };
    assert_eq!(config.resolve_user_id(), Some("direct-user-id".to_string()));
}

#[test]
fn test_resolve_user_id_empty_env_ignored() {
    let _env = ScopedEnv::apply(&[
        ("LANCEDB_USER_ID", Some("")),
        ("LANCEDB_USER_ID_ENV_KEY", None),
    ]);
    let config = ClientConfig::default();
    assert_eq!(config.resolve_user_id(), None);
}
}