mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-22 13:40:41 +00:00
feat(python): support blob modes in query to_pandas (#3487)
## Feature

- What is the new feature?
  - Adds `blob_mode` support to sync and async Python query `to_pandas()` APIs.
  - Enables plain scan queries to return blob columns as lazy `BlobFile` objects, raw bytes, or blob descriptions.
  - Lets namespace-backed local tables use Lance native blob-aware pandas conversion for lazy blobs.
- Why do we need this feature?
  - Table and Lance dataset/scanner APIs already support blob-aware pandas conversion, but LanceDB query builders did not expose that capability.
  - Geneva and other callers should be able to use query-level `to_pandas(blob_mode=...)` without manually constructing Lance scanners.
- How does it work?
  - Plain scan queries route through Lance scanner native `to_pandas(blob_mode=...)`, preserving filter, projection, limit, offset, row id, and alias/expression projection behavior.
  - Non-native query shapes keep existing Arrow fallback semantics and raise a clear error when they return blob columns with `blob_mode="lazy"` or `blob_mode="bytes"`.
  - Focused tests cover table/query blob modes, filter/select/limit/offset/alias query cases, async query behavior, vector-query error boundaries, and namespace-backed lazy blobs.
## Validation

- `cd python && .venv/bin/maturin develop --uv --extras tests,dev --profile dev`
- `cd python && uv run --frozen --no-sync pytest python/tests/test_table.py::test_table_to_pandas_blob_modes python/tests/test_table.py::test_async_table_to_pandas_blob_bytes python/tests/test_query.py::test_plain_scan_query_to_pandas_blob_modes python/tests/test_query.py::test_plain_scan_query_to_pandas_blob_projection python/tests/test_query.py::test_async_plain_scan_query_to_pandas_blob_projection python/tests/test_query.py::test_vector_query_to_pandas_blob_mode_requires_native_path python/tests/test_namespace.py::TestNamespaceConnection::test_table_to_pandas_blob_lazy_through_namespace -q`
- `cd python && uv run --frozen --no-sync ruff format --check .`
- `cd python && uv run --frozen --no-sync ruff check .`
- `git diff --check`
This commit is contained in:
@@ -76,6 +76,35 @@ class TestNamespaceConnection:
|
||||
assert len(result) == 0
|
||||
assert list(result.columns) == ["id", "vector", "text"]
|
||||
|
||||
def test_table_to_pandas_blob_lazy_through_namespace(self):
    """Check lazy blob reads on a table created through a "dir" namespace.

    Writes a two-row table whose ``blob`` field carries the
    ``lance-encoding:blob`` metadata flag, reads it back with
    ``to_pandas(blob_mode="lazy")``, and verifies that the blob in the
    first row (after sorting by ``id``) exposes ``readall()`` and yields
    the bytes that were written.
    """
    # Skip rather than fail when the ``lance`` package is not importable.
    pytest.importorskip("lance")

    namespace_db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
    namespace_db.create_namespace(["test_ns"])

    # The blob column is marked via field-level metadata on the schema.
    blob_field = pa.field(
        "blob",
        pa.large_binary(),
        metadata={"lance-encoding:blob": "true"},
    )
    blob_schema = pa.schema([pa.field("id", pa.int64()), blob_field])

    id_column = pa.array([1, 2], pa.int64())
    blob_column = pa.array([b"hello", b"world"], pa.large_binary())
    rows = pa.table({"id": id_column, "blob": blob_column}, schema=blob_schema)

    blob_table = namespace_db.create_table(
        "blob_table", rows, namespace_path=["test_ns"]
    )
    frame = blob_table.to_pandas(blob_mode="lazy")
    ordered = frame.sort_values("id")

    # Sorting by id puts the id=1 row first, whose payload is b"hello".
    first_blob = ordered["blob"].iloc[0]
    assert hasattr(first_blob, "readall")
    assert first_blob.readall() == b"hello"
|
||||
|
||||
def test_open_table_through_namespace(self):
|
||||
"""Test opening an existing table through namespace."""
|
||||
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||
|
||||
Reference in New Issue
Block a user