fix(python): route blob query pandas through scanner (#3491)

## Bug Fix

### What is the bug?
`QueryBuilder.to_pandas(blob_mode="descriptions")` could still fall back
to `self.to_arrow()` for query outputs with blob columns. Custom query
subclasses or wrappers can have `to_arrow()` behavior that is not
compatible with pandas blob-description conversion, which can surface as
low-level Arrow/list-batch conversion failures.

### What issues or incorrect behavior does the bug cause?
Callers need to carry local `to_pandas` or plain-scan adapter special
casing for blob descriptions, and scanner-only kwargs such as row
addresses and fragment selection are not represented in LanceDB query
state.

### How does this PR fix the problem?
This PR routes blob-output query `to_pandas()` through the Lance scanner
path for `lazy`, `bytes`, and `descriptions` modes when the query is a
scanner-backed plain scan. For `blob_mode="descriptions"` with
`flatten`, it collects scanner Arrow/table output, applies LanceDB
`flatten_columns`, and converts to pandas from there. Non-plain blob
query shapes now fail with a clear unsupported error instead of falling
into subclass `to_arrow()` behavior.

It also adds Python query state and builder methods for scanner-only
plain-scan parameters:

- `with_row_address()` for `_rowaddr`
- `with_fragments(...)` for Lance fragment objects
- `fragment_ids([...])` as a convenience wrapper that resolves IDs to
Lance fragments

## Validation

- `cd python && uv run --no-sync ruff format --check
python/lancedb/query.py python/tests/test_query.py`
- `cd python && uv run --no-sync ruff check python/lancedb/query.py
python/tests/test_query.py`

Targeted pytest was intentionally not run locally per maintainer
request.
This commit is contained in:
Yang Cen
2026-06-04 14:03:33 +08:00
committed by GitHub
parent 415d199c15
commit 927ba2c948
2 changed files with 247 additions and 26 deletions

View File

@@ -255,8 +255,9 @@ def test_plain_scan_query_to_pandas_blob_projection(tmp_db):
assert df["double_id"].tolist() == [6, 8]
@pytest.mark.parametrize("blob_mode", ["bytes", "descriptions"])
def test_plain_scan_query_to_pandas_blob_mode_does_not_collect_arrow(
tmp_db, monkeypatch
tmp_db, monkeypatch, blob_mode
):
pytest.importorskip("lance")
table = tmp_db.create_table(
@@ -269,10 +270,69 @@ def test_plain_scan_query_to_pandas_blob_mode_does_not_collect_arrow(
monkeypatch.setattr(query, "to_arrow", fail_to_arrow)
df = query.to_pandas(blob_mode="bytes")
df = query.to_pandas(blob_mode=blob_mode)
assert df["id"].tolist() == [1]
assert df["blob"].tolist() == [b"one"]
if blob_mode == "bytes":
assert df["blob"].tolist() == [b"one"]
else:
first = df["blob"].iloc[0]
assert first != b"one"
assert not hasattr(first, "readall")
def test_plain_scan_query_to_pandas_blob_descriptions_flatten_uses_scanner(
tmp_db, monkeypatch
):
pytest.importorskip("lance")
table = tmp_db.create_table(
"test_query_to_pandas_blob_desc_flatten", _blob_query_data()
)
query = table.search().where("id = 1").select(["id", "blob"])
def fail_to_arrow(*args, **kwargs):
raise AssertionError("to_arrow should not be called before scanner pandas")
monkeypatch.setattr(query, "to_arrow", fail_to_arrow)
df = query.to_pandas(blob_mode="descriptions", flatten=True)
assert df["id"].tolist() == [1]
assert any(column == "blob" or column.startswith("blob.") for column in df.columns)
def test_plain_scan_query_to_pandas_scanner_state(tmp_db):
pytest.importorskip("lance")
data = _blob_query_data()
table = tmp_db.create_table("test_query_to_pandas_scanner_state", data.slice(0, 2))
table.add(data.slice(2, 2))
fragments = table.to_lance().get_fragments()
assert len(fragments) == 2
query = (
table.search()
.select(["id", "blob"])
.with_row_address()
.fragment_ids([fragments[1].fragment_id])
)
query_obj = query.to_query_object()
assert query_obj.with_row_address is True
assert query_obj.fragment_ids == [fragments[1].fragment_id]
df = query.to_pandas(blob_mode="descriptions")
assert df["id"].tolist() == [3, 4]
assert "_rowaddr" in df.columns
assert {rowaddr >> 32 for rowaddr in df["_rowaddr"]} == {fragments[1].fragment_id}
df_by_fragment = (
table.search()
.select(["id", "blob"])
.with_fragments([fragments[0]])
.to_pandas(blob_mode="descriptions")
)
assert df_by_fragment["id"].tolist() == [1, 2]
@pytest.mark.asyncio
@@ -312,8 +372,9 @@ async def test_async_plain_scan_query_to_pandas_blob_projection(tmp_db_async):
@pytest.mark.asyncio
@pytest.mark.parametrize("blob_mode", ["bytes", "descriptions"])
async def test_async_plain_scan_query_to_pandas_blob_mode_does_not_collect_arrow(
tmp_db_async, monkeypatch
tmp_db_async, monkeypatch, blob_mode
):
pytest.importorskip("lance")
table = await tmp_db_async.create_table(
@@ -326,10 +387,15 @@ async def test_async_plain_scan_query_to_pandas_blob_mode_does_not_collect_arrow
monkeypatch.setattr(query, "to_arrow", fail_to_arrow)
df = await query.to_pandas(blob_mode="bytes")
df = await query.to_pandas(blob_mode=blob_mode)
assert df["id"].tolist() == [1]
assert df["blob"].tolist() == [b"one"]
if blob_mode == "bytes":
assert df["blob"].tolist() == [b"one"]
else:
first = df["blob"].iloc[0]
assert first != b"one"
assert not hasattr(first, "readall")
def test_vector_query_to_pandas_blob_mode_requires_native_path(tmp_db):
@@ -342,6 +408,18 @@ def test_vector_query_to_pandas_blob_mode_requires_native_path(tmp_db):
)
def test_vector_query_to_pandas_blob_descriptions_requires_plain_scan(tmp_db):
pytest.importorskip("lance")
table = tmp_db.create_table(
"test_vector_query_blob_descriptions", _blob_query_data()
)
with pytest.raises(RuntimeError, match="plain scan query"):
table.search([1.0, 0.0]).select(["blob", "vector"]).limit(1).to_pandas(
blob_mode="descriptions"
)
def test_order_by_plain_query(mem_db):
table = mem_db.create_table(
"test_order_by",