fix(python): push down namespace full reads (#3516)

## Bug Fix

### What is the bug?

Namespace-backed `LanceTable.to_arrow()` full-table reads bypassed the
existing `QueryTable` server-side query path and called the lower-level
table `to_arrow()` implementation directly. In Geneva/Sophon this could
fail while parsing the Arrow IPC response for
`hist.get_table().to_arrow()` / `to_pandas()`, even though
`hist.get_table().search().to_arrow()` worked.

### What issues or incorrect behavior does the bug cause?

Full-table reads on namespace-backed tables with `QueryTable` pushdown
could fail with Arrow IPC parse errors, while query/search reads on the
same table succeeded. Since `to_pandas()` delegates through `to_arrow()`
for non-blob/native cases, pandas export was affected too.

### How does this PR fix the problem?

When `QueryTable` pushdown is enabled, sync and async table `to_arrow()`
now construct a plain no-filter, no-limit, all-columns query and execute
it through the table-level `_execute_query()` path. `AsyncTable` now
preserves namespace context from async namespace connections so async
full reads can make the same pushdown decision. Non-namespace tables and
namespace tables without `QueryTable` pushdown keep their existing
behavior.

### Tests

- `uv run --extra tests --extra dev --no-sync ruff check
python/lancedb/table.py python/lancedb/namespace.py
python/tests/test_namespace.py`
- `uv run --extra tests --extra dev --no-sync ruff format
python/lancedb/table.py python/lancedb/namespace.py
python/tests/test_namespace.py`
- `uv run --extra tests --extra dev --no-sync pytest
python/tests/test_namespace.py::TestPushdownOperations::test_lance_table_to_arrow_uses_query_pushdown
python/tests/test_namespace.py::TestAsyncPushdownOperations::test_async_table_to_arrow_uses_query_pushdown
python/tests/test_namespace.py::test_local_table_to_arrow_and_to_pandas_are_unchanged
-q`
- `uv run --extra tests --extra dev --no-sync pytest
python/tests/test_namespace.py -q`
This commit is contained in:
Yang Cen
2026-06-08 19:48:40 +08:00
committed by GitHub
parent 59fbfd4158
commit 3e25f584eb
3 changed files with 172 additions and 7 deletions

View File

@@ -5,10 +5,63 @@
import tempfile
import shutil
import sys
import pytest
import pyarrow as pa
import lancedb
from lance_namespace.errors import NamespaceNotEmptyError, TableNotFoundError
from lancedb.table import AsyncTable, LanceTable
# Shared fixture: a 12-row Arrow table with an integer ``id`` column (0..11)
# and a ``text`` column holding the matching "row-<id>" label. The pushdown
# tests compare query results against this table.
PUSHDOWN_DATA = pa.table(
    {
        "id": list(range(12)),
        "text": [f"row-{row_id}" for row_id in range(12)],
    }
)
def _ipc_file(table: pa.Table = PUSHDOWN_DATA) -> bytes:
    """Serialize ``table`` to Arrow IPC file-format bytes.

    Args:
        table: The Arrow table to encode. Defaults to ``PUSHDOWN_DATA``.

    Returns:
        The complete in-memory IPC file as a ``bytes`` object.
    """
    buffer_sink = pa.BufferOutputStream()
    with pa.ipc.new_file(buffer_sink, table.schema) as ipc_writer:
        ipc_writer.write_table(table)
    # Read the buffer only after the writer's context has exited, so the
    # writer has been closed before the bytes are taken.
    ipc_buffer = buffer_sink.getvalue()
    return ipc_buffer.to_pybytes()
class _FailingSyncInner:
    """Stub placed in ``LanceTable._table`` by ``_namespace_lance_table``.

    It exposes a table name and a schema, and raises if the direct
    ``to_arrow`` read path is ever taken, so a test using it fails loudly
    when the read is not routed through the namespace client.
    """

    # Table name; the pushdown tests expect it as the last element of the
    # request id (``["geneva", "hist"]``).
    name = "hist"

    async def schema(self):
        """Return the schema of the shared ``PUSHDOWN_DATA`` fixture."""
        return PUSHDOWN_DATA.schema

    async def to_arrow(self):
        """Always raise: the direct table read must not be used."""
        raise RuntimeError("direct table to_arrow should not be used")
class _FailingAsyncInner:
    """Stub inner table handed to ``AsyncTable`` in the async pushdown test.

    Unlike ``_FailingSyncInner``, ``name`` is a method here rather than a
    class attribute. Building a query directly on this object raises, so
    the test fails if the read bypasses the namespace client.
    """

    def name(self):
        """Return the table name expected in the recorded request ids."""
        return "hist"

    async def schema(self):
        """Return the schema of the shared ``PUSHDOWN_DATA`` fixture."""
        return PUSHDOWN_DATA.schema

    def query(self):
        """Always raise: the direct async query path must not be used."""
        raise AssertionError("direct async query should not be used")
class _NamespaceClient:
    """Recording fake for the namespace client used by the pushdown tests.

    Every request passed to ``query_table`` is kept in ``requests`` so a
    test can inspect what was sent; the reply is always the IPC encoding
    of ``PUSHDOWN_DATA``.
    """

    def __init__(self):
        # Requests received so far, in call order.
        self.requests = []

    def query_table(self, request):
        """Record ``request`` and return canned Arrow IPC file bytes."""
        # Record first, then build the payload, so the request is kept even
        # if serialization were to fail.
        self.requests.append(request)
        payload = _ipc_file()
        return payload
def _namespace_lance_table(namespace_client: _NamespaceClient) -> LanceTable:
    """Build a ``LanceTable`` wired to a fake namespace client.

    The instance is created with ``__new__`` so ``LanceTable.__init__`` is
    not run; only the attributes set below are populated.

    Args:
        namespace_client: Fake client that should receive the query request.

    Returns:
        A ``LanceTable`` whose inner table raises on a direct ``to_arrow``
        and whose pushdown operations contain ``"QueryTable"``.
    """
    stub_table = LanceTable.__new__(LanceTable)
    # The inner table fails on a direct read, so a passing test proves the
    # read went through ``namespace_client`` instead.
    stub_table._table = _FailingSyncInner()
    stub_table._namespace_path = ["geneva"]
    stub_table._namespace_client = namespace_client
    stub_table._pushdown_operations = {"QueryTable"}
    return stub_table
class TestNamespaceConnection:
@@ -736,6 +789,22 @@ class TestPushdownOperations:
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
assert len(db._namespace_client_pushdown_operations) == 0
def test_lance_table_to_arrow_uses_query_pushdown(self):
    """Sync ``to_arrow``/``to_pandas`` go through the namespace client.

    The table's inner object raises on a direct read, so both calls can
    only succeed by issuing a request to the fake client. Each call is
    expected to produce one request addressed to ``["geneva", "hist"]``
    with ``k == sys.maxsize`` (presumably the "no limit" value; confirm
    against the query builder).
    """
    client = _NamespaceClient()
    table = _namespace_lance_table(client)

    arrow_result = table.to_arrow()
    assert arrow_result.equals(PUSHDOWN_DATA)
    pandas_ids = table.to_pandas()["id"].tolist()
    assert pandas_ids == list(range(12))

    # One request per full-table read above.
    recorded = client.requests
    assert len(recorded) == 2
    expected_id = ["geneva", "hist"]
    assert [req.id for req in recorded] == [expected_id, expected_id]
    assert [req.k for req in recorded] == [sys.maxsize, sys.maxsize]
@pytest.mark.asyncio
class TestAsyncPushdownOperations:
@@ -771,3 +840,39 @@ class TestAsyncPushdownOperations:
"""Test that pushdown operations default to empty on async connection."""
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
assert len(db._namespace_client_pushdown_operations) == 0
async def test_async_table_to_arrow_uses_query_pushdown(self):
    """Async ``to_arrow``/``to_pandas`` go through the namespace client.

    The inner table raises if a query is built on it directly, so both
    awaits can only succeed by issuing a request to the fake client. Each
    call is expected to produce one request addressed to
    ``["geneva", "hist"]`` with ``k == sys.maxsize`` (presumably the
    "no limit" value; confirm against the query builder).
    """
    client = _NamespaceClient()
    table = AsyncTable(
        _FailingAsyncInner(),
        namespace_path=["geneva"],
        namespace_client=client,
        pushdown_operations={"QueryTable"},
    )

    arrow_result = await table.to_arrow()
    assert arrow_result.equals(PUSHDOWN_DATA)
    frame = await table.to_pandas()
    assert frame["id"].tolist() == list(range(12))

    # One request per full-table read above.
    recorded = client.requests
    assert len(recorded) == 2
    expected_id = ["geneva", "hist"]
    assert [req.id for req in recorded] == [expected_id, expected_id]
    assert [req.k for req in recorded] == [sys.maxsize, sys.maxsize]
def test_local_table_to_arrow_and_to_pandas_are_unchanged(tmp_path):
    """A plain local table still supports ``to_arrow`` and ``to_pandas``.

    Creates a two-row table in a database under ``tmp_path`` via
    ``lancedb.connect`` and checks that both export paths return the
    ``id`` values in insertion order.
    """
    rows = [
        {"id": 1, "vector": [1.0, 2.0]},
        {"id": 2, "vector": [3.0, 4.0]},
    ]
    db_uri = str(tmp_path / "db")
    db = lancedb.connect(db_uri)
    table = db.create_table("local", data=rows)

    arrow_ids = table.to_arrow().column("id").to_pylist()
    assert arrow_ids == [1, 2]
    pandas_ids = table.to_pandas()["id"].tolist()
    assert pandas_ids == [1, 2]