mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-22 13:40:41 +00:00
fix(python): push down namespace full reads (#3516)
## Bug Fix

### What is the bug?

Namespace-backed `LanceTable.to_arrow()` full-table reads bypassed the existing `QueryTable` server-side query path and called the lower-level table `to_arrow()` implementation directly. In Geneva/Sophon this could fail while parsing the Arrow IPC response for `hist.get_table().to_arrow()` / `to_pandas()`, even though `hist.get_table().search().to_arrow()` worked.

### What issues or incorrect behavior does the bug cause?

Full-table reads on namespace-backed tables with `QueryTable` pushdown could fail with Arrow IPC parse errors, while query/search reads on the same table succeeded. Since `to_pandas()` delegates through `to_arrow()` for non-blob/native cases, pandas export was affected too.

### How does this PR fix the problem?

When `QueryTable` pushdown is enabled, sync and async table `to_arrow()` now construct a plain no-filter, no-limit, all-columns query and execute it through the table-level `_execute_query()` path. `AsyncTable` now preserves namespace context from async namespace connections so async full reads can make the same pushdown decision. Non-namespace tables and namespace tables without `QueryTable` pushdown keep their existing behavior.

### Tests

- `uv run --extra tests --extra dev --no-sync ruff check python/lancedb/table.py python/lancedb/namespace.py python/tests/test_namespace.py`
- `uv run --extra tests --extra dev --no-sync ruff format python/lancedb/table.py python/lancedb/namespace.py python/tests/test_namespace.py`
- `uv run --extra tests --extra dev --no-sync pytest python/tests/test_namespace.py::TestPushdownOperations::test_lance_table_to_arrow_uses_query_pushdown python/tests/test_namespace.py::TestAsyncPushdownOperations::test_async_table_to_arrow_uses_query_pushdown python/tests/test_namespace.py::test_local_table_to_arrow_and_to_pandas_are_unchanged -q`
- `uv run --extra tests --extra dev --no-sync pytest python/tests/test_namespace.py -q`
This commit is contained in:
@@ -5,10 +5,63 @@
|
||||
|
||||
import tempfile
|
||||
import shutil
|
||||
import sys
|
||||
import pytest
|
||||
import pyarrow as pa
|
||||
import lancedb
|
||||
from lance_namespace.errors import NamespaceNotEmptyError, TableNotFoundError
|
||||
from lancedb.table import AsyncTable, LanceTable
|
||||
|
||||
|
||||
# Shared 12-row fixture: the payload the fake namespace client serves and the
# value the pushdown tests compare their results against.
PUSHDOWN_DATA = pa.table(
    {
        "id": [*range(12)],
        "text": [f"row-{idx}" for idx in range(12)],
    }
)
|
||||
|
||||
|
||||
def _ipc_file(table: pa.Table = PUSHDOWN_DATA) -> bytes:
    """Serialize ``table`` to the Arrow IPC file format.

    Args:
        table: Table to encode; defaults to the ``PUSHDOWN_DATA`` fixture.

    Returns:
        The complete IPC file as a ``bytes`` object.
    """
    buffer = pa.BufferOutputStream()
    # Leaving the ``with`` block closes the writer before the buffer is read.
    with pa.ipc.new_file(buffer, table.schema) as ipc_writer:
        ipc_writer.write_table(table)
    payload = buffer.getvalue()
    return payload.to_pybytes()
|
||||
|
||||
|
||||
class _FailingSyncInner:
    """Stub inner table installed on a ``LanceTable`` by the pushdown tests.

    It exposes a table name and a schema, but its ``to_arrow`` always raises,
    so any read that goes through the inner table directly fails loudly.
    """

    # Table name, exposed as a plain attribute.
    name = "hist"

    async def to_arrow(self):
        """Always raise: a direct inner-table read is the path under test."""
        message = "direct table to_arrow should not be used"
        raise RuntimeError(message)

    async def schema(self):
        """Return the schema of the ``PUSHDOWN_DATA`` fixture."""
        fixture_schema = PUSHDOWN_DATA.schema
        return fixture_schema
|
||||
|
||||
|
||||
class _FailingAsyncInner:
    """Stub inner table handed to ``AsyncTable`` by the async pushdown test.

    It answers ``name()`` and ``schema()``, but ``query()`` always raises, so
    building a query on the inner table directly fails the test.
    """

    def query(self):
        """Always raise: a direct inner-table query is the path under test."""
        message = "direct async query should not be used"
        raise AssertionError(message)

    async def schema(self):
        """Return the schema of the ``PUSHDOWN_DATA`` fixture."""
        fixture_schema = PUSHDOWN_DATA.schema
        return fixture_schema

    def name(self):
        """Return the fixed table name."""
        table_name = "hist"
        return table_name
|
||||
|
||||
|
||||
class _NamespaceClient:
    """Fake namespace client that records every ``query_table`` request.

    Each call is answered with the IPC-encoded default fixture produced by
    ``_ipc_file()``, regardless of the request contents.
    """

    def __init__(self):
        # Requests in the order they were received, for later inspection.
        self.requests = list()

    def query_table(self, request):
        """Record ``request`` and return the canned Arrow IPC payload."""
        recorded = self.requests
        # Record first so the request is kept even if encoding were to fail.
        recorded.append(request)
        response = _ipc_file()
        return response
|
||||
|
||||
|
||||
def _namespace_lance_table(namespace_client: _NamespaceClient) -> LanceTable:
    """Build a ``LanceTable`` wired to ``namespace_client`` without opening one.

    The instance is created with ``__new__`` so ``LanceTable.__init__`` is
    skipped; only the attributes listed below are set on it.

    Args:
        namespace_client: Fake client stored on the table.

    Returns:
        The partially initialised ``LanceTable``.
    """
    stub = LanceTable.__new__(LanceTable)
    # Assigned in insertion order, mirroring one attribute assignment each.
    attributes = {
        "_table": _FailingSyncInner(),
        "_namespace_path": ["geneva"],
        "_namespace_client": namespace_client,
        "_pushdown_operations": {"QueryTable"},
    }
    for attr_name, value in attributes.items():
        setattr(stub, attr_name, value)
    return stub
|
||||
|
||||
|
||||
class TestNamespaceConnection:
|
||||
@@ -736,6 +789,22 @@ class TestPushdownOperations:
|
||||
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||
assert len(db._namespace_client_pushdown_operations) == 0
|
||||
|
||||
def test_lance_table_to_arrow_uses_query_pushdown(self):
    """Full reads on a namespace-backed ``LanceTable`` go through the client.

    The inner table stub raises on a direct ``to_arrow``, so both reads must
    be served by the fake client's ``query_table``.
    """
    client = _NamespaceClient()
    table = _namespace_lance_table(client)

    arrow_result = table.to_arrow()
    assert arrow_result.equals(PUSHDOWN_DATA)

    pandas_ids = table.to_pandas()["id"].tolist()
    assert pandas_ids == list(range(12))

    # One recorded request per read above.
    recorded = client.requests
    assert len(recorded) == 2
    assert [req.id for req in recorded] == [["geneva", "hist"]] * 2
    # presumably sys.maxsize is how the no-limit query is encoded; confirm
    # against the query builder.
    assert [req.k for req in recorded] == [sys.maxsize] * 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
class TestAsyncPushdownOperations:
|
||||
@@ -771,3 +840,39 @@ class TestAsyncPushdownOperations:
|
||||
"""Test that pushdown operations default to empty on async connection."""
|
||||
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
|
||||
assert len(db._namespace_client_pushdown_operations) == 0
|
||||
|
||||
async def test_async_table_to_arrow_uses_query_pushdown(self):
    """Full reads on a namespace-backed ``AsyncTable`` go through the client.

    The inner table stub raises on ``query()``, so both reads must be served
    by the fake client's ``query_table``.
    """
    client = _NamespaceClient()
    table = AsyncTable(
        _FailingAsyncInner(),
        namespace_path=["geneva"],
        namespace_client=client,
        pushdown_operations={"QueryTable"},
    )

    arrow_result = await table.to_arrow()
    assert arrow_result.equals(PUSHDOWN_DATA)

    frame = await table.to_pandas()
    assert frame["id"].tolist() == list(range(12))

    # One recorded request per read above.
    recorded = client.requests
    assert len(recorded) == 2
    assert [req.id for req in recorded] == [["geneva", "hist"]] * 2
    # presumably sys.maxsize is how the no-limit query is encoded; confirm
    # against the query builder.
    assert [req.k for req in recorded] == [sys.maxsize] * 2
|
||||
|
||||
|
||||
def test_local_table_to_arrow_and_to_pandas_are_unchanged(tmp_path):
    """A plain local table still returns its rows from both export methods."""
    rows = [
        {"id": 1, "vector": [1.0, 2.0]},
        {"id": 2, "vector": [3.0, 4.0]},
    ]
    db_uri = str(tmp_path / "db")
    table = lancedb.connect(db_uri).create_table("local", data=rows)

    expected_ids = [1, 2]
    assert table.to_arrow().column("id").to_pylist() == expected_ids
    assert table.to_pandas()["id"].tolist() == expected_ids
|
||||
|
||||
Reference in New Issue
Block a user