feat: add a pyarrow dataset adapter for LanceDB tables (#1902)

This currently only works for local tables (remote tables cannot be
queried).
This is also exclusive to the sync interface. However, since the pyarrow
dataset interface is synchronous, I am not sure if there is much value in
making an async-wrapping variant.

In addition, I added a `to_batches` method to the base query in the sync
API. This already exists in the async API. In the sync API, this PR only
adds `to_batches` support for vector queries and scalar queries; hybrid
and FTS queries are not yet supported.
This commit is contained in:
Weston Pace
2024-12-03 15:42:54 -08:00
committed by GitHub
parent d8c758513c
commit c998a47e17
5 changed files with 334 additions and 1 deletions

View File

@@ -0,0 +1,21 @@
import duckdb
import pyarrow as pa
import lancedb
from lancedb.integrations.pyarrow import PyarrowDatasetAdapter
def test_basic_query(tmp_path):
    """Check that DuckDB can run SQL aggregates over a LanceDB table.

    A four-row table with integer columns ``x`` and ``y`` is created in a
    LanceDB database rooted at ``tmp_path``, wrapped in a
    ``PyarrowDatasetAdapter``, and then summed column-by-column via DuckDB.
    """
    source = pa.table({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
    db = lancedb.connect(tmp_path)
    table = db.create_table("test", source)

    # The SQL below refers to ``adapter`` by name: DuckDB resolves it from
    # this function's local variables, so the variable must keep this exact
    # name and stay in this frame even though linters see it as unused.
    adapter = PyarrowDatasetAdapter(table)  # noqa: F841

    duck_conn = duckdb.connect()

    # Each query paired with the column total it must produce.
    expected_sums = {
        "SELECT SUM(x) FROM adapter": 10,
        "SELECT SUM(y) FROM adapter": 26,
    }
    for query, expected in expected_sums.items():
        rows = duck_conn.sql(query).fetchall()
        assert rows[0][0] == expected

View File

@@ -0,0 +1,47 @@
import pyarrow as pa
import lancedb
from lancedb.integrations.pyarrow import PyarrowDatasetAdapter
def test_dataset_adapter(tmp_path):
    """Exercise the dataset-style API exposed by ``PyarrowDatasetAdapter``.

    Three tables are created in a LanceDB database rooted at ``tmp_path``:

    * a small two-column table, used to check row counts, schema, ``head``,
      full reads and scanners (with and without column projection);
    * a 100-row table, used to check that full reads are not truncated;
    * an empty table, used to check that every entry point copes with
      zero rows.
    """
    # ---- Small two-column table -------------------------------------
    small = pa.table({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
    db = lancedb.connect(tmp_path)
    small_adapter = PyarrowDatasetAdapter(db.create_table("test", small))

    # Row counts, with and without a filter expression.
    assert small_adapter.count_rows() == 4
    assert small_adapter.count_rows("x > 2") == 2

    # Direct reads from the adapter itself.
    assert small_adapter.schema == small.schema
    assert small_adapter.head(2) == small.slice(0, 2)
    assert small_adapter.to_table() == small
    assert small_adapter.to_batches().read_all() == small

    # Reads through an unprojected scanner (a fresh scanner per check).
    assert small_adapter.scanner().to_table() == small
    assert small_adapter.scanner().to_batches().read_all() == small
    assert small_adapter.scanner().projected_schema == small.schema

    # Reads through a scanner projected down to the single column "x".
    x_only_schema = pa.schema([small.schema.field("x")])
    assert small_adapter.scanner(columns=["x"]).projected_schema == x_only_schema
    x_only_table = pa.table({"x": [1, 2, 3, 4]})
    assert small_adapter.scanner(columns=["x"]).to_table() == x_only_table

    # ---- 100-row table ----------------------------------------------
    # Make sure we bypass the limit: full reads must return every row,
    # while head() still honors the row count it is given.
    hundred = pa.table({"x": range(100)})
    hundred_adapter = PyarrowDatasetAdapter(db.create_table("test2", hundred))
    assert hundred_adapter.count_rows() == 100
    assert hundred_adapter.to_table().num_rows == 100
    assert hundred_adapter.head(10).num_rows == 10

    # ---- Empty table ------------------------------------------------
    empty_table = db.create_table(
        "test3", None, schema=pa.schema({"x": pa.int64()})
    )
    empty_adapter = PyarrowDatasetAdapter(empty_table)
    assert empty_adapter.count_rows() == 0
    assert empty_adapter.to_table().num_rows == 0
    assert empty_adapter.head(10).num_rows == 0
    assert empty_adapter.scanner().projected_schema == pa.schema(
        {"x": pa.int64()}
    )