mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-09 15:20:41 +00:00
feat: add a pyarrow dataset adapater for LanceDB tables (#1902)
This currently only works for local tables (remote tables cannot be queried) This is also exclusive to the sync interface. However, since the pyarrow dataset interface is synchronous I am not sure if there is much value in making an async-wrapping variant. In addition, I added a `to_batches` method to the base query in the sync API. This already exists in the async API. In the sync API this PR only adds support for vector queries and scalar queries and not for hybrid or FTS queries.
This commit is contained in:
21
python/python/tests/test_duckdb.py
Normal file
21
python/python/tests/test_duckdb.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import duckdb
|
||||
import pyarrow as pa
|
||||
|
||||
import lancedb
|
||||
from lancedb.integrations.pyarrow import PyarrowDatasetAdapter
|
||||
|
||||
|
||||
def test_basic_query(tmp_path):
|
||||
data = pa.table({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
|
||||
conn = lancedb.connect(tmp_path)
|
||||
tbl = conn.create_table("test", data)
|
||||
|
||||
adapter = PyarrowDatasetAdapter(tbl) # noqa: F841
|
||||
|
||||
duck_conn = duckdb.connect()
|
||||
|
||||
results = duck_conn.sql("SELECT SUM(x) FROM adapter").fetchall()
|
||||
assert results[0][0] == 10
|
||||
|
||||
results = duck_conn.sql("SELECT SUM(y) FROM adapter").fetchall()
|
||||
assert results[0][0] == 26
|
||||
47
python/python/tests/test_pyarrow.py
Normal file
47
python/python/tests/test_pyarrow.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import pyarrow as pa
|
||||
|
||||
import lancedb
|
||||
from lancedb.integrations.pyarrow import PyarrowDatasetAdapter
|
||||
|
||||
|
||||
def test_dataset_adapter(tmp_path):
|
||||
data = pa.table({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
|
||||
conn = lancedb.connect(tmp_path)
|
||||
tbl = conn.create_table("test", data)
|
||||
|
||||
adapter = PyarrowDatasetAdapter(tbl)
|
||||
|
||||
assert adapter.count_rows() == 4
|
||||
assert adapter.count_rows("x > 2") == 2
|
||||
assert adapter.schema == data.schema
|
||||
assert adapter.head(2) == data.slice(0, 2)
|
||||
assert adapter.to_table() == data
|
||||
assert adapter.to_batches().read_all() == data
|
||||
assert adapter.scanner().to_table() == data
|
||||
assert adapter.scanner().to_batches().read_all() == data
|
||||
|
||||
assert adapter.scanner().projected_schema == data.schema
|
||||
assert adapter.scanner(columns=["x"]).projected_schema == pa.schema(
|
||||
[data.schema.field("x")]
|
||||
)
|
||||
assert adapter.scanner(columns=["x"]).to_table() == pa.table({"x": [1, 2, 3, 4]})
|
||||
|
||||
# Make sure we bypass the limit
|
||||
data = pa.table({"x": range(100)})
|
||||
tbl = conn.create_table("test2", data)
|
||||
|
||||
adapter = PyarrowDatasetAdapter(tbl)
|
||||
|
||||
assert adapter.count_rows() == 100
|
||||
assert adapter.to_table().num_rows == 100
|
||||
assert adapter.head(10).num_rows == 10
|
||||
|
||||
# Empty table
|
||||
tbl = conn.create_table("test3", None, schema=pa.schema({"x": pa.int64()}))
|
||||
adapter = PyarrowDatasetAdapter(tbl)
|
||||
|
||||
assert adapter.count_rows() == 0
|
||||
assert adapter.to_table().num_rows == 0
|
||||
assert adapter.head(10).num_rows == 0
|
||||
|
||||
assert adapter.scanner().projected_schema == pa.schema({"x": pa.int64()})
|
||||
Reference in New Issue
Block a user