feat: add to_batches API #805 (#1048)

SDK
Python

Description
Exposes the PyArrow record-batch API during query execution. This is
relevant when there is no vector search query, the dataset is large, and
the filtered result is larger than memory.

---------

Co-authored-by: Ishani Ghose <isghose@amazon.com>
Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
This commit is contained in:
Ishani Ghose
2024-03-20 13:38:06 -07:00
committed by Weston Pace
parent 968c62cb8f
commit 0838e12b30
5 changed files with 81 additions and 21 deletions

View File

@@ -13,6 +13,7 @@
import unittest.mock as mock
from datetime import timedelta
from typing import Optional
import lance
import lancedb
@@ -35,9 +36,9 @@ class MockTable:
def to_lance(self):
return lance.dataset(self.uri)
def _execute_query(self, query):
def _execute_query(self, query, batch_size: Optional[int] = None):
ds = self.to_lance()
return ds.to_table(
return ds.scanner(
columns=query.columns,
filter=query.filter,
prefilter=query.prefilter,
@@ -49,7 +50,8 @@ class MockTable:
"nprobes": query.nprobes,
"refine_factor": query.refine_factor,
},
)
batch_size=batch_size,
).to_reader()
@pytest.fixture
@@ -115,6 +117,25 @@ def test_query_builder(table):
assert all(np.array(rs[0]["vector"]) == [1, 2])
def test_query_builder_batches(table):
    """Check that ``to_batches`` yields the query result as record batches.

    Builds a two-row vector query projecting ``id`` and ``vector``, requests
    batches with a batch size of 1, and verifies both the type of every
    yielded item and the contents of the single batch that is expected.
    """
    builder = LanceVectorQueryBuilder(table, [0, 0], "vector")
    reader = builder.limit(2).select(["id", "vector"]).to_batches(1)

    # Drain the reader, checking each item's type as it arrives.
    batches = []
    for batch in reader:
        assert isinstance(batch, pa.RecordBatch)
        batches.append(batch)

    assert len(batches) == 1
    first = batches[0]
    assert len(first["id"]) == 2

    # Convert once and reuse the frame for the row-level checks.
    frame = first.to_pandas()
    assert all(frame["vector"][0] == [1.0, 2.0])
    assert frame["id"][0] == 1
    assert all(frame["vector"][1] == [3.0, 4.0])
    assert frame["id"][1] == 2
def test_dynamic_projection(table):
rs = (
LanceVectorQueryBuilder(table, [0, 0], "vector")
@@ -199,7 +220,8 @@ def test_query_builder_with_different_vector_column():
nprobes=20,
refine_factor=None,
vector_column="foo_vector",
)
),
None,
)