feat: add to_batches API #805 (#1048)

SDK
Python

Description
Exposes the PyArrow batch API during query execution. This is relevant when there is no vector search query, the dataset is large, and the filtered result is larger than memory.
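A minimal usage sketch of the batch API added here, assuming the query builder exposes a to_batches() method yielding pyarrow.RecordBatch objects; the database path, table name, and filter expression below are hypothetical, for illustration only.

import lancedb

# Hypothetical database path and table name.
db = lancedb.connect("./example-lancedb")
table = db.open_table("documents")

# Plain filtered scan (no vector search query) over a large dataset.
# Assumed: to_batches() yields pyarrow.RecordBatch objects, so the
# filtered result never has to fit in memory all at once.
query = table.search().where("category = 'news'")

total_rows = 0
for batch in query.to_batches():
    total_rows += batch.num_rows  # process one RecordBatch at a time
print(f"streamed {total_rows} rows without materializing the full result")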

---------

Co-authored-by: Ishani Ghose <isghose@amazon.com>
Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
Ishani Ghose
2024-03-20 13:38:06 -07:00
committed by Weston Pace
parent 968c62cb8f
commit 0838e12b30
5 changed files with 81 additions and 21 deletions


@@ -163,7 +163,7 @@ def test_cohere_reranker(tmp_path):
     assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
     result_explicit = (
         table.search(query_vector)
-        .rerank(reranker=reranker, query=query)
+        .rerank(reranker=reranker, query_string=query)
         .limit(30)
         .to_arrow()
     )
@@ -225,7 +225,7 @@ def test_cross_encoder_reranker(tmp_path):
     result_explicit = (
         table.search(query_vector)
-        .rerank(reranker=reranker, query=query)
+        .rerank(reranker=reranker, query_string=query)
         .limit(30)
         .to_arrow()
     )
@@ -286,7 +286,7 @@ def test_colbert_reranker(tmp_path):
     assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
     result_explicit = (
         table.search(query_vector)
-        .rerank(reranker=reranker, query=query)
+        .rerank(reranker=reranker, query_string=query)
         .limit(30)
         .to_arrow()
     )
@@ -351,7 +351,7 @@ def test_openai_reranker(tmp_path):
     assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
     result_explicit = (
         table.search(query_vector)
-        .rerank(reranker=reranker, query=query)
+        .rerank(reranker=reranker, query_string=query)
         .limit(30)
         .to_arrow()
     )