feat!: better api for manual hybrid queries (#1575)

Currently, the only documented way of performing hybrid search is by
using embedding API and passing string queries that get automatically
embedded. There are use cases where users might like to pass vectors and
text manually instead.
This ticket contains more information and historical context -
https://github.com/lancedb/lancedb/issues/937

This breaks a undocumented pathway that allowed passing (vector, text)
tuple queries which was intended to be temporary, so this is marked as a
breaking change. For all practical purposes, this should not really
impact most users

### usage
```
results = table.search(query_type="hybrid")
                .vector(vector_query)
                .text(text_query)
                .limit(5)
                .to_pandas()
```
This commit is contained in:
Ayush Chaurasia
2024-08-30 17:37:58 +05:30
committed by GitHub
parent 1521435193
commit dc72ece847
3 changed files with 131 additions and 116 deletions

View File

@@ -111,7 +111,9 @@ def _run_test_reranker(reranker, table, query, query_vector, schema):
query_vector = table.to_pandas()["vector"][0]
result = (
table.search((query_vector, query), vector_column_name="vector")
table.search(query_type="hybrid", vector_column_name="vector")
.vector(query_vector)
.text(query)
.limit(30)
.rerank(reranker=reranker)
.to_arrow()
@@ -207,14 +209,26 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
query = "Our father who art in heaven"
query_vector = table.to_pandas()["vector"][0]
result = (
table.search((query_vector, query), vector_column_name="vector")
table.search(query_type="hybrid", vector_column_name="vector")
.vector(query_vector)
.text(query)
.limit(30)
.rerank(normalize="score")
.to_arrow()
)
assert len(result) == 30
# Fail if both query and (vector or text) are provided
with pytest.raises(ValueError):
table.search(query, query_type="hybrid", vector_column_name="vector").vector(
query_vector
).to_arrow()
with pytest.raises(ValueError):
table.search(query, query_type="hybrid", vector_column_name="vector").text(
query
).to_arrow()
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
"The _relevance_score column of the results returned by the reranker "
"represents the relevance of the result to the query & should "