Files
lancedb/python/lancedb/rerankers/linear_combination.py
Ayush Chaurasia a41f7be88d feat(python): Hybrid search & Reranker API (#824)
based on https://github.com/lancedb/lancedb/pull/713
- The Reranker api can be plugged into vector only or fts only search
but this PR doesn't do that (see example -
https://txt.cohere.com/rerank/)


### Default reranker -- `LinearCombinationReranker(weight=0.7,
fill=1.0)`

```
table.search("hello", query_type="hybrid").rerank(normalize="score").to_pandas()
```
### Available rerankers
LinearCombinationReranker
```
from lancedb.rerankers import LinearCombinationReranker

# Same as default 
table.search("hello", query_type="hybrid").rerank(
                                      normalize="score", 
                                      reranker=LinearCombinationReranker()
                                     ).to_pandas()

# with custom params
reranker = LinearCombinationReranker(weight=0.3, fill=1.0)
table.search("hello", query_type="hybrid").rerank(
                                      normalize="score", 
                                      reranker=reranker
                                     ).to_pandas()
```

Cohere Reranker
```
from lancedb.rerankers import CohereReranker

# default model.. English and multi-lingual supported. See docstring for available custom params
table.search("hello", query_type="hybrid").rerank(
                                      normalize="rank",  # score or rank
                                      reranker=CohereReranker()
                                     ).to_pandas()

```

CrossEncoderReranker

```
from lancedb.rerankers import CrossEncoderReranker

table.search("hello", query_type="hybrid").rerank(
                                      normalize="rank", 
                                      reranker=CrossEncoderReranker()
                                     ).to_pandas()

```

## Using custom Reranker
```
from lancedb.reranker import Reranker

class CustomReranker(Reranker):
    def rerank_hybrid(self, vector_result, fts_result):
           combined_res = self.merge_results(vector_results, fts_results) # or use custom combination logic
           # Custom rerank logic here
           
           return combined_res
```

- [x] Expand testing
- [x] Make sure usage makes sense
- [x] Run simple benchmarks for correctness (Seeing weird result from
cohere reranker in the toy example)
- Support diverse rerankers by default:
- [x] Cross encoding
- [x] Cohere
- [x] Reciprocal Rank Fusion

---------

Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
Co-authored-by: Prashanth Rao <35005448+prrao87@users.noreply.github.com>
2024-04-05 16:28:56 -07:00

118 lines
4.4 KiB
Python

from typing import List
import pyarrow as pa
from .base import Reranker
class LinearCombinationReranker(Reranker):
"""
Reranks the results using a linear combination of the scores from the
vector and FTS search. For missing scores, fill with `fill` value.
Parameters
----------
weight : float, default 0.7
The weight to give to the vector score. Must be between 0 and 1.
fill : float, default 1.0
The score to give to results that are only in one of the two result sets.
This is treated as penalty, so a higher value means a lower score.
TODO: We should just hardcode this--
its pretty confusing as we invert scores to calculate final score
return_score : str, default "relevance"
opntions are "relevance" or "all"
The type of score to return. If "relevance", will return only the relevance
score. If "all", will return all scores from the vector and FTS search along
with the relevance score.
"""
def __init__(
self, weight: float = 0.7, fill: float = 1.0, return_score="relevance"
):
if weight < 0 or weight > 1:
raise ValueError("weight must be between 0 and 1.")
super().__init__(return_score)
self.weight = weight
self.fill = fill
def rerank_hybrid(
self,
query_builder: "lancedb.HybridQueryBuilder", # noqa: F821
vector_results: pa.Table,
fts_results: pa.Table,
):
combined_results = self.merge_results(vector_results, fts_results, self.fill)
return combined_results
def merge_results(
self, vector_results: pa.Table, fts_results: pa.Table, fill: float
):
# If both are empty then just return an empty table
if len(vector_results) == 0 and len(fts_results) == 0:
return vector_results
# If one is empty then return the other
if len(vector_results) == 0:
return fts_results
if len(fts_results) == 0:
return vector_results
# sort both input tables on _rowid
combined_list = []
vector_list = vector_results.sort_by("_rowid").to_pylist()
fts_list = fts_results.sort_by("_rowid").to_pylist()
i, j = 0, 0
while i < len(vector_list):
if j >= len(fts_list):
for vi in vector_list[i:]:
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
combined_list.append(vi)
break
vi = vector_list[i]
fj = fts_list[j]
# invert the fts score from relevance to distance
inverted_fts_score = self._invert_score(fj["score"])
if vi["_rowid"] == fj["_rowid"]:
vi["_relevance_score"] = self._combine_score(
vi["_distance"], inverted_fts_score
)
vi["score"] = fj["score"] # keep the original score
combined_list.append(vi)
i += 1
j += 1
elif vector_list[i]["_rowid"] < fts_list[j]["_rowid"]:
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
combined_list.append(vi)
i += 1
else:
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
combined_list.append(fj)
j += 1
if j < len(fts_list) - 1:
for fj in fts_list[j:]:
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
combined_list.append(fj)
relevance_score_schema = pa.schema(
[
pa.field("_relevance_score", pa.float32()),
]
)
combined_schema = pa.unify_schemas(
[vector_results.schema, fts_results.schema, relevance_score_schema]
)
tbl = pa.Table.from_pylist(combined_list, schema=combined_schema).sort_by(
[("_relevance_score", "descending")]
)
if self.score == "relevance":
tbl = tbl.drop_columns(["score", "_distance"])
return tbl
def _combine_score(self, score1, score2):
# these scores represent distance
return 1 - (self.weight * score1 + (1 - self.weight) * score2)
def _invert_score(self, scores: List[float]):
# Invert the scores between relevance and distance
return 1 - scores