Files
lancedb/python/python/lancedb/rerankers/cohere.py
Ayush Chaurasia 0a387a5429 feat(python): Support reranking for vector and fts (#1103)
solves https://github.com/lancedb/lancedb/issues/1086

Usage Reranking with FTS:
```
retriever = db.create_table("fine-tuning", schema=Schema, mode="overwrite")
pylist = [{"text": "Carson City is the capital city of the American state of Nevada. At the  2010 United States Census, Carson City had a population of 55,274."},
          {"text": "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean that are a political division controlled by the United States. Its capital is Saipan."},
        {"text": "Charlotte Amalie is the capital and largest city of the United States Virgin Islands. It has about 20,000 people. The city is on the island of Saint Thomas."},
        {"text": "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district. "},
        {"text": "Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."},
        {"text": "North Dakota is a state in the United States. 672,591 people lived in North Dakota in the year 2010. The capital and seat of government is Bismarck."},
        ]
retriever.add(pylist)
retriever.create_fts_index("text", replace=True)

query = "What is the capital of the United States?"
reranker = CohereReranker(return_score="all")
print(retriever.search(query, query_type="fts").limit(10).to_pandas())
print(retriever.search(query, query_type="fts").rerank(reranker=reranker).limit(10).to_pandas())
```
Result
```
                                                text                                             vector     score
0  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  0.729602
1  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  0.678046
2  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  0.671521
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  0.667898
4  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  0.653422
5  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  0.639346
                                                text                                             vector     score  _relevance_score
0  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  0.653422          0.979977
1  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  0.671521          0.299105
2  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  0.729602          0.284874
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  0.667898          0.089614
4  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  0.639346          0.063832
5  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  0.678046          0.041462
```

## Vector Search usage:
```
query = "What is the capital of the United States?"
reranker = CohereReranker(return_score="all")
print(retriever.search(query).limit(10).to_pandas())
print(retriever.search(query).rerank(reranker=reranker, query=query).limit(10).to_pandas()) # <-- Note: passing extra string query here
```

Results
```
                                                text                                             vector  _distance
0  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  39.728973
1  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  41.384884
2  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  55.220200
3  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  58.345654
4  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  60.060867
5  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  64.260544
                                                text                                             vector  _distance  _relevance_score
0  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  41.384884          0.979977
1  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  60.060867          0.299105
2  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  39.728973          0.284874
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  55.220200          0.089614
4  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  64.260544          0.063832
5  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  58.345654          0.041462
```
2024-03-19 22:20:31 +05:30

108 lines
3.4 KiB
Python

import os
from functools import cached_property
from typing import Union
import pyarrow as pa
from ..util import attempt_import_or_raise
from .base import Reranker
class CohereReranker(Reranker):
"""
Reranks the results using the Cohere Rerank API.
https://docs.cohere.com/docs/rerank-guide
Parameters
----------
model_name : str, default "rerank-english-v2.0"
The name of the cross encoder model to use. Available cohere models are:
- rerank-english-v2.0
- rerank-multilingual-v2.0
column : str, default "text"
The name of the column to use as input to the cross encoder model.
top_n : str, default None
The number of results to return. If None, will return all results.
"""
def __init__(
self,
model_name: str = "rerank-english-v2.0",
column: str = "text",
top_n: Union[int, None] = None,
return_score="relevance",
api_key: Union[str, None] = None,
):
super().__init__(return_score)
self.model_name = model_name
self.column = column
self.top_n = top_n
self.api_key = api_key
@cached_property
def _client(self):
cohere = attempt_import_or_raise("cohere")
if os.environ.get("COHERE_API_KEY") is None and self.api_key is None:
raise ValueError(
"COHERE_API_KEY not set. Either set it in your environment or \
pass it as `api_key` argument to the CohereReranker."
)
return cohere.Client(os.environ.get("COHERE_API_KEY") or self.api_key)
def _rerank(self, result_set: pa.Table, query: str):
docs = result_set[self.column].to_pylist()
results = self._client.rerank(
query=query,
documents=docs,
top_n=self.top_n,
model=self.model_name,
) # returns list (text, idx, relevance) attributes sorted descending by score
indices, scores = list(
zip(*[(result.index, result.relevance_score) for result in results])
) # tuples
result_set = result_set.take(list(indices))
# add the scores
result_set = result_set.append_column(
"_relevance_score", pa.array(scores, type=pa.float32())
)
return result_set
def rerank_hybrid(
self,
query: str,
vector_results: pa.Table,
fts_results: pa.Table,
):
combined_results = self.merge_results(vector_results, fts_results)
combined_results = self._rerank(combined_results, query)
if self.score == "relevance":
combined_results = combined_results.drop_columns(["score", "_distance"])
elif self.score == "all":
raise NotImplementedError(
"return_score='all' not implemented for cohere reranker"
)
return combined_results
def rerank_vector(
self,
query: str,
vector_results: pa.Table,
):
result_set = self._rerank(vector_results, query)
if self.score == "relevance":
result_set = result_set.drop_columns(["_distance"])
return result_set
def rerank_fts(
self,
query: str,
fts_results: pa.Table,
):
result_set = self._rerank(fts_results, query)
if self.score == "relevance":
result_set = result_set.drop_columns(["score"])
return result_set