Files
lancedb/python/python/lancedb/rerankers/base.py
Ayush Chaurasia 42fad84ec8 feat(python): Support reranking for vector and fts (#1103)
solves https://github.com/lancedb/lancedb/issues/1086

Usage Reranking with FTS:
```
retriever = db.create_table("fine-tuning", schema=Schema, mode="overwrite")
pylist = [{"text": "Carson City is the capital city of the American state of Nevada. At the  2010 United States Census, Carson City had a population of 55,274."},
          {"text": "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean that are a political division controlled by the United States. Its capital is Saipan."},
        {"text": "Charlotte Amalie is the capital and largest city of the United States Virgin Islands. It has about 20,000 people. The city is on the island of Saint Thomas."},
        {"text": "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district. "},
        {"text": "Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."},
        {"text": "North Dakota is a state in the United States. 672,591 people lived in North Dakota in the year 2010. The capital and seat of government is Bismarck."},
        ]
retriever.add(pylist)
retriever.create_fts_index("text", replace=True)

query = "What is the capital of the United States?"
reranker = CohereReranker(return_score="all")
print(retriever.search(query, query_type="fts").limit(10).to_pandas())
print(retriever.search(query, query_type="fts").rerank(reranker=reranker).limit(10).to_pandas())
```
Result
```
                                                text                                             vector     score
0  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  0.729602
1  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  0.678046
2  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  0.671521
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  0.667898
4  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  0.653422
5  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  0.639346
                                                text                                             vector     score  _relevance_score
0  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  0.653422          0.979977
1  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  0.671521          0.299105
2  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  0.729602          0.284874
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  0.667898          0.089614
4  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  0.639346          0.063832
5  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  0.678046          0.041462
```

## Vector Search usage:
```
query = "What is the capital of the United States?"
reranker = CohereReranker(return_score="all")
print(retriever.search(query).limit(10).to_pandas())
print(retriever.search(query).rerank(reranker=reranker, query=query).limit(10).to_pandas()) # <-- Note: passing extra string query here
```

Results
```
                                                text                                             vector  _distance
0  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  39.728973
1  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  41.384884
2  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  55.220200
3  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  58.345654
4  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  60.060867
5  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  64.260544
                                                text                                             vector  _distance  _relevance_score
0  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  41.384884          0.979977
1  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  60.060867          0.299105
2  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  39.728973          0.284874
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  55.220200          0.089614
4  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  64.260544          0.063832
5  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  58.345654          0.041462
```
2024-04-05 16:33:06 -07:00

132 lines
3.9 KiB
Python

from abc import ABC, abstractmethod
import numpy as np
import pyarrow as pa
class Reranker(ABC):
def __init__(self, return_score: str = "relevance"):
"""
Interface for a reranker. A reranker is used to rerank the results from a
vector and FTS search. This is useful for combining the results from both
search methods.
Parameters
----------
return_score : str, default "relevance"
opntions are "relevance" or "all"
The type of score to return. If "relevance", will return only the relevance
score. If "all", will return all scores from the vector and FTS search along
with the relevance score.
"""
if return_score not in ["relevance", "all"]:
raise ValueError("score must be either 'relevance' or 'all'")
self.score = return_score
def rerank_vector(
self,
query: str,
vector_results: pa.Table,
):
"""
Rerank function receives the result from the vector search.
This isn't mandatory to implement
Parameters
----------
query : str
The input query
vector_results : pa.Table
The results from the vector search
Returns
-------
pa.Table
The reranked results
"""
raise NotImplementedError(
f"{self.__class__.__name__} does not implement rerank_vector"
)
def rerank_fts(
self,
query: str,
fts_results: pa.Table,
):
"""
Rerank function receives the result from the FTS search.
This isn't mandatory to implement
Parameters
----------
query : str
The input query
fts_results : pa.Table
The results from the FTS search
Returns
-------
pa.Table
The reranked results
"""
raise NotImplementedError(
f"{self.__class__.__name__} does not implement rerank_fts"
)
@abstractmethod
def rerank_hybrid(
self,
query: str,
vector_results: pa.Table,
fts_results: pa.Table,
):
"""
Rerank function receives the individual results from the vector and FTS search
results. You can choose to use any of the results to generate the final results,
allowing maximum flexibility. This is mandatory to implement
Parameters
----------
query : str
The input query
vector_results : pa.Table
The results from the vector search
fts_results : pa.Table
The results from the FTS search
Returns
-------
pa.Table
The reranked results
"""
pass
def merge_results(self, vector_results: pa.Table, fts_results: pa.Table):
"""
Merge the results from the vector and FTS search. This is a vanilla merging
function that just concatenates the results and removes the duplicates.
NOTE: This doesn't take score into account. It'll keep the instance that was
encountered first. This is designed for rerankers that don't use the score.
In case you want to use the score, or support `return_scores="all"` you'll
have to implement your own merging function.
Parameters
----------
vector_results : pa.Table
The results from the vector search
fts_results : pa.Table
The results from the FTS search
"""
combined = pa.concat_tables([vector_results, fts_results], promote=True)
row_id = combined.column("_rowid")
# deduplicate
mask = np.full((combined.shape[0]), False)
_, mask_indices = np.unique(np.array(row_id), return_index=True)
mask[mask_indices] = True
combined = combined.filter(mask=mask)
return combined