feat(python): Support reranking for vector and fts (#1103)

solves https://github.com/lancedb/lancedb/issues/1086

Usage Reranking with FTS:
```
retriever = db.create_table("fine-tuning", schema=Schema, mode="overwrite")
pylist = [{"text": "Carson City is the capital city of the American state of Nevada. At the  2010 United States Census, Carson City had a population of 55,274."},
          {"text": "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean that are a political division controlled by the United States. Its capital is Saipan."},
        {"text": "Charlotte Amalie is the capital and largest city of the United States Virgin Islands. It has about 20,000 people. The city is on the island of Saint Thomas."},
        {"text": "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district. "},
        {"text": "Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."},
        {"text": "North Dakota is a state in the United States. 672,591 people lived in North Dakota in the year 2010. The capital and seat of government is Bismarck."},
        ]
retriever.add(pylist)
retriever.create_fts_index("text", replace=True)

query = "What is the capital of the United States?"
reranker = CohereReranker(return_score="all")
print(retriever.search(query, query_type="fts").limit(10).to_pandas())
print(retriever.search(query, query_type="fts").rerank(reranker=reranker).limit(10).to_pandas())
```
Result
```
                                                text                                             vector     score
0  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  0.729602
1  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  0.678046
2  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  0.671521
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  0.667898
4  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  0.653422
5  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  0.639346
                                                text                                             vector     score  _relevance_score
0  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  0.653422          0.979977
1  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  0.671521          0.299105
2  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  0.729602          0.284874
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  0.667898          0.089614
4  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  0.639346          0.063832
5  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  0.678046          0.041462
```

## Vector Search usage:
```
query = "What is the capital of the United States?"
reranker = CohereReranker(return_score="all")
print(retriever.search(query).limit(10).to_pandas())
print(retriever.search(query).rerank(reranker=reranker, query=query).limit(10).to_pandas()) # <-- Note: passing extra string query here
```

Results
```
                                                text                                             vector  _distance
0  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  39.728973
1  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  41.384884
2  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  55.220200
3  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  58.345654
4  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  60.060867
5  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  64.260544
                                                text                                             vector  _distance  _relevance_score
0  Washington, D.C. (also known as simply Washing...  [-0.0090408325, 0.42578125, 0.3798828, -0.3574...  41.384884          0.979977
1  The Commonwealth of the Northern Mariana Islan...  [0.3684082, 0.30493164, 0.004600525, -0.049407...  60.060867          0.299105
2  Capital punishment (the death penalty) has exi...  [0.099975586, 0.047943115, -0.16723633, -0.183...  39.728973          0.284874
3  Carson City is the capital city of the America...  [0.13989258, 0.14990234, 0.14172363, 0.0546569...  55.220200          0.089614
4  North Dakota is a state in the United States. ...  [0.55859375, -0.2109375, 0.14526367, 0.1634521...  64.260544          0.063832
5  Charlotte Amalie is the capital and largest ci...  [-0.021255493, 0.03363037, -0.027450562, -0.17...  58.345654          0.041462
```
This commit is contained in:
Ayush Chaurasia
2024-03-19 22:20:31 +05:30
committed by Weston Pace
parent b36c750cc7
commit 42fad84ec8
7 changed files with 399 additions and 59 deletions

View File

@@ -124,8 +124,9 @@ def test_linear_combination(tmp_path):
)
def test_cohere_reranker(tmp_path):
pytest.importorskip("cohere")
reranker = CohereReranker()
table, schema = get_test_table(tmp_path)
# The default reranker
# Hybrid search setting
result1 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(normalize="score", reranker=CohereReranker())
@@ -133,7 +134,7 @@ def test_cohere_reranker(tmp_path):
)
result2 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(reranker=CohereReranker())
.rerank(reranker=reranker)
.to_pydantic(schema)
)
assert result1 == result2
@@ -143,64 +144,120 @@ def test_cohere_reranker(tmp_path):
result = (
table.search((query_vector, query))
.limit(30)
.rerank(reranker=CohereReranker())
.rerank(reranker=reranker)
.to_arrow()
)
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
err = (
"The _relevance_score column of the results returned by the reranker "
"represents the relevance of the result to the query & should "
"be descending."
)
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
# Vector search setting
query = "Our father who art in heaven"
result = table.search(query).rerank(reranker=reranker).limit(30).to_arrow()
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
result_explicit = (
table.search(query_vector)
.rerank(reranker=reranker, query=query)
.limit(30)
.to_arrow()
)
assert len(result_explicit) == 30
with pytest.raises(
ValueError
): # This raises an error because vector query is provided without reanking query
table.search(query_vector).rerank(reranker=reranker).limit(30).to_arrow()
# FTS search setting
result = (
table.search(query, query_type="fts")
.rerank(reranker=reranker)
.limit(30)
.to_arrow()
)
assert len(result) > 0
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
def test_cross_encoder_reranker(tmp_path):
pytest.importorskip("sentence_transformers")
reranker = CrossEncoderReranker()
table, schema = get_test_table(tmp_path)
result1 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(normalize="score", reranker=CrossEncoderReranker())
.rerank(normalize="score", reranker=reranker)
.to_pydantic(schema)
)
result2 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(reranker=CrossEncoderReranker())
.rerank(reranker=reranker)
.to_pydantic(schema)
)
assert result1 == result2
# test explicit hybrid query
query = "Our father who art in heaven"
query_vector = table.to_pandas()["vector"][0]
result = (
table.search((query_vector, query), query_type="hybrid")
.limit(30)
.rerank(reranker=CrossEncoderReranker())
.rerank(reranker=reranker)
.to_arrow()
)
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
err = (
"The _relevance_score column of the results returned by the reranker "
"represents the relevance of the result to the query & should "
"be descending."
)
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
# Vector search setting
result = table.search(query).rerank(reranker=reranker).limit(30).to_arrow()
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
result_explicit = (
table.search(query_vector)
.rerank(reranker=reranker, query=query)
.limit(30)
.to_arrow()
)
assert len(result_explicit) == 30
with pytest.raises(
ValueError
): # This raises an error because vector query is provided without reanking query
table.search(query_vector).rerank(reranker=reranker).limit(30).to_arrow()
# FTS search setting
result = (
table.search(query, query_type="fts")
.rerank(reranker=reranker)
.limit(30)
.to_arrow()
)
assert len(result) > 0
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
def test_colbert_reranker(tmp_path):
pytest.importorskip("transformers")
reranker = ColbertReranker()
table, schema = get_test_table(tmp_path)
result1 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(normalize="score", reranker=ColbertReranker())
.rerank(normalize="score", reranker=reranker)
.to_pydantic(schema)
)
result2 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(reranker=ColbertReranker())
.rerank(reranker=reranker)
.to_pydantic(schema)
)
assert result1 == result2
@@ -211,17 +268,43 @@ def test_colbert_reranker(tmp_path):
result = (
table.search((query_vector, query))
.limit(30)
.rerank(reranker=ColbertReranker())
.rerank(reranker=reranker)
.to_arrow()
)
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
err = (
"The _relevance_score column of the results returned by the reranker "
"represents the relevance of the result to the query & should "
"be descending."
)
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
# Vector search setting
result = table.search(query).rerank(reranker=reranker).limit(30).to_arrow()
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
result_explicit = (
table.search(query_vector)
.rerank(reranker=reranker, query=query)
.limit(30)
.to_arrow()
)
assert len(result_explicit) == 30
with pytest.raises(
ValueError
): # This raises an error because vector query is provided without reanking query
table.search(query_vector).rerank(reranker=reranker).limit(30).to_arrow()
# FTS search setting
result = (
table.search(query, query_type="fts")
.rerank(reranker=reranker)
.limit(30)
.to_arrow()
)
assert len(result) > 0
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
@pytest.mark.skipif(
@@ -230,9 +313,10 @@ def test_colbert_reranker(tmp_path):
def test_openai_reranker(tmp_path):
pytest.importorskip("openai")
table, schema = get_test_table(tmp_path)
reranker = OpenaiReranker()
result1 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(normalize="score", reranker=OpenaiReranker())
.rerank(normalize="score", reranker=reranker)
.to_pydantic(schema)
)
result2 = (
@@ -248,14 +332,40 @@ def test_openai_reranker(tmp_path):
result = (
table.search((query_vector, query))
.limit(30)
.rerank(reranker=OpenaiReranker())
.rerank(reranker=reranker)
.to_arrow()
)
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
err = (
"The _relevance_score column of the results returned by the reranker "
"represents the relevance of the result to the query & should "
"be descending."
)
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
# Vector search setting
result = table.search(query).rerank(reranker=reranker).limit(30).to_arrow()
assert len(result) == 30
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err
result_explicit = (
table.search(query_vector)
.rerank(reranker=reranker, query=query)
.limit(30)
.to_arrow()
)
assert len(result_explicit) == 30
with pytest.raises(
ValueError
): # This raises an error because vector query is provided without reanking query
table.search(query_vector).rerank(reranker=reranker).limit(30).to_arrow()
# FTS search setting
result = (
table.search(query, query_type="fts")
.rerank(reranker=reranker)
.limit(30)
.to_arrow()
)
assert len(result) > 0
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), err