From 715b81c86b0055789cd0f012f7bb26ed1899bc0f Mon Sep 17 00:00:00 2001
From: Omair Afzal <32237905+omair445@users.noreply.github.com>
Date: Wed, 18 Feb 2026 00:37:10 +0500
Subject: [PATCH] fix(python): graceful handling of empty result sets in hybrid search (#3030)

## Problem

When applying hard filters that result in zero matches, hybrid search
crashes with `IndexError: list index out of range` during reranking. This
happens because empty result tables are passed through the full reranker
pipeline, which expects at least one result.

Traceback from the issue:
```
lancedb/query.py: in _combine_hybrid_results
    results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)
lancedb/rerankers/answerdotai.py: in rerank_hybrid
    combined_results = self._rerank(combined_results, query)
...
IndexError: list index out of range
```

## Fix

Added an early return in `_combine_hybrid_results` when both vector and
FTS results are empty. Instead of passing empty tables through
normalization, reranking, and score restoration (which can fail in various
ways), we now build a properly-typed empty result table with the
`_relevance_score` column and return it directly.

## Test

Added `test_empty_hybrid_result_reranker` that exercises
`_combine_hybrid_results` directly with empty vector and FTS tables,
verifying:
- Returns empty table with correct schema
- Includes `_relevance_score` column
- Respects `with_row_ids` flag

Closes #2425
---
 python/python/lancedb/query.py        | 20 ++++++++
 python/python/tests/test_rerankers.py | 72 +++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 926b51527..01f8328f7 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -1782,6 +1782,26 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         vector_results = LanceHybridQueryBuilder._rank(vector_results, "_distance")
         fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")
 
+        # If both result sets are empty (e.g. after hard filtering),
+        # return early to avoid errors in reranking or score restoration.
+        if vector_results.num_rows == 0 and fts_results.num_rows == 0:
+            # Build a minimal empty table with the _relevance_score column
+            combined_schema = pa.unify_schemas(
+                [vector_results.schema, fts_results.schema],
+            )
+            empty = pa.table(
+                {
+                    col: pa.array([], type=combined_schema.field(col).type)
+                    for col in combined_schema.names
+                }
+            )
+            empty = empty.append_column(
+                "_relevance_score", pa.array([], type=pa.float32())
+            )
+            if not with_row_ids and "_rowid" in empty.column_names:
+                empty = empty.drop(["_rowid"])
+            return empty
+
         original_distances = None
         original_scores = None
         original_distance_row_ids = None
diff --git a/python/python/tests/test_rerankers.py b/python/python/tests/test_rerankers.py
index 27467c8e0..cfbaca204 100644
--- a/python/python/tests/test_rerankers.py
+++ b/python/python/tests/test_rerankers.py
@@ -531,6 +531,78 @@ def test_empty_result_reranker():
         )
 
 
+def test_empty_hybrid_result_reranker():
+    """Test that hybrid search with empty results after filtering doesn't crash.
+
+    Regression test for https://github.com/lancedb/lancedb/issues/2425
+    """
+    from lancedb.query import LanceHybridQueryBuilder
+
+    # Simulate empty vector and FTS results with the expected schema
+    vector_schema = pa.schema(
+        [
+            ("text", pa.string()),
+            ("vector", pa.list_(pa.float32(), 4)),
+            ("_rowid", pa.uint64()),
+            ("_distance", pa.float32()),
+        ]
+    )
+    fts_schema = pa.schema(
+        [
+            ("text", pa.string()),
+            ("vector", pa.list_(pa.float32(), 4)),
+            ("_rowid", pa.uint64()),
+            ("_score", pa.float32()),
+        ]
+    )
+    empty_vector = pa.table(
+        {
+            "text": pa.array([], type=pa.string()),
+            "vector": pa.array([], type=pa.list_(pa.float32(), 4)),
+            "_rowid": pa.array([], type=pa.uint64()),
+            "_distance": pa.array([], type=pa.float32()),
+        },
+        schema=vector_schema,
+    )
+    empty_fts = pa.table(
+        {
+            "text": pa.array([], type=pa.string()),
+            "vector": pa.array([], type=pa.list_(pa.float32(), 4)),
+            "_rowid": pa.array([], type=pa.uint64()),
+            "_score": pa.array([], type=pa.float32()),
+        },
+        schema=fts_schema,
+    )
+
+    for reranker in [LinearCombinationReranker(), RRFReranker()]:
+        result = LanceHybridQueryBuilder._combine_hybrid_results(
+            fts_results=empty_fts,
+            vector_results=empty_vector,
+            norm="score",
+            fts_query="nonexistent query",
+            reranker=reranker,
+            limit=10,
+            with_row_ids=False,
+        )
+        assert len(result) == 0
+        assert "_relevance_score" in result.column_names
+        assert "_rowid" not in result.column_names
+
+    # Also test with with_row_ids=True
+    result = LanceHybridQueryBuilder._combine_hybrid_results(
+        fts_results=empty_fts,
+        vector_results=empty_vector,
+        norm="score",
+        fts_query="nonexistent query",
+        reranker=LinearCombinationReranker(),
+        limit=10,
+        with_row_ids=True,
+    )
+    assert len(result) == 0
+    assert "_relevance_score" in result.column_names
+    assert "_rowid" in result.column_names
+
+
 @pytest.mark.parametrize("use_tantivy", [True, False])
 def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
     pytest.importorskip("sentence_transformers")