From 3a200d77efbb36840d66648c37f4d80622539fad Mon Sep 17 00:00:00 2001
From: marca116
Date: Tue, 17 Mar 2026 00:48:42 -0400
Subject: [PATCH] fix: pre-filtering on hybrid search (#3096)

When using hybrid search with a where filter, the prefilter argument is
silently inverted. Passing prefilter=True actually performs post-filtering,
and prefilter=False actually performs pre-filtering.
---
 python/python/lancedb/query.py           |  4 +-
 python/python/tests/test_hybrid_query.py | 54 ++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 8d256e9a4..66ff03a4a 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -2205,8 +2205,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
             self._vector_query.select(self._columns)
             self._fts_query.select(self._columns)
         if self._where:
-            self._vector_query.where(self._where, self._postfilter)
-            self._fts_query.where(self._where, self._postfilter)
+            self._vector_query.where(self._where, not self._postfilter)
+            self._fts_query.where(self._where, not self._postfilter)
         if self._with_row_id:
             self._vector_query.with_row_id(True)
             self._fts_query.with_row_id(True)
diff --git a/python/python/tests/test_hybrid_query.py b/python/python/tests/test_hybrid_query.py
index bb6f2befc..a9d89c0f0 100644
--- a/python/python/tests/test_hybrid_query.py
+++ b/python/python/tests/test_hybrid_query.py
@@ -177,6 +177,60 @@ async def test_analyze_plan(table: AsyncTable):
     assert "metrics=" in res
 
 
+@pytest.fixture
+def table_with_id(tmpdir_factory) -> Table:
+    tmp_path = str(tmpdir_factory.mktemp("data"))
+    db = lancedb.connect(tmp_path)
+    data = pa.table(
+        {
+            "id": pa.array([1, 2, 3, 4], type=pa.int64()),
+            "text": pa.array(["a", "b", "cat", "dog"]),
+            "vector": pa.array(
+                [[0.1, 0.1], [2, 2], [-0.1, -0.1], [0.5, -0.5]],
+                type=pa.list_(pa.float32(), list_size=2),
+            ),
+        }
+    )
+    table = db.create_table("test_with_id", data)
+    table.create_fts_index("text", with_position=False, use_tantivy=False)
+    return table
+
+
+def test_hybrid_prefilter_explain_plan(table_with_id: Table):
+    """
+    Verify that the prefilter logic is not inverted in LanceHybridQueryBuilder.
+    """
+    plan_prefilter = (
+        table_with_id.search(query_type="hybrid")
+        .vector([0.0, 0.0])
+        .text("dog")
+        .where("id = 1", prefilter=True)
+        .limit(2)
+        .explain_plan(verbose=True)
+    )
+
+    plan_postfilter = (
+        table_with_id.search(query_type="hybrid")
+        .vector([0.0, 0.0])
+        .text("dog")
+        .where("id = 1", prefilter=False)
+        .limit(2)
+        .explain_plan(verbose=True)
+    )
+
+    # prefilter=True: filter is pushed into the LanceRead scan.
+    # The FTS sub-plan exposes this as "full_filter=id = Int64(1)" inside LanceRead.
+    assert "full_filter=id = Int64(1)" in plan_prefilter, (
+        f"Should push the filter into the scan.\nPlan:\n{plan_prefilter}"
+    )
+
+    # prefilter=False: filter is applied as a separate FilterExec after the search.
+    # The filter must NOT be embedded in the scan.
+    assert "full_filter=id = Int64(1)" not in plan_postfilter, (
+        f"Should NOT push the filter into the scan.\nPlan:\n{plan_postfilter}"
+    )
+
+
 def test_normalize_scores():
     cases = [
         (pa.array([0.1, 0.4]), pa.array([0.0, 1.0])),