From 3bc6d0ee8250dae62f914572dc2c346a0b083aed Mon Sep 17 00:00:00 2001 From: Lu Qiu Date: Wed, 28 May 2025 12:09:47 -0700 Subject: [PATCH] Support bypass_vector_index --- python/python/lancedb/query.py | 47 +++++++++++++++++---------- python/python/lancedb/remote/table.py | 28 ++++++++++++---- python/python/tests/test_remote_db.py | 21 +++++++++++- 3 files changed, 71 insertions(+), 25 deletions(-) diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index f729902e..d8289884 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -119,6 +119,8 @@ class Query(pydantic.BaseModel): fast_search: bool = False + bypass_vector_index: Optional[bool] = None + class LanceQueryBuilder(ABC): """An abstract query builder. Subclasses are defined for vector search, @@ -127,14 +129,15 @@ class LanceQueryBuilder(ABC): @classmethod def create( - cls, - table: "Table", - query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]], - query_type: str, - vector_column_name: str, - ordering_field_name: Optional[str] = None, - fts_columns: Union[str, List[str]] = [], - fast_search: bool = False, + cls, + table: "Table", + query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]], + query_type: str, + vector_column_name: str, + ordering_field_name: Optional[str] = None, + fts_columns: Union[str, List[str]] = [], + fast_search: bool = False, + bypass_vector_index: Optional[bool] = None, ) -> LanceQueryBuilder: """ Create a query builder based on the given query and query type. @@ -153,6 +156,8 @@ class LanceQueryBuilder(ABC): The name of the vector column to use for vector search. fast_search: bool Skip flat search of unindexed data. + bypass_vector_index: Optional[bool] + Bypass the vector index and use a brute force search. """ # Check hybrid search first as it supports empty query pattern if query_type == "hybrid": @@ -195,7 +200,12 @@ class LanceQueryBuilder(ABC): raise TypeError(f"Unsupported query type: {type(query)}") return LanceVectorQueryBuilder( - table, query, vector_column_name, str_query, fast_search + table, + query, + vector_column_name, + str_query, + fast_search, + bypass_vector_index, ) @classmethod @@ -557,12 +567,13 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): """ def __init__( - self, - table: "Table", - query: Union[np.ndarray, list, "PIL.Image.Image"], - vector_column: str, - str_query: Optional[str] = None, - fast_search: bool = False, + self, + table: "Table", + query: Union[np.ndarray, list, "PIL.Image.Image"], + vector_column: str, + str_query: Optional[str] = None, + fast_search: bool = False, + bypass_vector_index: Optional[bool] = None, ): super().__init__(table) self._query = query @@ -574,6 +585,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): self._reranker = None self._str_query = str_query self._fast_search = fast_search + self._bypass_vector_index = bypass_vector_index def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder: """Set the distance metric to use. @@ -697,6 +709,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): with_row_id=self._with_row_id, offset=self._offset, fast_search=self._fast_search, + bypass_vector_index=self._bypass_vector_index, ef=self._ef, ) result_set = self._table._execute_query(query, batch_size) @@ -728,7 +741,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): return self def rerank( - self, reranker: Reranker, query_string: Optional[str] = None + self, reranker: Reranker, query_string: Optional[str] = None ) -> LanceVectorQueryBuilder: """Rerank the results using the specified reranker. @@ -947,7 +960,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder): def _validate_fts_index(self): if self._table._get_fts_index_path() is None: raise ValueError( - "Please create a full-text search index " "to perform hybrid search." + "Please create a full-text search index to perform hybrid search." ) def _validate_query(self, query): diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index c65e308f..16c370dd 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -234,8 +234,11 @@ class RemoteTable(Table): query_type: str = "vector", vector_column_name: Optional[str] = None, fast_search: bool = False, + bypass_vector_index: Optional[bool] = None, ) -> LanceVectorQueryBuilder: - return self.search(query, query_type, vector_column_name, fast_search) + return self.search( + query, query_type, vector_column_name, fast_search, bypass_vector_index + ) def search( self, @@ -243,6 +246,7 @@ class RemoteTable(Table): query_type: str = "vector", vector_column_name: Optional[str] = None, fast_search: bool = False, + bypass_vector_index: Optional[bool] = None, ) -> LanceVectorQueryBuilder: """Create a search query to find the nearest neighbors of the given query vector. We currently support [vector search][search] @@ -294,6 +298,15 @@ class RemoteTable(Table): search performance but search results will not include unindexed data. - *default False*. + + bypass_vector_index: bool, optional + If True, the query will bypass the vector index and perform a full scan. + An exhaustive (flat) search will be performed. The query vector will + be compared to every vector in the table. At high scales this can be + expensive. However, this is often still useful. For example, skipping + the vector index can give you ground truth results which you can use to + calculate your recall to select an appropriate value for nprobes. + Returns ------- LanceQueryBuilder @@ -316,6 +329,7 @@ class RemoteTable(Table): query_type, vector_column_name=vector_column_name, fast_search=fast_search, + bypass_vector_index=bypass_vector_index, ) def _execute_query( @@ -377,9 +391,9 @@ class RemoteTable(Table): params["on"] = merge._on[0] params["when_matched_update_all"] = str(merge._when_matched_update_all).lower() if merge._when_matched_update_all_condition is not None: - params[ - "when_matched_update_all_filt" - ] = merge._when_matched_update_all_condition + params["when_matched_update_all_filt"] = ( + merge._when_matched_update_all_condition + ) params["when_not_matched_insert_all"] = str( merge._when_not_matched_insert_all ).lower() @@ -387,9 +401,9 @@ class RemoteTable(Table): merge._when_not_matched_by_source_delete ).lower() if merge._when_not_matched_by_source_condition is not None: - params[ - "when_not_matched_by_source_delete_filt" - ] = merge._when_not_matched_by_source_condition + params["when_not_matched_by_source_delete_filt"] = ( + merge._when_not_matched_by_source_condition + ) self._conn._client.post( f"/v1/table/{self._name}/merge_insert/", diff --git a/python/python/tests/test_remote_db.py b/python/python/tests/test_remote_db.py index e321434e..d86c2464 100644 --- a/python/python/tests/test_remote_db.py +++ b/python/python/tests/test_remote_db.py @@ -57,4 +57,23 @@ def test_fast_search_query_with_filter(): table = conn["test"] table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))]) - print(table.query([0, 0], fast_search=True).select(["vector"]).where("foo == bar").to_arrow()) + print( + table.query([0, 0], fast_search=True) + .select(["vector"]) + .where("foo == bar") + .to_arrow() + ) + + +def test_bypass_vector_query_with_filter(): + conn = lancedb.connect("db://client-will-be-injected", api_key="fake") + setattr(conn, "_client", FakeLanceDBClient()) + + table = conn["test"] + table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))]) + print( + table.query([0, 0], bypass_vector_index=True) + .select(["vector"]) + .where("foo == bar") + .to_arrow() + )