mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 06:39:57 +00:00
1375 lines
49 KiB
Python
1375 lines
49 KiB
Python
# Copyright 2023 LanceDB Developers
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from __future__ import annotations
|
|
|
|
from abc import ABC, abstractmethod
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from pathlib import Path
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Dict,
|
|
List,
|
|
Literal,
|
|
Optional,
|
|
Tuple,
|
|
Type,
|
|
Union,
|
|
)
|
|
|
|
import deprecation
|
|
import numpy as np
|
|
import pyarrow as pa
|
|
import pyarrow.fs as pa_fs
|
|
import pydantic
|
|
|
|
from . import __version__
|
|
from .arrow import AsyncRecordBatchReader
|
|
from .common import VEC
|
|
from .rerankers.base import Reranker
|
|
from .rerankers.linear_combination import LinearCombinationReranker
|
|
from .util import fs_from_uri, safe_import_pandas
|
|
|
|
if TYPE_CHECKING:
|
|
import PIL
|
|
import polars as pl
|
|
|
|
from ._lancedb import Query as LanceQuery
|
|
from ._lancedb import VectorQuery as LanceVectorQuery
|
|
from .pydantic import LanceModel
|
|
from .table import Table
|
|
|
|
pd = safe_import_pandas()
|
|
|
|
|
|
class Query(pydantic.BaseModel):
    """The LanceDB Query

    Attributes
    ----------
    vector_column : Optional[str]
        the name of the vector column to search against; None means
        the table's default vector column
    vector : List[float]
        the vector to search for
    filter : Optional[str]
        sql filter to refine the query with, optional
    prefilter : bool
        if True then apply the filter before vector search
    k : int
        top k results to return
    metric : str
        the distance metric between a pair of vectors,

        can support L2 (default), Cosine and Dot.
        [metric definitions][search]
    columns : Optional[List[str]]
        which columns to return in the results
    nprobes : int
        The number of probes used - optional

        - A higher number makes search more accurate but also slower.

        - See discussion in [Querying an ANN Index][querying-an-ann-index] for
          tuning advice.
    refine_factor : Optional[int]
        Refine the results by reading extra elements and re-ranking them in memory.

        - A higher number makes search more accurate but also slower.

        - See discussion in [Querying an ANN Index][querying-an-ann-index] for
          tuning advice.
    with_row_id : bool
        if True, return the internal `_rowid` column in the results
    """

    # name of the vector column to search; None selects the table default
    vector_column: Optional[str] = None

    # vector to search for
    # (a list of lists is used for batched / multi-vector queries)
    vector: Union[List[float], List[List[float]]]

    # sql filter to refine the query with
    filter: Optional[str] = None

    # if True then apply the filter before vector search
    prefilter: bool = False

    # top k results to return
    k: int

    # distance metric ("L2", "cosine", or "dot")
    metric: str = "L2"

    # which columns to return in the results; a dict maps output column
    # names to SQL expressions computed over the table columns
    columns: Optional[Union[List[str], Dict[str, str]]] = None

    # optional query parameters for tuning the results,
    # e.g. `{"nprobes": "10", "refine_factor": "10"}`
    nprobes: int = 10

    # Refine factor.
    refine_factor: Optional[int] = None

    # if True, include the internal `_rowid` column in the results
    with_row_id: bool = False
|
|
|
class LanceQueryBuilder(ABC):
    """An abstract query builder. Subclasses are defined for vector search,
    full text search, hybrid, and plain SQL filtering.
    """

    @classmethod
    def create(
        cls,
        table: "Table",
        query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
        query_type: str,
        vector_column_name: str,
        ordering_field_name: Optional[str] = None,
    ) -> LanceQueryBuilder:
        """
        Create a query builder based on the given query and query type.

        Parameters
        ----------
        table: Table
            The table to query.
        query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]]
            The query to use. If None, an empty query builder is returned
            which performs simple SQL filtering.
        query_type: str
            The type of query to perform. One of "vector", "fts", "hybrid", or "auto".
            If "auto", the query type is inferred based on the query.
        vector_column_name: str
            The name of the vector column to use for vector search.
        ordering_field_name: Optional[str]
            The field used to order full text search results, if any.
        """
        if query is None:
            return LanceEmptyQueryBuilder(table)

        if query_type == "hybrid":
            # hybrid fts and vector query
            return LanceHybridQueryBuilder(table, query, vector_column_name)

        # remember the string query for reranking purpose
        str_query = query if isinstance(query, str) else None

        # convert "auto" query_type to "vector", "fts"
        # or "hybrid" and convert the query to vector if needed
        query, query_type = cls._resolve_query(
            table, query, query_type, vector_column_name
        )

        if query_type == "hybrid":
            return LanceHybridQueryBuilder(table, query, vector_column_name)

        if isinstance(query, str):
            # fts
            return LanceFtsQueryBuilder(
                table, query, ordering_field_name=ordering_field_name
            )

        # vector search: normalize the query to a float32 numpy array
        if isinstance(query, list):
            query = np.array(query, dtype=np.float32)
        elif isinstance(query, np.ndarray):
            query = query.astype(np.float32)
        else:
            raise TypeError(f"Unsupported query type: {type(query)}")

        return LanceVectorQueryBuilder(table, query, vector_column_name, str_query)

    @classmethod
    def _resolve_query(cls, table, query, query_type, vector_column_name):
        """Resolve an "auto" query_type to a concrete one and, for vector
        queries, convert the query to an embedding if needed.

        Returns the (possibly converted) query and the resolved query type.
        """
        # If query_type is fts, then query must be a string.
        # otherwise raise TypeError
        if query_type == "fts":
            if not isinstance(query, str):
                raise TypeError(f"'fts' queries must be a string: {type(query)}")
            return query, query_type
        elif query_type == "vector":
            query = cls._query_to_vector(table, query, vector_column_name)
            return query, query_type
        elif query_type == "auto":
            if isinstance(query, (list, np.ndarray)):
                return query, "vector"
            if isinstance(query, tuple):
                # a (vector, text) pair signals a hybrid query
                return query, "hybrid"
            else:
                # a registered embedding function means text can be embedded,
                # so treat the query as a vector search; otherwise fall back
                # to full text search
                conf = table.embedding_functions.get(vector_column_name)
                if conf is not None:
                    query = conf.function.compute_query_embeddings_with_retry(query)[0]
                    return query, "vector"
                else:
                    return query, "fts"
        else:
            raise ValueError(
                f"Invalid query_type, must be 'vector', 'fts', or 'auto': {query_type}"
            )

    @classmethod
    def _query_to_vector(cls, table, query, vector_column_name):
        """Convert a non-vector query into an embedding using the table's
        registered embedding function for `vector_column_name`.

        Raises
        ------
        ValueError
            If no embedding function is registered for the column.
        """
        if isinstance(query, (list, np.ndarray)):
            return query
        conf = table.embedding_functions.get(vector_column_name)
        if conf is not None:
            return conf.function.compute_query_embeddings_with_retry(query)[0]
        else:
            msg = f"No embedding function for {vector_column_name}"
            raise ValueError(msg)

    def __init__(self, table: "Table"):
        self._table = table
        self._limit = 10
        self._columns = None
        self._where = None
        # initialized here so subclasses can read it even before where()
        # is called; where() may overwrite it
        self._prefilter = False
        self._with_row_id = False

    @deprecation.deprecated(
        deprecated_in="0.3.1",
        removed_in="0.4.0",
        current_version=__version__,
        details="Use to_pandas() instead",
    )
    def to_df(self) -> "pd.DataFrame":
        """
        *Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.*

        Execute the query and return the results as a pandas DataFrame.
        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vector.
        """
        return self.to_pandas()

    def to_pandas(self, flatten: Optional[Union[int, bool]] = None) -> "pd.DataFrame":
        """
        Execute the query and return the results as a pandas DataFrame.
        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vector.

        Parameters
        ----------
        flatten: Optional[Union[int, bool]]
            If flatten is True, flatten all nested columns.
            If flatten is an integer, flatten the nested columns up to the
            specified depth.
            If unspecified, do not flatten the nested columns.
        """
        tbl = self.to_arrow()
        if flatten is True:
            # flatten once, then keep flattening while any struct column remains
            tbl = tbl.flatten()
            while any(pa.types.is_struct(col.type) for col in tbl.schema):
                tbl = tbl.flatten()
        elif isinstance(flatten, int):
            if flatten <= 0:
                raise ValueError(
                    "Please specify a positive integer for flatten or the boolean "
                    "value `True`"
                )
            while flatten > 0:
                tbl = tbl.flatten()
                flatten -= 1
        return tbl.to_pandas()

    @abstractmethod
    def to_arrow(self) -> pa.Table:
        """
        Execute the query and return the results as an
        [Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).

        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vectors.
        """
        raise NotImplementedError

    def to_list(self) -> List[dict]:
        """
        Execute the query and return the results as a list of dictionaries.

        Each list entry is a dictionary with the selected column names as keys,
        or all table columns if `select` is not called. The vector and the "_distance"
        fields are returned whether or not they're explicitly selected.
        """
        return self.to_arrow().to_pylist()

    def to_pydantic(self, model: Type[LanceModel]) -> List[LanceModel]:
        """Return the table as a list of pydantic models.

        Parameters
        ----------
        model: Type[LanceModel]
            The pydantic model to use.

        Returns
        -------
        List[LanceModel]
        """
        return [
            model(**{k: v for k, v in row.items() if k in model.field_names()})
            for row in self.to_arrow().to_pylist()
        ]

    def to_polars(self) -> "pl.DataFrame":
        """
        Execute the query and return the results as a Polars DataFrame.
        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vector.
        """
        import polars as pl

        return pl.from_arrow(self.to_arrow())

    def limit(self, limit: Union[int, None]) -> LanceQueryBuilder:
        """Set the maximum number of results to return.

        Parameters
        ----------
        limit: int
            The maximum number of results to return.
            By default the query is limited to the first 10.
            Call this method and pass 0, a negative value,
            or None to remove the limit.
            *WARNING* if you have a large dataset, removing
            the limit can potentially result in reading a
            large amount of data into memory and cause
            out of memory issues.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        # 0, negative, and None all mean "no limit"
        if limit is None or limit <= 0:
            self._limit = None
        else:
            self._limit = limit
        return self

    def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
        """Set the columns to return.

        Parameters
        ----------
        columns: list of str, or dict of str to str default None
            List of column names to be fetched.
            Or a dictionary of column names to SQL expressions.
            All columns are fetched if None or unspecified.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        if isinstance(columns, (list, dict)):
            self._columns = columns
        else:
            raise ValueError("columns must be a list or a dictionary")
        return self

    def where(self, where: str, prefilter: bool = False) -> LanceQueryBuilder:
        """Set the where clause.

        Parameters
        ----------
        where: str
            The where clause which is a valid SQL where clause. See
            `Lance filter pushdown <https://lancedb.github.io/lance/read_and_write.html#filter-push-down>`_
            for valid SQL expressions.
        prefilter: bool, default False
            If True, apply the filter before vector search, otherwise the
            filter is applied on the result of vector search.
            This feature is **EXPERIMENTAL** and may be removed and modified
            without warning in the future.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        self._where = where
        self._prefilter = prefilter
        return self

    def with_row_id(self, with_row_id: bool) -> LanceQueryBuilder:
        """Set whether to return row ids.

        Parameters
        ----------
        with_row_id: bool
            If True, return _rowid column in the results.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        self._with_row_id = with_row_id
        return self
|
|
class LanceVectorQueryBuilder(LanceQueryBuilder):
    """
    Examples
    --------
    >>> import lancedb
    >>> data = [{"vector": [1.1, 1.2], "b": 2},
    ...         {"vector": [0.5, 1.3], "b": 4},
    ...         {"vector": [0.4, 0.4], "b": 6},
    ...         {"vector": [0.4, 0.4], "b": 10}]
    >>> db = lancedb.connect("./.lancedb")
    >>> table = db.create_table("my_table", data=data)
    >>> (table.search([0.4, 0.4])
    ...       .metric("cosine")
    ...       .where("b < 10")
    ...       .select(["b", "vector"])
    ...       .limit(2)
    ...       .to_pandas())
       b      vector  _distance
    0  6  [0.4, 0.4]        0.0
    """

    def __init__(
        self,
        table: "Table",
        query: Union[np.ndarray, list, "PIL.Image.Image"],
        vector_column: str,
        str_query: Optional[str] = None,
    ):
        super().__init__(table)
        self._query = query
        self._metric = "L2"
        self._nprobes = 20
        self._refine_factor = None
        self._vector_column = vector_column
        self._prefilter = False
        self._reranker = None
        # original text query, kept so a reranker can be applied later
        self._str_query = str_query

    def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
        """Set the distance metric to use.

        Parameters
        ----------
        metric: "L2" or "cosine"
            The distance metric to use. By default "L2" is used.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._metric = metric
        return self

    def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
        """Set the number of probes to use.

        Higher values will yield better recall (more likely to find vectors if
        they exist) at the expense of latency.

        See discussion in [Querying an ANN Index][querying-an-ann-index] for
        tuning advice.

        Parameters
        ----------
        nprobes: int
            The number of probes to use.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._nprobes = nprobes
        return self

    def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
        """Set the refine factor to use, increasing the number of vectors sampled.

        As an example, a refine factor of 2 will sample 2x as many vectors as
        requested, re-ranks them, and returns the top half most relevant results.

        See discussion in [Querying an ANN Index][querying-an-ann-index] for
        tuning advice.

        Parameters
        ----------
        refine_factor: int
            The refine factor to use.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._refine_factor = refine_factor
        return self

    def to_arrow(self) -> pa.Table:
        """
        Execute the query and return the results as an
        [Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).

        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vectors.
        """
        return self.to_batches().read_all()

    def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
        """
        Execute the query and return the result as a RecordBatchReader object.

        Parameters
        ----------
        batch_size: int
            The maximum number of selected records in a RecordBatch object.

        Returns
        -------
        pa.RecordBatchReader
        """
        # normalize the query into plain Python lists for the Query model
        vector = self._query if isinstance(self._query, list) else self._query.tolist()
        if isinstance(vector[0], np.ndarray):
            # multi-vector query: convert each inner array as well
            vector = [v.tolist() for v in vector]
        query = Query(
            vector=vector,
            filter=self._where,
            prefilter=self._prefilter,
            k=self._limit,
            metric=self._metric,
            columns=self._columns,
            nprobes=self._nprobes,
            refine_factor=self._refine_factor,
            vector_column=self._vector_column,
            with_row_id=self._with_row_id,
        )
        result_set = self._table._execute_query(query, batch_size)
        if self._reranker is not None:
            # reranking needs the whole result in memory
            rs_table = result_set.read_all()
            result_set = self._reranker.rerank_vector(self._str_query, rs_table)
            # convert result_set back to RecordBatchReader
            result_set = pa.RecordBatchReader.from_batches(
                result_set.schema, result_set.to_batches()
            )

        return result_set

    def where(self, where: str, prefilter: bool = False) -> LanceVectorQueryBuilder:
        """Set the where clause.

        Parameters
        ----------
        where: str
            The where clause which is a valid SQL where clause. See
            `Lance filter pushdown <https://lancedb.github.io/lance/read_and_write.html#filter-push-down>`_
            for valid SQL expressions.
        prefilter: bool, default False
            If True, apply the filter before vector search, otherwise the
            filter is applied on the result of vector search.
            This feature is **EXPERIMENTAL** and may be removed and modified
            without warning in the future.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        self._where = where
        self._prefilter = prefilter
        return self

    def rerank(
        self, reranker: Reranker, query_string: Optional[str] = None
    ) -> LanceVectorQueryBuilder:
        """Rerank the results using the specified reranker.

        Parameters
        ----------
        reranker: Reranker
            The reranker to use.

        query_string: Optional[str]
            The query to use for reranking. This needs to be specified explicitly here
            as the query used for vector search may already be vectorized and the
            reranker requires a string query.
            This is only required if the query used for vector search is not a string.
            Note: This doesn't yet support the case where the query is multimodal or a
            list of vectors.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        # validate before mutating state so a failed call leaves the
        # builder unchanged
        if self._str_query is None and query_string is None:
            raise ValueError(
                """
                The query used for vector search is not a string.
                In this case, the reranker query needs to be specified explicitly.
                """
            )
        if query_string is not None and not isinstance(query_string, str):
            raise ValueError("Reranking currently only supports string queries")
        self._reranker = reranker
        self._str_query = query_string if query_string is not None else self._str_query
        return self
|
|
class LanceFtsQueryBuilder(LanceQueryBuilder):
    """A builder for full text search for LanceDB."""

    def __init__(self, table: "Table", query: str, ordering_field_name: Optional[str] = None):
        super().__init__(table)
        # the raw text query
        self._query = query
        # when True, the query is quoted and run as a phrase query
        self._phrase_query = False
        # optional field used to order the tantivy search results
        self.ordering_field_name = ordering_field_name
        self._reranker = None

    def phrase_query(self, phrase_query: bool = True) -> LanceFtsQueryBuilder:
        """Set whether to use phrase query.

        Parameters
        ----------
        phrase_query: bool, default True
            If True, then the query will be wrapped in quotes and
            double quotes replaced by single quotes.

        Returns
        -------
        LanceFtsQueryBuilder
            The LanceFtsQueryBuilder object.
        """
        self._phrase_query = phrase_query
        return self

    def to_arrow(self) -> pa.Table:
        """Execute the full text search against the tantivy index and return
        the matching rows (with a "score" column) as a pyarrow Table.

        Raises
        ------
        ImportError
            If tantivy-py is not installed.
        FileNotFoundError
            If the FTS index has not been created yet.
        NotImplementedError
            If the index is not on a local filesystem.
        """
        try:
            import tantivy
        except ImportError:
            raise ImportError(
                "Please install tantivy-py `pip install tantivy` to use the full text search feature."  # noqa: E501
            )

        from .fts import search_index

        # get the index path
        index_path = self._table._get_fts_index_path()

        # Check that we are on local filesystem
        fs, _path = fs_from_uri(index_path)
        if not isinstance(fs, pa_fs.LocalFileSystem):
            raise NotImplementedError(
                "Full-text search is only supported on the local filesystem"
            )

        # check if the index exist
        if not Path(index_path).exists():
            raise FileNotFoundError(
                "Fts index does not exist. "
                "Please first call table.create_fts_index(['<field_names>']) to "
                "create the fts index."
            )
        # open the index
        index = tantivy.Index.open(index_path)
        # get the scores and doc ids
        query = self._query
        if self._phrase_query:
            # escape embedded double quotes, then quote the whole query so
            # tantivy treats it as a phrase
            query = query.replace('"', "'")
            query = f'"{query}"'
        row_ids, scores = search_index(
            index, query, self._limit, ordering_field=self.ordering_field_name
        )
        if len(row_ids) == 0:
            # no hits: return an empty table with just the score column
            empty_schema = pa.schema([pa.field("score", pa.float32())])
            return pa.Table.from_pylist([], schema=empty_schema)
        scores = pa.array(scores)
        # fetch the matching rows from the lance dataset by row id
        output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
        output_tbl = output_tbl.append_column("score", scores)
        # this needs to match vector search results which are uint64
        row_ids = pa.array(row_ids, type=pa.uint64())

        if self._where is not None:
            # apply the SQL filter post-hoc: tag each row with its position,
            # filter, then use the surviving positions to take from both the
            # result table and the row-id array
            tmp_name = "__lancedb__duckdb__indexer__"
            output_tbl = output_tbl.append_column(
                tmp_name, pa.array(range(len(output_tbl)))
            )
            try:
                # TODO would be great to have Substrait generate pyarrow compute
                # expressions or conversely have pyarrow support SQL expressions
                # using Substrait
                import duckdb

                # duckdb resolves `output_tbl` in the SQL via its
                # replacement scan of local Python variables
                indexer = duckdb.sql(
                    f"SELECT {tmp_name} FROM output_tbl WHERE {self._where}"
                ).to_arrow_table()[tmp_name]
                output_tbl = output_tbl.take(indexer).drop([tmp_name])
                row_ids = row_ids.take(indexer)

            except ImportError:
                # fallback: round-trip through a temporary lance dataset,
                # which can evaluate the SQL filter itself
                import tempfile

                import lance

                # TODO Use "memory://" instead once that's supported
                with tempfile.TemporaryDirectory() as tmp:
                    ds = lance.write_dataset(output_tbl, tmp)
                    output_tbl = ds.to_table(filter=self._where)
                    indexer = output_tbl[tmp_name]
                    row_ids = row_ids.take(indexer)
                    output_tbl = output_tbl.drop([tmp_name])

        if self._with_row_id:
            output_tbl = output_tbl.append_column("_rowid", row_ids)

        if self._reranker is not None:
            output_tbl = self._reranker.rerank_fts(self._query, output_tbl)
        return output_tbl

    def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
        """Rerank the results using the specified reranker.

        Parameters
        ----------
        reranker: Reranker
            The reranker to use.

        Returns
        -------
        LanceFtsQueryBuilder
            The LanceQueryBuilder object.
        """
        self._reranker = reranker
        return self
|
|
class LanceEmptyQueryBuilder(LanceQueryBuilder):
    """A query builder with no search query: performs plain SQL filtering,
    column selection, and limiting over the table."""

    def to_arrow(self) -> pa.Table:
        """Scan the underlying lance dataset and return the (optionally
        filtered and limited) rows as a pyarrow Table."""
        dataset = self._table.to_lance()
        scan_args = {
            "columns": self._columns,
            "filter": self._where,
            "limit": self._limit,
        }
        return dataset.to_table(**scan_args)
|
|
class LanceHybridQueryBuilder(LanceQueryBuilder):
    """
    A query builder that performs hybrid vector and full text search.
    Results are combined and reranked based on the specified reranker.
    By default, the results are reranked using the LinearCombinationReranker.

    To make the vector and fts results comparable, the scores are normalized.
    Instead of normalizing scores, the `normalize` parameter can be set to "rank"
    in the `rerank` method to convert the scores to ranks and then normalize them.
    """

    def __init__(self, table: "Table", query: str, vector_column: str):
        super().__init__(table)
        self._validate_fts_index()
        vector_query, fts_query = self._validate_query(query)
        self._fts_query = LanceFtsQueryBuilder(table, fts_query)
        vector_query = self._query_to_vector(table, vector_query, vector_column)
        self._vector_query = LanceVectorQueryBuilder(table, vector_query, vector_column)
        self._norm = "score"
        self._reranker = LinearCombinationReranker(weight=0.7, fill=1.0)

    def _validate_fts_index(self):
        """Raise if the table has no full-text search index."""
        if self._table._get_fts_index_path() is None:
            raise ValueError(
                "Please create a full-text search index " "to perform hybrid search."
            )

    def _validate_query(self, query):
        """Split a hybrid query into its (vector_query, fts_query) parts.

        A plain string is used for both components; a 2-tuple supplies them
        explicitly.
        """
        # Temp hack to support vectorized queries for hybrid search
        if isinstance(query, str):
            return query, query
        elif isinstance(query, tuple):
            if len(query) != 2:
                raise ValueError(
                    "The query must be a tuple of (vector_query, fts_query)."
                )
            if not isinstance(query[0], (list, np.ndarray, pa.Array, pa.ChunkedArray)):
                raise ValueError(f"The vector query must be one of {VEC}.")
            if not isinstance(query[1], str):
                raise ValueError("The fts query must be a string.")
            return query[0], query[1]
        else:
            raise ValueError(
                "The query must be either a string or a tuple of (vector, string)."
            )

    def to_arrow(self) -> pa.Table:
        """Run the fts and vector searches concurrently, normalize their
        scores, rerank the combined results, and return a pyarrow Table."""
        # run both searches in parallel; each carries row ids so the
        # reranker can align results
        with ThreadPoolExecutor() as executor:
            fts_future = executor.submit(self._fts_query.with_row_id(True).to_arrow)
            vector_future = executor.submit(
                self._vector_query.with_row_id(True).to_arrow
            )
            fts_results = fts_future.result()
            vector_results = vector_future.result()

        # convert to ranks first if needed
        if self._norm == "rank":
            vector_results = self._rank(vector_results, "_distance")
            fts_results = self._rank(fts_results, "score")
        # normalize the scores to be between 0 and 1, 0 being most relevant
        vector_results = self._normalize_scores(vector_results, "_distance")

        # In fts higher scores represent relevance. Not inverting them here as
        # rerankers might need to preserve this score to support `return_score="all"`
        fts_results = self._normalize_scores(fts_results, "score")

        results = self._reranker.rerank_hybrid(
            self._fts_query._query, vector_results, fts_results
        )

        if not isinstance(results, pa.Table):  # Enforce type
            raise TypeError(
                f"rerank_hybrid must return a pyarrow.Table, got {type(results)}"
            )

        # apply limit after reranking
        # NOTE(review): unlike the sub-builders, self._limit is not normalized
        # here (limit(0) slices to 0 rows rather than removing the limit) —
        # preserved as-is
        results = results.slice(length=self._limit)

        if not self._with_row_id:
            results = results.drop(["_rowid"])
        return results

    def _rank(self, results: pa.Table, column: str, ascending: bool = True):
        """Replace `column` in `results` with 1-based ranks (as float32).

        Ties are broken by argsort order; `ascending=False` ranks the
        largest value first.
        """
        if len(results) == 0:
            return results
        scores = results.column(column).to_numpy()
        order = np.argsort(scores)
        if not ascending:
            order = order[::-1]
        ranks = np.empty_like(order)
        ranks[order] = np.arange(len(scores)) + 1
        # replace the score column with the ranks
        col_idx = results.column_names.index(column)
        results = results.set_column(
            col_idx, column, pa.array(ranks, type=pa.float32())
        )
        return results

    def _normalize_scores(self, results: pa.Table, column: str, invert=False):
        """Min-max normalize `column` in `results` to [0, 1] (float32).

        When all values are equal the column normalizes to all zeros,
        avoiding a 0/0 NaN. `invert=True` flips the scale (1 - score).
        """
        if len(results) == 0:
            return results
        scores = results.column(column).to_numpy()
        # normalize the scores by subtracting the min and dividing by the range
        hi, lo = np.max(scores), np.min(scores)
        rng = hi - lo
        if rng == 0:
            # all scores identical (including all-zero): map them to 0
            # rather than dividing by zero
            scores = np.zeros_like(scores, dtype=np.float64)
        else:
            scores = (scores - lo) / rng
        if invert:
            scores = 1 - scores
        # replace the score column with the normalized values
        col_idx = results.column_names.index(column)
        results = results.set_column(
            col_idx, column, pa.array(scores, type=pa.float32())
        )
        return results

    def rerank(
        self,
        normalize="score",
        reranker: Optional[Reranker] = None,
    ) -> LanceHybridQueryBuilder:
        """
        Rerank the hybrid search results using the specified reranker. The reranker
        must be an instance of Reranker class.

        Parameters
        ----------
        normalize: str, default "score"
            The method to normalize the scores. Can be "rank" or "score". If "rank",
            the scores are converted to ranks and then normalized. If "score", the
            scores are normalized directly.
        reranker: Reranker, default None
            The reranker to use. Must be an instance of Reranker class. If None,
            a LinearCombinationReranker(weight=0.7, fill=1.0) is used.
        Returns
        -------
        LanceHybridQueryBuilder
            The LanceHybridQueryBuilder object.
        """
        if normalize not in ["rank", "score"]:
            raise ValueError("normalize must be 'rank' or 'score'.")
        # use a None sentinel instead of a shared default instance so each
        # call gets its own reranker
        if reranker is None:
            reranker = LinearCombinationReranker(weight=0.7, fill=1.0)
        if not isinstance(reranker, Reranker):
            raise ValueError("reranker must be an instance of Reranker class.")

        self._norm = normalize
        self._reranker = reranker

        return self

    def limit(self, limit: int) -> LanceHybridQueryBuilder:
        """
        Set the maximum number of results to return for both vector and fts search
        components.

        Parameters
        ----------
        limit: int
            The maximum number of results to return.

        Returns
        -------
        LanceHybridQueryBuilder
            The LanceHybridQueryBuilder object.
        """
        self._vector_query.limit(limit)
        self._fts_query.limit(limit)
        self._limit = limit

        return self

    def select(self, columns: list) -> LanceHybridQueryBuilder:
        """
        Set the columns to return for both vector and fts search.

        Parameters
        ----------
        columns: list
            The columns to return.

        Returns
        -------
        LanceHybridQueryBuilder
            The LanceHybridQueryBuilder object.
        """
        self._vector_query.select(columns)
        self._fts_query.select(columns)
        return self

    def where(self, where: str, prefilter: bool = False) -> LanceHybridQueryBuilder:
        """
        Set the where clause for both vector and fts search.

        Parameters
        ----------
        where: str
            The where clause which is a valid SQL where clause. See
            `Lance filter pushdown <https://lancedb.github.io/lance/read_and_write.html#filter-push-down>`_
            for valid SQL expressions.

        prefilter: bool, default False
            If True, apply the filter before vector search, otherwise the
            filter is applied on the result of vector search.

        Returns
        -------
        LanceHybridQueryBuilder
            The LanceHybridQueryBuilder object.
        """

        self._vector_query.where(where, prefilter=prefilter)
        self._fts_query.where(where)
        return self

    def metric(self, metric: Literal["L2", "cosine"]) -> LanceHybridQueryBuilder:
        """
        Set the distance metric to use for vector search.

        Parameters
        ----------
        metric: "L2" or "cosine"
            The distance metric to use. By default "L2" is used.

        Returns
        -------
        LanceHybridQueryBuilder
            The LanceHybridQueryBuilder object.
        """
        self._vector_query.metric(metric)
        return self

    def nprobes(self, nprobes: int) -> LanceHybridQueryBuilder:
        """
        Set the number of probes to use for vector search.

        Higher values will yield better recall (more likely to find vectors if
        they exist) at the expense of latency.

        Parameters
        ----------
        nprobes: int
            The number of probes to use.

        Returns
        -------
        LanceHybridQueryBuilder
            The LanceHybridQueryBuilder object.
        """
        self._vector_query.nprobes(nprobes)
        return self

    def refine_factor(self, refine_factor: int) -> LanceHybridQueryBuilder:
        """
        Refine the vector search results by reading extra elements and
        re-ranking them in memory.

        Parameters
        ----------
        refine_factor: int
            The refine factor to use.

        Returns
        -------
        LanceHybridQueryBuilder
            The LanceHybridQueryBuilder object.
        """
        self._vector_query.refine_factor(refine_factor)
        return self
|
|
|
class AsyncQueryBase(object):
    def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
        """
        Construct an AsyncQueryBase

        This method is not intended to be called directly. Instead, use the
        [AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
        """
        # The underlying native query object that all builder methods
        # delegate to.
        self._inner = inner

    def where(self, predicate: str) -> AsyncQuery:
        """
        Only return rows matching the given predicate

        The predicate should be supplied as an SQL query string.

        Examples
        --------

        >>> predicate = "x > 10"
        >>> predicate = "y > 0 AND y < 100"
        >>> predicate = "x > 5 OR y = 'test'"

        Filtering performance can often be improved by creating a scalar index
        on the filter column(s).
        """
        self._inner.where(predicate)
        return self

    def select(self, columns: Union[List[str], dict[str, str]]) -> AsyncQuery:
        """
        Return only the specified columns.

        By default a query will return all columns from the table. However, this can
        have a very significant impact on latency. LanceDb stores data in a columnar
        fashion. This
        means we can finely tune our I/O to select exactly the columns we need.

        As a best practice you should always limit queries to the columns that you need.
        If you pass in a list of column names then only those columns will be
        returned.

        You can also use this method to create new "dynamic" columns based on your
        existing columns. For example, you may not care about "a" or "b" but instead
        simply want "a + b". This is often seen in the SELECT clause of an SQL query
        (e.g. `SELECT a+b FROM my_table`).

        To create dynamic columns you can pass in a dict[str, str]. A column will be
        returned for each entry in the map. The key provides the name of the column.
        The value is an SQL string used to specify how the column is calculated.

        For example, an SQL query might state `SELECT a + b AS combined, c`. The
        equivalent input to this method would be `{"combined": "a + b", "c": "c"}`.

        Columns will always be returned in the order given, even if that order is
        different than the order used when adding the data.
        """
        if isinstance(columns, dict):
            # Dynamic columns: key is the output name, value is an SQL
            # expression.
            column_tuples = list(columns.items())
        else:
            try:
                # Plain column names become identity (name, name) pairs.
                column_tuples = [(c, c) for c in columns]
            except TypeError:
                # `from None` hides the unhelpful iteration traceback and
                # surfaces only the message describing accepted input types.
                raise TypeError(
                    "columns must be a list of column names or a dict"
                ) from None
        self._inner.select(column_tuples)
        return self

    def limit(self, limit: int) -> AsyncQuery:
        """
        Set the maximum number of results to return.

        By default, a plain search has no limit.  If this method is not
        called then every valid row from the table will be returned.
        """
        self._inner.limit(limit)
        return self

    async def to_batches(self) -> AsyncRecordBatchReader:
        """
        Execute the query and return the results as an Apache Arrow RecordBatchReader.
        """
        return AsyncRecordBatchReader(await self._inner.execute())

    async def to_arrow(self) -> pa.Table:
        """
        Execute the query and collect the results into an Apache Arrow Table.

        This method will collect all results into memory before returning.  If
        you expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches]
        """
        batch_iter = await self.to_batches()
        return pa.Table.from_batches(
            await batch_iter.read_all(), schema=batch_iter.schema
        )

    async def to_pandas(self) -> "pd.DataFrame":
        """
        Execute the query and collect the results into a pandas DataFrame.

        This method will collect all results into memory before returning.  If you
        expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
        pandas separately.

        Examples
        --------

        >>> import asyncio
        >>> from lancedb import connect_async
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
        ...     async for batch in await table.query().to_batches():
        ...         batch_df = batch.to_pandas()
        >>> asyncio.run(doctest_example())
        """
        return (await self.to_arrow()).to_pandas()
class AsyncQuery(AsyncQueryBase):
    def __init__(self, inner: LanceQuery):
        """
        Construct an AsyncQuery

        This method is not intended to be called directly. Instead, use the
        [AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
        """
        super().__init__(inner)
        self._inner = inner

    @classmethod
    def _query_vec_to_array(cls, vec: Union[VEC, Tuple]):
        """
        Coerce a user-supplied query vector into a pyarrow Array.

        Lists, tuples, and numpy arrays are converted with ``pa.array``;
        pyarrow Arrays pass through unchanged and ChunkedArrays are merged
        into a single chunk.
        """
        if isinstance(vec, list):
            return pa.array(vec)
        if isinstance(vec, np.ndarray):
            return pa.array(vec)
        if isinstance(vec, pa.Array):
            return vec
        if isinstance(vec, pa.ChunkedArray):
            return vec.combine_chunks()
        if isinstance(vec, tuple):
            return pa.array(vec)
        # We've checked everything we formally support in our typings
        # but, as a fallback, let pyarrow try and convert it anyway.
        # This can allow for some more exotic things like iterables
        return pa.array(vec)

    def nearest_to(
        self, query_vector: Optional[Union[VEC, Tuple]] = None
    ) -> AsyncVectorQuery:
        """
        Find the nearest vectors to the given query vector.

        This converts the query from a plain query to a vector query.

        This method will attempt to convert the input to the query vector
        expected by the embedding model.  If the input cannot be converted
        then an error will be thrown.

        By default, there is no embedding model, and the input should be
        something that can be converted to a pyarrow array of floats.  This
        includes lists, numpy arrays, and tuples.

        If there is only one vector column (a column whose data type is a
        fixed size list of floats) then the column does not need to be specified.
        If there is more than one vector column you must use
        [AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
        which column you would like to compare with.

        If no index has been created on the vector column then a vector query
        will perform a distance comparison between the query vector and every
        vector in the database and then sort the results.  This is sometimes
        called a "flat search"

        For small databases, with tens of thousands of vectors or less, this can
        be reasonably fast.  In larger databases you should create a vector index
        on the column.  If there is a vector index then an "approximate" nearest
        neighbor search (frequently called an ANN search) will be performed.  This
        search is much faster, but the results will be approximate.

        The query can be further parameterized using the returned builder.  There
        are various ANN search parameters that will let you fine tune your recall
        accuracy vs search latency.

        Vector searches always have a [limit][].  If `limit` has not been called then
        a default `limit` of 10 will be used.
        """
        return AsyncVectorQuery(
            self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
        )
class AsyncVectorQuery(AsyncQueryBase):
    def __init__(self, inner: LanceVectorQuery):
        """
        Construct an AsyncVectorQuery

        This method is not intended to be called directly.  Instead, create
        a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
        use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to] to convert to
        a vector query.  Or you can use
        [AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
        """
        super().__init__(inner)
        self._inner = inner

    def column(self, column: str) -> AsyncVectorQuery:
        """
        Set the vector column to query

        This controls which column is compared to the query vector supplied in
        the call to [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to].

        This parameter must be specified if the table has more than one column
        whose data type is a fixed-size-list of floats.
        """
        self._inner.column(column)
        return self

    def nprobes(self, nprobes: int) -> AsyncVectorQuery:
        """
        Set the number of partitions to search (probe)

        This argument is only used when the vector column has an IVF PQ index.
        If there is no index then this value is ignored.

        The IVF stage of IVF PQ divides the input into partitions (clusters) of
        related values.

        The partition whose centroids are closest to the query vector will be
        exhaustively searched to find matches.  This parameter controls how many
        partitions should be searched.

        Increasing this value will increase the recall of your query but will
        also increase the latency of your query.  The default value is 20.  This
        default is good for many cases but the best value to use will depend on
        your data and the recall that you need to achieve.

        For best results we recommend tuning this parameter with a benchmark against
        your actual data to find the smallest possible value that will still give
        you the desired recall.
        """
        self._inner.nprobes(nprobes)
        return self

    def refine_factor(self, refine_factor: int) -> AsyncVectorQuery:
        """
        A multiplier to control how many additional rows are taken during the refine
        step

        This argument is only used when the vector column has an IVF PQ index.
        If there is no index then this value is ignored.

        An IVF PQ index stores compressed (quantized) values.  The query vector is
        compared against these values and, since they are compressed, the comparison is
        inaccurate.

        This parameter can be used to refine the results.  It can both improve
        recall and correct the ordering of the nearest results.

        To refine results LanceDb will first perform an ANN search to find the nearest
        `limit` * `refine_factor` results.  In other words, if `refine_factor` is 3 and
        `limit` is the default (10) then the first 30 results will be selected.  LanceDb
        then fetches the full, uncompressed, values for these 30 results.  The results
        are then reordered by the true distance and only the nearest 10 are kept.

        Note: there is a difference between calling this method with a value of 1 and
        never calling this method at all.  Calling this method with any value will have
        an impact on your search latency.  When you call this method with a
        `refine_factor` of 1 then LanceDb still needs to fetch the full, uncompressed,
        values so that it can potentially reorder the results.

        Note: if this method is NOT called then the distances returned in the _distance
        column will be approximate distances based on the comparison of the quantized
        query vector and the quantized result vectors.  This can be considerably
        different than the true distance between the query vector and the actual
        uncompressed vector.
        """
        self._inner.refine_factor(refine_factor)
        return self

    def distance_type(self, distance_type: str) -> AsyncVectorQuery:
        """
        Set the distance metric to use

        When performing a vector search we try and find the "nearest" vectors according
        to some kind of distance metric.  This parameter controls which distance metric
        to use.  See @see {@link IvfPqOptions.distanceType} for more details on the
        different distance metrics available.

        Note: if there is a vector index then the distance type used MUST match the
        distance type used to train the vector index.  If this is not done then the
        results will be invalid.

        By default "l2" is used.
        """
        self._inner.distance_type(distance_type)
        return self

    def postfilter(self) -> AsyncVectorQuery:
        """
        If this is called then filtering will happen after the vector search instead of
        before.

        By default filtering will be performed before the vector search.  This is how
        filtering is typically understood to work.  This prefilter step does add some
        additional latency.  Creating a scalar index on the filter column(s) can
        often improve this latency.  However, sometimes a filter is too complex or
        scalar indices cannot be applied to the column.  In these cases postfiltering
        can be used instead of prefiltering to improve latency.

        Post filtering applies the filter to the results of the vector search.  This
        means we only run the filter on a much smaller set of data.  However, it can
        cause the query to return fewer than `limit` results (or even no results) if
        none of the nearest results match the filter.

        Post filtering happens during the "refine stage" (described in more detail in
        @see {@link VectorQuery#refineFactor}).  This means that setting a higher refine
        factor can often help restore some of the results lost by post filtering.
        """
        self._inner.postfilter()
        return self

    def bypass_vector_index(self) -> AsyncVectorQuery:
        """
        If this is called then any vector index is skipped

        An exhaustive (flat) search will be performed.  The query vector will
        be compared to every vector in the table.  At high scales this can be
        expensive.  However, this is often still useful.  For example, skipping
        the vector index can give you ground truth results which you can use to
        calculate your recall to select an appropriate value for nprobes.
        """
        self._inner.bypass_vector_index()
        return self