Files
lancedb/python/python/lancedb/query.py
marca116 3a200d77ef fix: pre-filtering on hybrid search (#3096)
When using hybrid search with a where filter, the prefilter argument is
silently inverted. Passing prefilter=True actually performs
post-filtering, and prefilter=False actually performs pre-filtering.
2026-03-16 21:48:42 -07:00

3510 lines
125 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from __future__ import annotations
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from datetime import timedelta
from typing import (
TYPE_CHECKING,
Dict,
List,
Literal,
Optional,
Tuple,
Type,
TypeVar,
Union,
Any,
)
import asyncio
import deprecation
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.fs as pa_fs
import pydantic
from lancedb.pydantic import PYDANTIC_VERSION
from lancedb.background_loop import LOOP
from . import __version__
from .arrow import AsyncRecordBatchReader
from .dependencies import pandas as pd
from .rerankers.base import Reranker
from .rerankers.rrf import RRFReranker
from .rerankers.util import check_reranker_result
from .util import flatten_columns
from lancedb._lancedb import fts_query_to_json
from typing_extensions import Annotated
if TYPE_CHECKING:
import sys
import PIL
import polars as pl
from ._lancedb import Query as LanceQuery
from ._lancedb import FTSQuery as LanceFTSQuery
from ._lancedb import HybridQuery as LanceHybridQuery
from ._lancedb import VectorQuery as LanceVectorQuery
from ._lancedb import TakeQuery as LanceTakeQuery
from ._lancedb import PyQueryRequest
from .common import VEC
from .pydantic import LanceModel
from .table import Table
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
T = TypeVar("T", bound="LanceModel")
# Pydantic validation function for vector queries
def ensure_vector_query(
val: Any,
) -> Union[List[float], List[List[float]], pa.Array, List[pa.Array]]:
if isinstance(val, list):
if len(val) == 0:
return ValueError("Vector query must be a non-empty list")
sample = val[0]
else:
if isinstance(val, float):
raise ValueError(
"Vector query must be a list of floats or a list of lists of floats"
)
sample = val
if isinstance(sample, pa.Array):
# val is array or list of array
return val
if isinstance(sample, list):
if len(sample) == 0:
return ValueError("Vector query must be a non-empty list")
if isinstance(sample[0], float):
# val is list of list of floats
return val
if isinstance(sample, float):
# val is a list of floats
return val
class FullTextQueryType(str, Enum):
MATCH = "match"
MATCH_PHRASE = "match_phrase"
BOOST = "boost"
MULTI_MATCH = "multi_match"
BOOLEAN = "boolean"
class FullTextOperator(str, Enum):
AND = "AND"
OR = "OR"
class Occur(str, Enum):
SHOULD = "SHOULD"
MUST = "MUST"
MUST_NOT = "MUST_NOT"
@pydantic.dataclasses.dataclass
class FullTextQuery(ABC):
@abstractmethod
def query_type(self) -> FullTextQueryType:
"""
Get the query type of the query.
Returns
-------
str
The type of the query.
"""
pass
def to_json(self) -> str:
"""
Convert the query to a JSON string.
Returns
-------
str
A JSON string representation of the query.
Examples
--------
>>> from lancedb.query import MatchQuery
>>> query = MatchQuery("puppy", "text", fuzziness=2)
>>> query.to_json()
'{"match":{"column":"text","terms":"puppy","boost":1.0,"fuzziness":2,"max_expansions":50,"operator":"Or","prefix_length":0}}'
"""
return fts_query_to_json(self)
def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
"""
Combine two queries with a logical AND operation.
Parameters
----------
other : FullTextQuery
The other query to combine with.
Returns
-------
FullTextQuery
A new query that combines both queries with AND.
"""
return BooleanQuery([(Occur.MUST, self), (Occur.MUST, other)])
def __or__(self, other: "FullTextQuery") -> "FullTextQuery":
"""
Combine two queries with a logical OR operation.
Parameters
----------
other : FullTextQuery
The other query to combine with.
Returns
-------
FullTextQuery
A new query that combines both queries with OR.
"""
return BooleanQuery([(Occur.SHOULD, self), (Occur.SHOULD, other)])
@pydantic.dataclasses.dataclass
class MatchQuery(FullTextQuery):
"""
Match query for full-text search.
Parameters
----------
query : str
The query string to match against.
column : str
The name of the column to match against.
boost : float, default 1.0
The boost factor for the query.
The score of each matching document is multiplied by this value.
fuzziness : int, optional
The maximum edit distance for each term in the match query.
Defaults to 0 (exact match).
If None, fuzziness is applied automatically by the rules:
- 0 for terms with length <= 2
- 1 for terms with length <= 5
- 2 for terms with length > 5
max_expansions : int, optional
The maximum number of terms to consider for fuzzy matching.
Defaults to 50.
operator : FullTextOperator, default OR
The operator to use for combining the query results.
Can be either `AND` or `OR`.
If `AND`, all terms in the query must match.
If `OR`, at least one term in the query must match.
prefix_length : int, optional
The number of beginning characters being unchanged for fuzzy matching.
This is useful to achieve prefix matching.
"""
query: str
column: str
boost: float = pydantic.Field(1.0, kw_only=True)
fuzziness: int = pydantic.Field(0, kw_only=True)
max_expansions: int = pydantic.Field(50, kw_only=True)
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
prefix_length: int = pydantic.Field(0, kw_only=True)
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.MATCH
@pydantic.dataclasses.dataclass
class PhraseQuery(FullTextQuery):
"""
Phrase query for full-text search.
Parameters
----------
query : str
The query string to match against.
column : str
The name of the column to match against.
"""
query: str
column: str
slop: int = pydantic.Field(0, kw_only=True)
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.MATCH_PHRASE
@pydantic.dataclasses.dataclass
class BoostQuery(FullTextQuery):
"""
Boost query for full-text search.
Parameters
----------
positive : dict
The positive query object.
negative : dict
The negative query object.
negative_boost : float, default 0.5
The boost factor for the negative query.
"""
positive: FullTextQuery
negative: FullTextQuery
negative_boost: float = pydantic.Field(0.5, kw_only=True)
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.BOOST
@pydantic.dataclasses.dataclass
class MultiMatchQuery(FullTextQuery):
"""
Multi-match query for full-text search.
Parameters
----------
query : str | list[Query]
If a string, the query string to match against.
columns : list[str]
The list of columns to match against.
boosts : list[float], optional
The list of boost factors for each column. If not provided,
all columns will have the same boost factor.
operator : FullTextOperator, default OR
The operator to use for combining the query results.
Can be either `AND` or `OR`.
It would be applied to all columns individually.
For example, if the operator is `AND`,
then the query "hello world" is equal to
`match("hello AND world", column1) OR match("hello AND world", column2)`.
"""
query: str
columns: list[str]
boosts: Optional[list[float]] = pydantic.Field(None, kw_only=True)
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.MULTI_MATCH
@pydantic.dataclasses.dataclass
class BooleanQuery(FullTextQuery):
"""
Boolean query for full-text search.
Parameters
----------
queries : list[tuple(Occur, FullTextQuery)]
The list of queries with their occurrence requirements.
Each tuple contains an Occur value (MUST, SHOULD, or MUST_NOT)
and a FullTextQuery to apply.
"""
queries: list[tuple[Occur, FullTextQuery]]
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.BOOLEAN
class FullTextSearchQuery(pydantic.BaseModel):
"""A LanceDB Full Text Search Query
Attributes
----------
columns: List[str]
The columns to search
If None, then the table should select the column automatically.
query: str | FullTextQuery
If a string, it is treated as a MatchQuery.
If a FullTextQuery object, it is used directly.
"""
columns: Optional[List[str]] = None
query: Union[str, FullTextQuery]
class Query(pydantic.BaseModel):
"""A LanceDB Query
Queries are constructed by the `Table.search` method. This class is a
python representation of the query. Normally you will not need to interact
with this class directly. You can build up a query and execute it using
collection methods such as `to_batches()`, `to_arrow()`, `to_pandas()`,
etc.
However, you can use the `to_query()` method to get the underlying query object.
This can be useful for serializing a query or using it in a different context.
Attributes
----------
filter : Optional[str]
sql filter to refine the query with
limit : Optional[int]
The limit on the number of results to return. If this is a vector or FTS query,
then this is required. If this is a plain SQL query, then this is optional.
offset: Optional[int]
The offset to start fetching results from
This is ignored for vector / FTS search (will be None).
columns : Optional[Union[List[str], Dict[str, str]]]
which columns to return in the results
This can be a list of column names or a dictionary. If it is a dictionary,
then the keys are the column names and the values are sql expressions to
use to calculate the result.
If this is None then all columns are returned. This can be expensive.
with_row_id : Optional[bool]
if True then include the row id in the results
vector : Optional[Union[List[float], List[List[float]], pa.Array, List[pa.Array]]]
the vector to search for, if this a vector search or hybrid search. It will
be None for full text search and plain SQL filtering.
vector_column : Optional[str]
the name of the vector column to use for vector search
If this is None then a default vector column will be used.
distance_type : Optional[str]
the distance type to use for vector search
This can be l2 (default), cosine and dot. See [metric definitions][search] for
more details.
If this is not a vector search this will be None.
postfilter : bool
if True then apply the filter after vector / FTS search. This is ignored for
plain SQL filtering.
nprobes : Optional[int]
The number of IVF partitions to search. If this is None then a default
number of partitions will be used.
- A higher number makes search more accurate but also slower.
- See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.
Will be None if this is not a vector search.
refine_factor : Optional[int]
Refine the results by reading extra elements and re-ranking them in memory.
- A higher number makes search more accurate but also slower.
- See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.
Will be None if this is not a vector search.
lower_bound : Optional[float]
The lower bound for distance search
Only results with a distance greater than or equal to this value
will be returned.
This will only be set on vector search.
upper_bound : Optional[float]
The upper bound for distance search
Only results with a distance less than or equal to this value
will be returned.
This will only be set on vector search.
ef : Optional[int]
The size of the nearest neighbor list maintained during HNSW search
This will only be set on vector search.
full_text_query : Optional[Union[str, dict]]
The full text search query
This can be a string or a dictionary. A dictionary will be used to search
multiple columns. The keys are the column names and the values are the
search queries.
This will only be set on FTS or hybrid queries.
fast_search: Optional[bool]
Skip a flat search of unindexed data. This will improve
search performance but search results will not include unindexed data.
The default is False
"""
# The name of the vector column to use for vector search.
vector_column: Optional[str] = None
# vector to search for
#
# Note: today this will be floats on the sync path and pa.Array on the async
# path though in the future we should unify this to pa.Array everywhere
vector: Annotated[
Optional[Union[List[float], List[List[float]], pa.Array, List[pa.Array]]],
ensure_vector_query,
] = None
# sql filter to refine the query with
filter: Optional[str] = None
# if True then apply the filter after vector search
postfilter: Optional[bool] = None
# full text search query
full_text_query: Optional[FullTextSearchQuery] = None
# top k results to return
limit: Optional[int] = None
# distance type to use for vector search
distance_type: Optional[str] = None
# which columns to return in the results
columns: Optional[Union[List[str], Dict[str, str]]] = None
# minimum number of IVF partitions to search
#
# If None then a default value (20) will be used.
minimum_nprobes: Optional[int] = None
# maximum number of IVF partitions to search
#
# If None then a default value (20) will be used.
#
# If 0 then no limit will be applied and all partitions could be searched
# if needed to satisfy the limit.
maximum_nprobes: Optional[int] = None
# lower bound for distance search
lower_bound: Optional[float] = None
# upper bound for distance search
upper_bound: Optional[float] = None
# multiplier for the number of results to inspect for reranking
refine_factor: Optional[int] = None
# if true, include the row id in the results
with_row_id: Optional[bool] = None
# offset to start fetching results from
offset: Optional[int] = None
# if true, will only search the indexed data
fast_search: Optional[bool] = None
# size of the nearest neighbor list maintained during HNSW search
ef: Optional[int] = None
# Bypass the vector index and use a brute force search
bypass_vector_index: Optional[bool] = None
@classmethod
def from_inner(cls, req: PyQueryRequest) -> Self:
query = cls()
query.limit = req.limit
query.offset = req.offset
query.filter = req.filter
query.full_text_query = req.full_text_search
query.columns = req.select
query.with_row_id = req.with_row_id
query.vector_column = req.column
query.vector = req.query_vector
query.distance_type = req.distance_type
query.minimum_nprobes = req.minimum_nprobes
query.maximum_nprobes = req.maximum_nprobes
query.lower_bound = req.lower_bound
query.upper_bound = req.upper_bound
query.ef = req.ef
query.refine_factor = req.refine_factor
query.bypass_vector_index = req.bypass_vector_index
query.postfilter = req.postfilter
if req.full_text_search is not None:
query.full_text_query = FullTextSearchQuery(
columns=None,
query=req.full_text_search,
)
return query
# This tells pydantic to allow custom types (needed for the `vector` query since
# pa.Array wouln't be allowed otherwise)
if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat
class Config:
arbitrary_types_allowed = True
else:
model_config = {"arbitrary_types_allowed": True}
class LanceQueryBuilder(ABC):
"""An abstract query builder. Subclasses are defined for vector search,
full text search, hybrid, and plain SQL filtering.
"""
@classmethod
def create(
cls,
table: "Table",
query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
query_type: str,
vector_column_name: str,
ordering_field_name: Optional[str] = None,
fts_columns: Optional[Union[str, List[str]]] = None,
fast_search: bool = None,
) -> Self:
"""
Create a query builder based on the given query and query type.
Parameters
----------
table: Table
The table to query.
query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]]
The query to use. If None, an empty query builder is returned
which performs simple SQL filtering.
query_type: str
The type of query to perform. One of "vector", "fts", "hybrid", or "auto".
If "auto", the query type is inferred based on the query.
vector_column_name: str
The name of the vector column to use for vector search.
fast_search: bool
Skip flat search of unindexed data.
"""
# Check hybrid search first as it supports empty query pattern
if query_type == "hybrid":
# hybrid fts and vector query
return LanceHybridQueryBuilder(
table, query, vector_column_name, fts_columns=fts_columns
)
if query is None:
return LanceEmptyQueryBuilder(table)
# remember the string query for reranking purpose
str_query = query if isinstance(query, str) else None
# convert "auto" query_type to "vector", "fts"
# or "hybrid" and convert the query to vector if needed
query, query_type = cls._resolve_query(
table, query, query_type, vector_column_name
)
if query_type == "hybrid":
return LanceHybridQueryBuilder(
table, query, vector_column_name, fts_columns=fts_columns
)
if isinstance(query, (str, FullTextQuery)):
# fts
return LanceFtsQueryBuilder(
table,
query,
ordering_field_name=ordering_field_name,
fts_columns=fts_columns,
fast_search=fast_search,
)
if isinstance(query, list):
query = np.array(query, dtype=np.float32)
elif isinstance(query, np.ndarray):
query = query.astype(np.float32)
else:
raise TypeError(f"Unsupported query type: {type(query)}")
return LanceVectorQueryBuilder(
table, query, vector_column_name, str_query, fast_search
)
@classmethod
def _resolve_query(cls, table, query, query_type, vector_column_name):
# If query_type is fts, then query must be a string.
# otherwise raise TypeError
if query_type == "fts":
if not isinstance(query, (str, FullTextQuery)):
raise TypeError(
f"'fts' query must be a string or FullTextQuery: {type(query)}"
)
return query, query_type
elif query_type == "vector":
query = cls._query_to_vector(table, query, vector_column_name)
return query, query_type
elif query_type == "auto":
if isinstance(query, (list, np.ndarray)):
return query, "vector"
else:
conf = table.embedding_functions.get(vector_column_name)
if conf is not None:
query = conf.function.compute_query_embeddings_with_retry(query)[0]
return query, "vector"
else:
return query, "fts"
else:
raise ValueError(
f"Invalid query_type, must be 'vector', 'fts', or 'auto': {query_type}"
)
@classmethod
def _query_to_vector(cls, table, query, vector_column_name):
if isinstance(query, (list, np.ndarray)):
return query
conf = table.embedding_functions.get(vector_column_name)
if conf is not None:
return conf.function.compute_query_embeddings_with_retry(query)[0]
else:
msg = f"No embedding function for {vector_column_name}"
raise ValueError(msg)
def __init__(self, table: "Table"):
self._table = table
self._limit = None
self._offset = None
self._columns = None
self._where = None
self._postfilter = None
self._with_row_id = None
self._vector = None
self._text = None
self._ef = None
self._bypass_vector_index = None
@deprecation.deprecated(
deprecated_in="0.3.1",
removed_in="0.4.0",
current_version=__version__,
details="Use to_pandas() instead",
)
def to_df(self) -> "pd.DataFrame":
"""
*Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.*
Execute the query and return the results as a pandas DataFrame.
In addition to the selected columns, LanceDB also returns a vector
and also the "_distance" column which is the distance between the query
vector and the returned vector.
"""
return self.to_pandas()
def to_pandas(
self,
flatten: Optional[Union[int, bool]] = None,
*,
timeout: Optional[timedelta] = None,
) -> "pd.DataFrame":
"""
Execute the query and return the results as a pandas DataFrame.
In addition to the selected columns, LanceDB also returns a vector
and also the "_distance" column which is the distance between the query
vector and the returned vector.
Parameters
----------
flatten: Optional[Union[int, bool]]
If flatten is True, flatten all nested columns.
If flatten is an integer, flatten the nested columns up to the
specified depth.
If unspecified, do not flatten the nested columns.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If None, wait indefinitely.
"""
tbl = flatten_columns(self.to_arrow(timeout=timeout), flatten)
return tbl.to_pandas()
@abstractmethod
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
"""
Execute the query and return the results as an
[Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).
In addition to the selected columns, LanceDB also returns a vector
and also the "_distance" column which is the distance between the query
vector and the returned vectors.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If None, wait indefinitely.
"""
raise NotImplementedError
@abstractmethod
def to_batches(
self,
/,
batch_size: Optional[int] = None,
*,
timeout: Optional[timedelta] = None,
) -> pa.RecordBatchReader:
"""
Execute the query and return the results as a pyarrow
[RecordBatchReader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html)
Parameters
----------
batch_size: int
The maximum number of selected records in a RecordBatch object.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If None, wait indefinitely.
"""
raise NotImplementedError
def to_list(self, *, timeout: Optional[timedelta] = None) -> List[dict]:
"""
Execute the query and return the results as a list of dictionaries.
Each list entry is a dictionary with the selected column names as keys,
or all table columns if `select` is not called. The vector and the "_distance"
fields are returned whether or not they're explicitly selected.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If None, wait indefinitely.
"""
return self.to_arrow(timeout=timeout).to_pylist()
def to_pydantic(
self, model: type[T], *, timeout: Optional[timedelta] = None
) -> list[T]:
"""Return the table as a list of pydantic models.
Parameters
----------
model: Type[LanceModel]
The pydantic model to use.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If None, wait indefinitely.
Returns
-------
List[LanceModel]
"""
return [model(**row) for row in self.to_arrow(timeout=timeout).to_pylist()]
def to_polars(self, *, timeout: Optional[timedelta] = None) -> "pl.DataFrame":
"""
Execute the query and return the results as a Polars DataFrame.
In addition to the selected columns, LanceDB also returns a vector
and also the "_distance" column which is the distance between the query
vector and the returned vector.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If None, wait indefinitely.
"""
import polars as pl
return pl.from_arrow(self.to_arrow(timeout=timeout))
def limit(self, limit: Union[int, None]) -> Self:
"""Set the maximum number of results to return.
Parameters
----------
limit: int
The maximum number of results to return.
The default query limit is 10 results.
For ANN/KNN queries, you must specify a limit.
For plain searches, all records are returned if limit not set.
*WARNING* if you have a large dataset, setting
the limit to a large number, e.g. the table size,
can potentially result in reading a
large amount of data into memory and cause
out of memory issues.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
if limit is None or limit <= 0:
if isinstance(self, LanceVectorQueryBuilder):
raise ValueError("Limit is required for ANN/KNN queries")
else:
self._limit = None
else:
self._limit = limit
return self
def offset(self, offset: int) -> Self:
"""Set the offset for the results.
Parameters
----------
offset: int
The offset to start fetching results from.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
if offset is None or offset <= 0:
self._offset = 0
else:
self._offset = offset
return self
def select(self, columns: Union[list[str], dict[str, str]]) -> Self:
"""Set the columns to return.
Parameters
----------
columns: list of str, or dict of str to str default None
List of column names to be fetched.
Or a dictionary of column names to SQL expressions.
All columns are fetched if None or unspecified.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
if isinstance(columns, list) or isinstance(columns, dict):
self._columns = columns
else:
raise ValueError("columns must be a list or a dictionary")
return self
def where(self, where: str, prefilter: bool = True) -> Self:
"""Set the where clause.
Parameters
----------
where: str
The where clause which is a valid SQL where clause. See
`Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
for valid SQL expressions.
prefilter: bool, default True
If True, apply the filter before vector search, otherwise the
filter is applied on the result of vector search.
This feature is **EXPERIMENTAL** and may be removed and modified
without warning in the future.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
self._where = where
self._postfilter = not prefilter
return self
def with_row_id(self, with_row_id: bool) -> Self:
"""Set whether to return row ids.
Parameters
----------
with_row_id: bool
If True, return _rowid column in the results.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
self._with_row_id = with_row_id
return self
def explain_plan(self, verbose: Optional[bool] = False) -> str:
"""Return the execution plan for this query.
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", [{"vector": [99.0, 99]}])
>>> query = [100, 100]
>>> plan = table.search(query).explain_plan(True)
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...
Parameters
----------
verbose : bool, default False
Use a verbose output format.
Returns
-------
plan : str
""" # noqa: E501
return self._table._explain_plan(self.to_query_object(), verbose=verbose)
def analyze_plan(self) -> str:
"""
Run the query and return its execution plan with runtime metrics.
This returns detailed metrics for each step, such as elapsed time,
rows processed, bytes read, and I/O stats. It is useful for debugging
and performance tuning.
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", [{"vector": [99.0, 99]}])
>>> query = [100, 100]
>>> plan = table.search(query).analyze_plan()
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
AnalyzeExec verbose=true, elapsed=..., metrics=...
TracedExec, elapsed=..., metrics=...
ProjectionExec: elapsed=..., expr=[...],
metrics=[output_rows=..., elapsed_compute=..., output_bytes=...]
GlobalLimitExec: elapsed=..., skip=0, fetch=10,
metrics=[output_rows=..., elapsed_compute=..., output_bytes=...]
FilterExec: elapsed=..., _distance@2 IS NOT NULL, metrics=[...]
SortExec: elapsed=..., TopK(fetch=10), expr=[...],
preserve_partitioning=[...],
metrics=[output_rows=..., elapsed_compute=...,
output_bytes=..., row_replacements=...]
KNNVectorDistance: elapsed=..., metric=l2,
metrics=[output_rows=..., elapsed_compute=...,
output_bytes=..., output_batches=...]
LanceRead: elapsed=..., uri=..., projection=[vector],
num_fragments=..., range_before=None, range_after=None,
row_id=true, row_addr=false,
full_filter=--, refine_filter=--,
metrics=[output_rows=..., elapsed_compute=..., output_bytes=...,
fragments_scanned=..., ranges_scanned=1, rows_scanned=1,
bytes_read=..., iops=..., requests=..., task_wait_time=...]
Returns
-------
plan : str
The physical query execution plan with runtime metrics.
"""
return self._table._analyze_plan(self.to_query_object())
def vector(self, vector: Union[np.ndarray, list]) -> Self:
"""Set the vector to search for.
Parameters
----------
vector: np.ndarray or list
The vector to search for.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError
def text(self, text: str | FullTextQuery) -> Self:
"""Set the text to search for.
Parameters
----------
text: str | FullTextQuery
If a string, it is treated as a MatchQuery.
If a FullTextQuery object, it is used directly.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError
@abstractmethod
def rerank(self, reranker: Reranker) -> Self:
"""Rerank the results using the specified reranker.
Parameters
----------
reranker: Reranker
The reranker to use.
Returns
-------
The LanceQueryBuilder object.
"""
raise NotImplementedError
@abstractmethod
def to_query_object(self) -> Query:
"""Return a serializable representation of the query
Returns
-------
Query
The serializable representation of the query
"""
raise NotImplementedError
class LanceVectorQueryBuilder(LanceQueryBuilder):
"""
Examples
--------
>>> import lancedb
>>> data = [{"vector": [1.1, 1.2], "b": 2},
... {"vector": [0.5, 1.3], "b": 4},
... {"vector": [0.4, 0.4], "b": 6},
... {"vector": [0.4, 0.4], "b": 10}]
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", data=data)
>>> (table.search([0.4, 0.4])
... .distance_type("cosine")
... .where("b < 10")
... .select(["b", "vector"])
... .limit(2)
... .to_pandas())
b vector _distance
0 6 [0.4, 0.4] 0.000000
1 2 [1.1, 1.2] 0.000944
"""
def __init__(
self,
table: "Table",
query: Union[np.ndarray, list, "PIL.Image.Image"],
vector_column: str,
str_query: Optional[str] = None,
fast_search: bool = None,
):
super().__init__(table)
self._query = query
self._distance_type = None
self._minimum_nprobes = None
self._maximum_nprobes = None
self._lower_bound = None
self._upper_bound = None
self._refine_factor = None
self._vector_column = vector_column
self._postfilter = None
self._reranker = None
self._str_query = str_query
self._fast_search = fast_search
def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
"""Set the distance metric to use.
This is an alias for distance_type() and may be deprecated in the future.
Please use distance_type() instead.
Parameters
----------
metric: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
return self.distance_type(metric)
def distance_type(
self, distance_type: Literal["l2", "cosine", "dot"]
) -> "LanceVectorQueryBuilder":
"""Set the distance metric to use.
When performing a vector search we try and find the "nearest" vectors according
to some kind of distance metric. This parameter controls which distance metric
to use.
Note: if there is a vector index then the distance type used MUST match the
distance type used to train the vector index. If this is not done then the
results will be invalid.
Parameters
----------
distance_type: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._distance_type = distance_type.lower()
return self
def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
"""Set the number of probes to use.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.
This method sets both the minimum and maximum number of probes to the same
value. See `minimum_nprobes` and `maximum_nprobes` for more fine-grained
control.
Parameters
----------
nprobes: int
The number of probes to use.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._minimum_nprobes = nprobes
self._maximum_nprobes = nprobes
return self
def minimum_nprobes(self, minimum_nprobes: int) -> LanceVectorQueryBuilder:
"""Set the minimum number of probes to use.
See `nprobes` for more details.
These partitions will be searched on every vector query and will increase recall
at the expense of latency.
"""
self._minimum_nprobes = minimum_nprobes
return self
def maximum_nprobes(self, maximum_nprobes: int) -> LanceVectorQueryBuilder:
"""Set the maximum number of probes to use.
See `nprobes` for more details.
If this value is greater than `minimum_nprobes` then the excess partitions
will be searched only if we have not found enough results.
This can be useful when there is a narrow filter to allow these queries to
spend more time searching and avoid potential false negatives.
If this value is 0 then no limit will be applied and all partitions could be
searched if needed to satisfy the limit.
"""
self._maximum_nprobes = maximum_nprobes
return self
def distance_range(
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
) -> LanceVectorQueryBuilder:
"""Set the distance range to use.
Only rows with distances within range [lower_bound, upper_bound)
will be returned.
Parameters
----------
lower_bound: Optional[float]
The lower bound of the distance range.
upper_bound: Optional[float]
The upper bound of the distance range.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._lower_bound = lower_bound
self._upper_bound = upper_bound
return self
def ef(self, ef: int) -> LanceVectorQueryBuilder:
"""Set the number of candidates to consider during search.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
This only applies to the HNSW-related index.
The default value is 1.5 * limit.
Parameters
----------
ef: int
The number of candidates to consider during search.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._ef = ef
return self
def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
"""Set the refine factor to use, increasing the number of vectors sampled.
As an example, a refine factor of 2 will sample 2x as many vectors as
requested, re-ranks them, and returns the top half most relevant results.
See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.
Parameters
----------
refine_factor: int
The refine factor to use.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._refine_factor = refine_factor
return self
def output_schema(self) -> pa.Schema:
"""
Return the output schema for the query
This does not execute the query.
"""
return self._table._output_schema(self.to_query_object())
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
"""
Execute the query and return the results as an
[Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).
In addition to the selected columns, LanceDB also returns a vector
and also the "_distance" column which is the distance between the query
vector and the returned vectors.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If None, wait indefinitely.
"""
return self.to_batches(timeout=timeout).read_all()
def to_query_object(self) -> Query:
"""
Build a Query object
This can be used to serialize a query
"""
vector = self._query if isinstance(self._query, list) else self._query.tolist()
if isinstance(vector[0], np.ndarray):
vector = [v.tolist() for v in vector]
return Query(
vector=vector,
filter=self._where,
postfilter=self._postfilter,
limit=self._limit,
distance_type=self._distance_type,
columns=self._columns,
minimum_nprobes=self._minimum_nprobes,
maximum_nprobes=self._maximum_nprobes,
lower_bound=self._lower_bound,
upper_bound=self._upper_bound,
refine_factor=self._refine_factor,
vector_column=self._vector_column,
with_row_id=self._with_row_id,
offset=self._offset,
fast_search=self._fast_search,
ef=self._ef,
bypass_vector_index=self._bypass_vector_index,
)
def to_batches(
self,
/,
batch_size: Optional[int] = None,
*,
timeout: Optional[timedelta] = None,
) -> pa.RecordBatchReader:
"""
Execute the query and return the result as a RecordBatchReader object.
Parameters
----------
batch_size: int
The maximum number of selected records in a RecordBatch object.
timeout: timedelta, default None
The maximum time to wait for the query to complete.
If None, wait indefinitely.
Returns
-------
pa.RecordBatchReader
"""
vector = self._query if isinstance(self._query, list) else self._query.tolist()
if isinstance(vector[0], np.ndarray):
vector = [v.tolist() for v in vector]
query = self.to_query_object()
result_set = self._table._execute_query(
query, batch_size=batch_size, timeout=timeout
)
if self._reranker is not None:
rs_table = result_set.read_all()
result_set = self._reranker.rerank_vector(self._str_query, rs_table)
check_reranker_result(result_set)
# convert result_set back to RecordBatchReader
result_set = pa.RecordBatchReader.from_batches(
result_set.schema, result_set.to_batches()
)
return result_set
def where(self, where: str, prefilter: bool = None) -> LanceVectorQueryBuilder:
"""Set the where clause.
Parameters
----------
where: str
The where clause which is a valid SQL where clause. See
`Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
for valid SQL expressions.
prefilter: bool, default True
If True, apply the filter before vector search, otherwise the
filter is applied on the result of vector search.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
self._where = where
if prefilter is not None:
self._postfilter = not prefilter
return self
def rerank(
self, reranker: Reranker, query_string: Optional[str] = None
) -> LanceVectorQueryBuilder:
"""Rerank the results using the specified reranker.
Parameters
----------
reranker: Reranker
The reranker to use.
query_string: Optional[str]
The query to use for reranking. This needs to be specified explicitly here
as the query used for vector search may already be vectorized and the
reranker requires a string query.
This is only required if the query used for vector search is not a string.
Note: This doesn't yet support the case where the query is multimodal or a
list of vectors.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._reranker = reranker
if self._str_query is None and query_string is None:
raise ValueError(
"""
The query used for vector search is not a string.
In this case, the reranker query needs to be specified explicitly.
"""
)
if query_string is not None and not isinstance(query_string, str):
raise ValueError("Reranking currently only supports string queries")
self._str_query = query_string if query_string is not None else self._str_query
if reranker.score == "all":
self.with_row_id(True)
return self
def bypass_vector_index(self) -> LanceVectorQueryBuilder:
"""
If this is called then any vector index is skipped
An exhaustive (flat) search will be performed. The query vector will
be compared to every vector in the table. At high scales this can be
expensive. However, this is often still useful. For example, skipping
the vector index can give you ground truth results which you can use to
calculate your recall to select an appropriate value for nprobes.
Returns
-------
LanceVectorQueryBuilder
The LanceVectorQueryBuilder object.
"""
self._bypass_vector_index = True
return self
def fast_search(self) -> LanceVectorQueryBuilder:
"""
Skip a flat search of unindexed data. This will improve
search performance but search results will not include unindexed data.
Returns
-------
LanceVectorQueryBuilder
The LanceVectorQueryBuilder object.
"""
self._fast_search = True
return self
class LanceFtsQueryBuilder(LanceQueryBuilder):
"""A builder for full text search for LanceDB."""
def __init__(
self,
table: "Table",
query: str | FullTextQuery,
ordering_field_name: Optional[str] = None,
fts_columns: Optional[Union[str, List[str]]] = None,
fast_search: bool = None,
):
super().__init__(table)
self._query = query
self._phrase_query = False
self.ordering_field_name = ordering_field_name
self._reranker = None
self._fast_search = fast_search
if isinstance(fts_columns, str):
fts_columns = [fts_columns]
self._fts_columns = fts_columns
def phrase_query(self, phrase_query: bool = True) -> LanceFtsQueryBuilder:
"""Set whether to use phrase query.
Parameters
----------
phrase_query: bool, default True
If True, then the query will be wrapped in quotes and
double quotes replaced by single quotes.
Returns
-------
LanceFtsQueryBuilder
The LanceFtsQueryBuilder object.
"""
self._phrase_query = phrase_query
return self
def fast_search(self) -> LanceFtsQueryBuilder:
"""
Skip a flat search of unindexed data. This will improve
search performance but search results will not include unindexed data.
Returns
-------
LanceFtsQueryBuilder
The LanceFtsQueryBuilder object.
"""
self._fast_search = True
return self
def to_query_object(self) -> Query:
return Query(
columns=self._columns,
filter=self._where,
limit=self._limit,
postfilter=self._postfilter,
with_row_id=self._with_row_id,
full_text_query=FullTextSearchQuery(
query=self._query, columns=self._fts_columns
),
offset=self._offset,
fast_search=self._fast_search,
)
def output_schema(self) -> pa.Schema:
"""
Return the output schema for the query
This does not execute the query.
"""
return self._table._output_schema(self.to_query_object())
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
path, fs, exist = self._table._get_fts_index_path()
if exist:
return self.tantivy_to_arrow()
query = self._query
if self._phrase_query:
if isinstance(query, str):
if not query.startswith('"') or not query.endswith('"'):
self._query = f'"{query}"'
elif isinstance(query, FullTextQuery) and not isinstance(
query, PhraseQuery
):
raise TypeError("Please use PhraseQuery for phrase queries.")
query = self.to_query_object()
results = self._table._execute_query(query, timeout=timeout)
results = results.read_all()
if self._reranker is not None:
results = self._reranker.rerank_fts(self._query, results)
check_reranker_result(results)
return results
def to_batches(
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
):
raise NotImplementedError("to_batches on an FTS query")
def tantivy_to_arrow(self) -> pa.Table:
try:
import tantivy
except ImportError:
raise ImportError(
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
)
from .fts import search_index
# get the index path
path, fs, exist = self._table._get_fts_index_path()
# check if the index exist
if not exist:
raise FileNotFoundError(
"Fts index does not exist. "
"Please first call table.create_fts_index(['<field_names>']) to "
"create the fts index."
)
# Check that we are on local filesystem
if not isinstance(fs, pa_fs.LocalFileSystem):
raise NotImplementedError(
"Tantivy-based full text search "
"is only supported on the local filesystem"
)
# open the index
index = tantivy.Index.open(path)
# get the scores and doc ids
query = self._query
if self._phrase_query:
query = query.replace('"', "'")
query = f'"{query}"'
limit = self._limit if self._limit is not None else 10
row_ids, scores = search_index(
index, query, limit, ordering_field=self.ordering_field_name
)
if len(row_ids) == 0:
empty_schema = pa.schema([pa.field("_score", pa.float32())])
return pa.Table.from_batches([], schema=empty_schema)
scores = pa.array(scores)
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
output_tbl = output_tbl.append_column("_score", scores)
# this needs to match vector search results which are uint64
row_ids = pa.array(row_ids, type=pa.uint64())
if self._where is not None:
tmp_name = "__lancedb__duckdb__indexer__"
output_tbl = output_tbl.append_column(
tmp_name, pa.array(range(len(output_tbl)))
)
try:
# TODO would be great to have Substrait generate pyarrow compute
# expressions or conversely have pyarrow support SQL expressions
# using Substrait
import duckdb
indexer = duckdb.sql(
f"SELECT {tmp_name} FROM output_tbl WHERE {self._where}"
).to_arrow_table()[tmp_name]
output_tbl = output_tbl.take(indexer).drop([tmp_name])
row_ids = row_ids.take(indexer)
except ImportError:
import tempfile
import lance
# TODO Use "memory://" instead once that's supported
with tempfile.TemporaryDirectory() as tmp:
ds = lance.write_dataset(output_tbl, tmp)
output_tbl = ds.to_table(filter=self._where)
indexer = output_tbl[tmp_name]
row_ids = row_ids.take(indexer)
output_tbl = output_tbl.drop([tmp_name])
if self._with_row_id:
output_tbl = output_tbl.append_column("_rowid", row_ids)
if self._reranker is not None:
output_tbl = self._reranker.rerank_fts(self._query, output_tbl)
return output_tbl
def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
"""Rerank the results using the specified reranker.
Parameters
----------
reranker: Reranker
The reranker to use.
Returns
-------
LanceFtsQueryBuilder
The LanceQueryBuilder object.
"""
self._reranker = reranker
if reranker.score == "all":
self.with_row_id(True)
return self
class LanceEmptyQueryBuilder(LanceQueryBuilder):
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
return self.to_batches(timeout=timeout).read_all()
def to_query_object(self) -> Query:
return Query(
columns=self._columns,
filter=self._where,
limit=self._limit,
with_row_id=self._with_row_id,
offset=self._offset,
)
def output_schema(self) -> pa.Schema:
query = self.to_query_object()
return self._table._output_schema(query)
def to_batches(
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
) -> pa.RecordBatchReader:
query = self.to_query_object()
return self._table._execute_query(query, batch_size=batch_size, timeout=timeout)
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
"""Rerank the results using the specified reranker.
Parameters
----------
reranker: Reranker
The reranker to use.
Returns
-------
LanceEmptyQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError("Reranking is not yet supported.")
class LanceHybridQueryBuilder(LanceQueryBuilder):
"""
A query builder that performs hybrid vector and full text search.
Results are combined and reranked based on the specified reranker.
By default, the results are reranked using the RRFReranker, which
uses reciprocal rank fusion score for reranking.
To make the vector and fts results comparable, the scores are normalized.
Instead of normalizing scores, the `normalize` parameter can be set to "rank"
in the `rerank` method to convert the scores to ranks and then normalize them.
"""
def __init__(
self,
table: "Table",
query: Optional[Union[str, FullTextQuery]] = None,
vector_column: Optional[str] = None,
fts_columns: Optional[Union[str, List[str]]] = None,
):
super().__init__(table)
self._query = query
self._vector_column = vector_column
self._fts_columns = fts_columns
self._norm = None
self._reranker = None
self._minimum_nprobes = None
self._maximum_nprobes = None
self._refine_factor = None
self._distance_type = None
self._phrase_query = None
self._lower_bound = None
self._upper_bound = None
def _validate_query(self, query, vector=None, text=None):
if query is not None and (vector is not None or text is not None):
raise ValueError(
"You can either provide a string query in search() method"
"or set `vector()` and `text()` explicitly for hybrid search."
"But not both."
)
vector_query = vector if vector is not None else query
if not isinstance(vector_query, (str, list, np.ndarray)):
raise ValueError("Vector query must be either a string or a vector")
text_query = text or query
if text_query is None:
raise ValueError("Text query must be provided for hybrid search.")
if not isinstance(text_query, (str, FullTextQuery)):
raise ValueError("Text query must be a string or FullTextQuery")
return vector_query, text_query
def phrase_query(self, phrase_query: bool = None) -> LanceHybridQueryBuilder:
"""Set whether to use phrase query.
Parameters
----------
phrase_query: bool, default True
If True, then the query will be wrapped in quotes and
double quotes replaced by single quotes.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._phrase_query = phrase_query
return self
def to_query_object(self) -> Query:
raise NotImplementedError("to_query_object not yet supported on a hybrid query")
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
self._create_query_builders()
with ThreadPoolExecutor() as executor:
fts_future = executor.submit(
self._fts_query.with_row_id(True).to_arrow, timeout=timeout
)
vector_future = executor.submit(
self._vector_query.with_row_id(True).to_arrow, timeout=timeout
)
fts_results = fts_future.result()
vector_results = vector_future.result()
return self._combine_hybrid_results(
fts_results=fts_results,
vector_results=vector_results,
norm=self._norm,
fts_query=self._fts_query._query,
reranker=self._reranker,
limit=self._limit,
with_row_ids=self._with_row_id,
)
@staticmethod
def _combine_hybrid_results(
fts_results: pa.Table,
vector_results: pa.Table,
norm: str,
fts_query: str,
reranker,
limit: int,
with_row_ids: bool,
) -> pa.Table:
if norm == "rank":
vector_results = LanceHybridQueryBuilder._rank(vector_results, "_distance")
fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")
# If both result sets are empty (e.g. after hard filtering),
# return early to avoid errors in reranking or score restoration.
if vector_results.num_rows == 0 and fts_results.num_rows == 0:
# Build a minimal empty table with the _relevance_score column
combined_schema = pa.unify_schemas(
[vector_results.schema, fts_results.schema],
)
empty = pa.table(
{
col: pa.array([], type=combined_schema.field(col).type)
for col in combined_schema.names
}
)
empty = empty.append_column(
"_relevance_score", pa.array([], type=pa.float32())
)
if not with_row_ids and "_rowid" in empty.column_names:
empty = empty.drop(["_rowid"])
return empty
original_distances = None
original_scores = None
original_distance_row_ids = None
original_score_row_ids = None
# normalize the scores to be between 0 and 1, 0 being most relevant
# We check whether the results (vector and FTS) are empty, because when
# they are, they often are missing the _rowid column, which causes an error
if vector_results.num_rows > 0:
distance_i = vector_results.column_names.index("_distance")
original_distances = vector_results.column(distance_i)
original_distance_row_ids = vector_results.column("_rowid")
vector_results = vector_results.set_column(
distance_i,
vector_results.field(distance_i),
LanceHybridQueryBuilder._normalize_scores(original_distances),
)
# In fts higher scores represent relevance. Not inverting them here as
# rerankers might need to preserve this score to support `return_score="all"`
if fts_results.num_rows > 0:
score_i = fts_results.column_names.index("_score")
original_scores = fts_results.column(score_i)
original_score_row_ids = fts_results.column("_rowid")
fts_results = fts_results.set_column(
score_i,
fts_results.field(score_i),
LanceHybridQueryBuilder._normalize_scores(original_scores),
)
results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)
check_reranker_result(results)
if "_distance" in results.column_names and original_distances is not None:
# restore the original distances
indices = pc.index_in(
results["_rowid"], original_distance_row_ids, skip_nulls=True
)
original_distances = pc.take(original_distances, indices)
distance_i = results.column_names.index("_distance")
results = results.set_column(distance_i, "_distance", original_distances)
if "_score" in results.column_names and original_scores is not None:
# restore the original scores
indices = pc.index_in(
results["_rowid"], original_score_row_ids, skip_nulls=True
)
original_scores = pc.take(original_scores, indices)
score_i = results.column_names.index("_score")
results = results.set_column(score_i, "_score", original_scores)
results = results.slice(length=limit)
if not with_row_ids:
results = results.drop(["_rowid"])
return results
def to_batches(
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
):
raise NotImplementedError("to_batches not yet supported on a hybrid query")
@staticmethod
def _rank(results: pa.Table, column: str, ascending: bool = True):
if len(results) == 0:
return results
# Get the _score column from results
scores = results.column(column).to_numpy()
sort_indices = np.argsort(scores)
if not ascending:
sort_indices = sort_indices[::-1]
ranks = np.empty_like(sort_indices)
ranks[sort_indices] = np.arange(len(scores)) + 1
# replace the _score column with the ranks
_score_idx = results.column_names.index(column)
results = results.set_column(
_score_idx, column, pa.array(ranks, type=pa.float32())
)
return results
@staticmethod
def _normalize_scores(scores: pa.Array, invert=False) -> pa.Array:
if len(scores) == 0:
return scores
# normalize the scores by subtracting the min and dividing by the max
min, max = pc.min_max(scores).values()
rng = pc.subtract(max, min)
if not pc.equal(rng, pa.scalar(0.0)).as_py():
scores = pc.divide(pc.subtract(scores, min), rng)
elif not pc.equal(max, pa.scalar(0.0)).as_py():
# If rng is 0, then we at least want the scores to be 0
scores = pc.subtract(scores, min)
if invert:
scores = pc.subtract(1, scores)
return scores
def rerank(
self,
reranker: Reranker = RRFReranker(),
normalize: str = "score",
) -> LanceHybridQueryBuilder:
"""
Rerank the hybrid search results using the specified reranker. The reranker
must be an instance of Reranker class.
Parameters
----------
reranker: Reranker, default RRFReranker()
The reranker to use. Must be an instance of Reranker class.
normalize: str, default "score"
The method to normalize the scores. Can be "rank" or "score". If "rank",
the scores are converted to ranks and then normalized. If "score", the
scores are normalized directly.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
if normalize not in ["rank", "score"]:
raise ValueError("normalize must be 'rank' or 'score'.")
if reranker and not isinstance(reranker, Reranker):
raise ValueError("reranker must be an instance of Reranker class.")
self._norm = normalize
self._reranker = reranker
if reranker.score == "all":
self.with_row_id(True)
return self
def nprobes(self, nprobes: int) -> LanceHybridQueryBuilder:
"""
Set the number of probes to use for vector search.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
Parameters
----------
nprobes: int
The number of probes to use.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._minimum_nprobes = nprobes
self._maximum_nprobes = nprobes
return self
def minimum_nprobes(self, minimum_nprobes: int) -> LanceHybridQueryBuilder:
"""Set the minimum number of probes to use.
See `nprobes` for more details.
"""
self._minimum_nprobes = minimum_nprobes
return self
def maximum_nprobes(self, maximum_nprobes: int) -> LanceHybridQueryBuilder:
"""Set the maximum number of probes to use.
See `nprobes` for more details.
"""
self._maximum_nprobes = maximum_nprobes
return self
def distance_range(
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
) -> LanceHybridQueryBuilder:
"""
Set the distance range to use.
Only rows with distances within range [lower_bound, upper_bound)
will be returned.
Parameters
----------
lower_bound: Optional[float]
The lower bound of the distance range.
upper_bound: Optional[float]
The upper bound of the distance range.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._lower_bound = lower_bound
self._upper_bound = upper_bound
return self
def ef(self, ef: int) -> LanceHybridQueryBuilder:
"""
Set the number of candidates to consider during search.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
This only applies to the HNSW-related index.
The default value is 1.5 * limit.
Parameters
----------
ef: int
The number of candidates to consider during search.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._ef = ef
return self
def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
"""Set the distance metric to use.
This is an alias for distance_type() and may be deprecated in the future.
Please use distance_type() instead.
Parameters
----------
metric: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
return self.distance_type(metric)
def distance_type(
self, distance_type: Literal["l2", "cosine", "dot"]
) -> "LanceHybridQueryBuilder":
"""Set the distance metric to use.
When performing a vector search we try and find the "nearest" vectors according
to some kind of distance metric. This parameter controls which distance metric
to use.
Note: if there is a vector index then the distance type used MUST match the
distance type used to train the vector index. If this is not done then the
results will be invalid.
Parameters
----------
distance_type: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._distance_type = distance_type.lower()
return self
def refine_factor(self, refine_factor: int) -> LanceHybridQueryBuilder:
"""
Refine the vector search results by reading extra elements and
re-ranking them in memory.
Parameters
----------
refine_factor: int
The refine factor to use.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._refine_factor = refine_factor
return self
def vector(self, vector: Union[np.ndarray, list]) -> LanceHybridQueryBuilder:
self._vector = vector
return self
def text(self, text: str | FullTextQuery) -> LanceHybridQueryBuilder:
self._text = text
return self
def bypass_vector_index(self) -> LanceHybridQueryBuilder:
"""
If this is called then any vector index is skipped
An exhaustive (flat) search will be performed. The query vector will
be compared to every vector in the table. At high scales this can be
expensive. However, this is often still useful. For example, skipping
the vector index can give you ground truth results which you can use to
calculate your recall to select an appropriate value for nprobes.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._bypass_vector_index = True
return self
def explain_plan(self, verbose: Optional[bool] = False) -> str:
"""Return the execution plan for this query.
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", [{"vector": [99.0, 99]}])
>>> query = [100, 100]
>>> plan = table.search(query).explain_plan(True)
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...
Parameters
----------
verbose : bool, default False
Use a verbose output format.
Returns
-------
plan : str
""" # noqa: E501
self._create_query_builders()
reranker_label = str(self._reranker) if self._reranker else "No reranker"
vector_plan = self._table._explain_plan(
self._vector_query.to_query_object(), verbose=verbose
)
fts_plan = self._table._explain_plan(
self._fts_query.to_query_object(), verbose=verbose
)
# Indent sub-plans under the reranker
indented_vector = "\n".join(" " + line for line in vector_plan.splitlines())
indented_fts = "\n".join(" " + line for line in fts_plan.splitlines())
return f"{reranker_label}\n {indented_vector}\n {indented_fts}"
def analyze_plan(self):
"""Execute the query and display with runtime metrics.
Returns
-------
plan : str
"""
self._create_query_builders()
results = ["Vector Search Plan:"]
results.append(self._table._analyze_plan(self._vector_query.to_query_object()))
results.append("FTS Search Plan:")
results.append(self._table._analyze_plan(self._fts_query.to_query_object()))
return "\n".join(results)
def _create_query_builders(self):
"""Set up and configure the vector and FTS query builders."""
vector_query, fts_query = self._validate_query(
self._query, self._vector, self._text
)
self._fts_query = LanceFtsQueryBuilder(
self._table, fts_query, fts_columns=self._fts_columns
)
vector_query = self._query_to_vector(
self._table, vector_query, self._vector_column
)
self._vector_query = LanceVectorQueryBuilder(
self._table, vector_query, self._vector_column
)
# Apply common configurations
if self._limit:
self._vector_query.limit(self._limit)
self._fts_query.limit(self._limit)
if self._columns:
self._vector_query.select(self._columns)
self._fts_query.select(self._columns)
if self._where:
self._vector_query.where(self._where, not self._postfilter)
self._fts_query.where(self._where, not self._postfilter)
if self._with_row_id:
self._vector_query.with_row_id(True)
self._fts_query.with_row_id(True)
if self._phrase_query:
self._fts_query.phrase_query(True)
if self._distance_type:
self._vector_query.metric(self._distance_type)
if self._minimum_nprobes:
self._vector_query.minimum_nprobes(self._minimum_nprobes)
if self._maximum_nprobes is not None:
self._vector_query.maximum_nprobes(self._maximum_nprobes)
if self._refine_factor:
self._vector_query.refine_factor(self._refine_factor)
if self._ef:
self._vector_query.ef(self._ef)
if self._bypass_vector_index:
self._vector_query.bypass_vector_index()
if self._lower_bound or self._upper_bound:
self._vector_query.distance_range(
lower_bound=self._lower_bound, upper_bound=self._upper_bound
)
if self._reranker is None:
self._reranker = RRFReranker()
class AsyncQueryBase(object):
"""
Base class for all async queries (take, scan, vector, fts, hybrid)
"""
def __init__(self, inner: Union[LanceQuery, LanceVectorQuery, LanceTakeQuery]):
"""
Construct an AsyncQueryBase
This method is not intended to be called directly. Instead, use the
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
"""
self._inner = inner
def to_query_object(self) -> Query:
"""
Convert the query into a query object
This is currently experimental but can be useful as the query object is pure
python and more easily serializable.
"""
return Query.from_inner(self._inner.to_query_request())
def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
"""
Return only the specified columns.
By default a query will return all columns from the table. However, this can
have a very significant impact on latency. LanceDb stores data in a columnar
fashion. This
means we can finely tune our I/O to select exactly the columns we need.
As a best practice you should always limit queries to the columns that you need.
If you pass in a list of column names then only those columns will be
returned.
You can also use this method to create new "dynamic" columns based on your
existing columns. For example, you may not care about "a" or "b" but instead
simply want "a + b". This is often seen in the SELECT clause of an SQL query
(e.g. `SELECT a+b FROM my_table`).
To create dynamic columns you can pass in a dict[str, str]. A column will be
returned for each entry in the map. The key provides the name of the column.
The value is an SQL string used to specify how the column is calculated.
For example, an SQL query might state `SELECT a + b AS combined, c`. The
equivalent input to this method would be `{"combined": "a + b", "c": "c"}`.
Columns will always be returned in the order given, even if that order is
different than the order used when adding the data.
"""
if isinstance(columns, list) and all(isinstance(c, str) for c in columns):
self._inner.select_columns(columns)
elif isinstance(columns, dict) and all(
isinstance(k, str) and isinstance(v, str) for k, v in columns.items()
):
self._inner.select(list(columns.items()))
else:
raise TypeError("columns must be a list of column names or a dict")
return self
def with_row_id(self) -> Self:
"""
Include the _rowid column in the results.
"""
self._inner.with_row_id()
return self
async def to_batches(
self,
*,
max_batch_length: Optional[int] = None,
timeout: Optional[timedelta] = None,
) -> AsyncRecordBatchReader:
"""
Execute the query and return the results as an Apache Arrow RecordBatchReader.
Parameters
----------
max_batch_length: Optional[int]
The maximum number of selected records in a single RecordBatch object.
If not specified, a default batch length is used.
It is possible for batches to be smaller than the provided length if the
underlying data is stored in smaller chunks.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return AsyncRecordBatchReader(
await self._inner.execute(
max_batch_length=max_batch_length, timeout=timeout
)
)
async def output_schema(self) -> pa.Schema:
"""
Return the output schema for the query
This does not execute the query.
"""
return await self._inner.output_schema()
async def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
"""
Execute the query and collect the results into an Apache Arrow Table.
This method will collect all results into memory before returning. If
you expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches]
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
batch_iter = await self.to_batches(timeout=timeout)
return pa.Table.from_batches(
await batch_iter.read_all(), schema=batch_iter.schema
)
async def to_list(self, timeout: Optional[timedelta] = None) -> List[dict]:
"""
Execute the query and return the results as a list of dictionaries.
Each list entry is a dictionary with the selected column names as keys,
or all table columns if `select` is not called. The vector and the "_distance"
fields are returned whether or not they're explicitly selected.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return (await self.to_arrow(timeout=timeout)).to_pylist()
async def to_pandas(
self,
flatten: Optional[Union[int, bool]] = None,
timeout: Optional[timedelta] = None,
) -> "pd.DataFrame":
"""
Execute the query and collect the results into a pandas DataFrame.
This method will collect all results into memory before returning. If you
expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
pandas separately.
Examples
--------
>>> import asyncio
>>> from lancedb import connect_async
>>> async def doctest_example():
... conn = await connect_async("./.lancedb")
... table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
... async for batch in await table.query().to_batches():
... batch_df = batch.to_pandas()
>>> asyncio.run(doctest_example())
Parameters
----------
flatten: Optional[Union[int, bool]]
If flatten is True, flatten all nested columns.
If flatten is an integer, flatten the nested columns up to the
specified depth.
If unspecified, do not flatten the nested columns.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return (
flatten_columns(await self.to_arrow(timeout=timeout), flatten)
).to_pandas()
async def to_polars(
self,
timeout: Optional[timedelta] = None,
) -> "pl.DataFrame":
"""
Execute the query and collect the results into a Polars DataFrame.
This method will collect all results into memory before returning. If you
expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
polars separately.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
Examples
--------
>>> import asyncio
>>> import polars as pl
>>> from lancedb import connect_async
>>> async def doctest_example():
... conn = await connect_async("./.lancedb")
... table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
... async for batch in await table.query().to_batches():
... batch_df = pl.from_arrow(batch)
>>> asyncio.run(doctest_example())
"""
import polars as pl
return pl.from_arrow(await self.to_arrow(timeout=timeout))
async def to_pydantic(
self, model: Type[LanceModel], *, timeout: Optional[timedelta] = None
) -> List[LanceModel]:
"""
Convert results to a list of pydantic models.
Parameters
----------
model : Type[LanceModel]
The pydantic model to use.
timeout : timedelta, optional
The maximum time to wait for the query to complete.
If None, wait indefinitely.
Returns
-------
list[LanceModel]
"""
return [
model(**row) for row in (await self.to_arrow(timeout=timeout)).to_pylist()
]
async def explain_plan(self, verbose: Optional[bool] = False):
"""Return the execution plan for this query.
Examples
--------
>>> import asyncio
>>> from lancedb import connect_async
>>> async def doctest_example():
... conn = await connect_async("./.lancedb")
... table = await conn.create_table("my_table", [{"vector": [99.0, 99.0]}])
... plan = await table.query().nearest_to([1.0, 2.0]).explain_plan(True)
... print(plan)
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...
<BLANKLINE>
Parameters
----------
verbose : bool, default False
Use a verbose output format.
Returns
-------
plan : str
""" # noqa: E501
return await self._inner.explain_plan(verbose)
async def analyze_plan(self):
"""Execute the query and display with runtime metrics.
Returns
-------
plan : str
"""
return await self._inner.analyze_plan()
class AsyncStandardQuery(AsyncQueryBase):
"""
Base class for "standard" async queries (all but take currently)
"""
def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
"""
Construct an AsyncStandardQuery
This method is not intended to be called directly. Instead, use the
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
"""
super().__init__(inner)
def where(self, predicate: str) -> Self:
"""
Only return rows matching the given predicate
The predicate should be supplied as an SQL query string.
Examples
--------
>>> predicate = "x > 10"
>>> predicate = "y > 0 AND y < 100"
>>> predicate = "x > 5 OR y = 'test'"
Filtering performance can often be improved by creating a scalar index
on the filter column(s).
"""
self._inner.where(predicate)
return self
def limit(self, limit: int) -> Self:
"""
Set the maximum number of results to return.
By default, a plain search has no limit. If this method is not
called then every valid row from the table will be returned.
"""
self._inner.limit(limit)
return self
def offset(self, offset: int) -> Self:
"""
Set the offset for the results.
Parameters
----------
offset: int
The offset to start fetching results from.
"""
self._inner.offset(offset)
return self
def fast_search(self) -> Self:
"""
Skip searching un-indexed data.
This can make queries faster, but will miss any data that has not been
indexed.
!!! tip
You can add new data into an existing index by calling
[AsyncTable.optimize][lancedb.table.AsyncTable.optimize].
"""
self._inner.fast_search()
return self
def postfilter(self) -> Self:
"""
If this is called then filtering will happen after the search instead of
before.
By default filtering will be performed before the search. This is how
filtering is typically understood to work. This prefilter step does add some
additional latency. Creating a scalar index on the filter column(s) can
often improve this latency. However, sometimes a filter is too complex or
scalar indices cannot be applied to the column. In these cases postfiltering
can be used instead of prefiltering to improve latency.
Post filtering applies the filter to the results of the search. This
means we only run the filter on a much smaller set of data. However, it can
cause the query to return fewer than `limit` results (or even no results) if
none of the nearest results match the filter.
Post filtering happens during the "refine stage" (described in more detail in
@see {@link VectorQuery#refineFactor}). This means that setting a higher refine
factor can often help restore some of the results lost by post filtering.
"""
self._inner.postfilter()
return self
class AsyncQuery(AsyncStandardQuery):
def __init__(self, inner: LanceQuery):
"""
Construct an AsyncQuery
This method is not intended to be called directly. Instead, use the
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
"""
super().__init__(inner)
self._inner = inner
@classmethod
def _query_vec_to_array(self, vec: Union[VEC, Tuple]):
if isinstance(vec, list):
return pa.array(vec)
if isinstance(vec, np.ndarray):
return pa.array(vec)
if isinstance(vec, pa.Array):
return vec
if isinstance(vec, pa.ChunkedArray):
return vec.combine_chunks()
if isinstance(vec, tuple):
return pa.array(vec)
# We've checked everything we formally support in our typings
# but, as a fallback, let pyarrow try and convert it anyway.
# This can allow for some more exotic things like iterables
return pa.array(vec)
def nearest_to(
self,
query_vector: Union[VEC, Tuple, List[VEC]],
) -> AsyncVectorQuery:
"""
Find the nearest vectors to the given query vector.
This converts the query from a plain query to a vector query.
This method will attempt to convert the input to the query vector
expected by the embedding model. If the input cannot be converted
then an error will be thrown.
By default, there is no embedding model, and the input should be
something that can be converted to a pyarrow array of floats. This
includes lists, numpy arrays, and tuples.
If there is only one vector column (a column whose data type is a
fixed size list of floats) then the column does not need to be specified.
If there is more than one vector column you must use
[AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
which column you would like to compare with.
If no index has been created on the vector column then a vector query
will perform a distance comparison between the query vector and every
vector in the database and then sort the results. This is sometimes
called a "flat search"
For small databases, with tens of thousands of vectors or less, this can
be reasonably fast. In larger databases you should create a vector index
on the column. If there is a vector index then an "approximate" nearest
neighbor search (frequently called an ANN search) will be performed. This
search is much faster, but the results will be approximate.
The query can be further parameterized using the returned builder. There
are various ANN search parameters that will let you fine tune your recall
accuracy vs search latency.
Vector searches always have a [limit][]. If `limit` has not been called then
a default `limit` of 10 will be used.
Typically, a single vector is passed in as the query. However, you can also
pass in multiple vectors. When multiple vectors are passed in, if the vector
column is with multivector type, then the vectors will be treated as a single
query. Or the vectors will be treated as multiple queries, this can be useful
if you want to find the nearest vectors to multiple query vectors.
This is not expected to be faster than making multiple queries concurrently;
it is just a convenience method. If multiple vectors are passed in then
an additional column `query_index` will be added to the results. This column
will contain the index of the query vector that the result is nearest to.
"""
if query_vector is None:
raise ValueError("query_vector can not be None")
if (
isinstance(query_vector, (list, np.ndarray, pa.Array))
and len(query_vector) > 0
and isinstance(query_vector[0], (list, np.ndarray, pa.Array))
):
# multiple have been passed
query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
new_self = self._inner.nearest_to(query_vectors[0])
for v in query_vectors[1:]:
new_self.add_query_vector(v)
return AsyncVectorQuery(new_self)
else:
return AsyncVectorQuery(
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
)
def nearest_to_text(
self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
) -> AsyncFTSQuery:
"""
Find the documents that are most relevant to the given text query.
This method will perform a full text search on the table and return
the most relevant documents. The relevance is determined by BM25.
The columns to search must be with native FTS index
(Tantivy-based can't work with this method).
By default, all indexed columns are searched,
now only one column can be searched at a time.
Parameters
----------
query: str
The text query to search for.
columns: str or list of str, default None
The columns to search in. If None, all indexed columns are searched.
For now only one column can be searched at a time.
"""
if isinstance(columns, str):
columns = [columns]
if columns is None:
columns = []
if isinstance(query, str):
return AsyncFTSQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)
# FullTextQuery object
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))
class AsyncFTSQuery(AsyncStandardQuery):
"""A query for full text search for LanceDB."""
def __init__(self, inner: LanceFTSQuery):
super().__init__(inner)
self._inner = inner
self._reranker = None
def get_query(self) -> str:
return self._inner.get_query()
def rerank(
self,
reranker: Reranker = RRFReranker(),
) -> AsyncFTSQuery:
if reranker and not isinstance(reranker, Reranker):
raise ValueError("reranker must be an instance of Reranker class.")
self._reranker = reranker
return self
def nearest_to(
self,
query_vector: Union[VEC, Tuple, List[VEC]],
) -> AsyncHybridQuery:
"""
In addition doing text search on the LanceDB Table, also
find the nearest vectors to the given query vector.
This converts the query from a FTS Query to a Hybrid query. Results
from the vector search will be combined with results from the FTS query.
This method will attempt to convert the input to the query vector
expected by the embedding model. If the input cannot be converted
then an error will be thrown.
By default, there is no embedding model, and the input should be
something that can be converted to a pyarrow array of floats. This
includes lists, numpy arrays, and tuples.
If there is only one vector column (a column whose data type is a
fixed size list of floats) then the column does not need to be specified.
If there is more than one vector column you must use
[AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
which column you would like to compare with.
If no index has been created on the vector column then a vector query
will perform a distance comparison between the query vector and every
vector in the database and then sort the results. This is sometimes
called a "flat search"
For small databases, with tens of thousands of vectors or less, this can
be reasonably fast. In larger databases you should create a vector index
on the column. If there is a vector index then an "approximate" nearest
neighbor search (frequently called an ANN search) will be performed. This
search is much faster, but the results will be approximate.
The query can be further parameterized using the returned builder. There
are various ANN search parameters that will let you fine tune your recall
accuracy vs search latency.
Hybrid searches always have a [limit][]. If `limit` has not been called then
a default `limit` of 10 will be used.
Typically, a single vector is passed in as the query. However, you can also
pass in multiple vectors. This can be useful if you want to find the nearest
vectors to multiple query vectors. This is not expected to be faster than
making multiple queries concurrently; it is just a convenience method.
If multiple vectors are passed in then an additional column `query_index`
will be added to the results. This column will contain the index of the
query vector that the result is nearest to.
"""
if query_vector is None:
raise ValueError("query_vector can not be None")
if (
isinstance(query_vector, list)
and len(query_vector) > 0
and not isinstance(query_vector[0], (float, int))
):
# multiple have been passed
query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
new_self = self._inner.nearest_to(query_vectors[0])
for v in query_vectors[1:]:
new_self.add_query_vector(v)
return AsyncHybridQuery(new_self)
else:
return AsyncHybridQuery(
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
)
async def to_batches(
self,
*,
max_batch_length: Optional[int] = None,
timeout: Optional[timedelta] = None,
) -> AsyncRecordBatchReader:
reader = await super().to_batches(timeout=timeout)
results = pa.Table.from_batches(await reader.read_all(), reader.schema)
if self._reranker:
results = self._reranker.rerank_fts(self.get_query(), results)
return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
class AsyncVectorQueryBase:
def column(self, column: str) -> Self:
"""
Set the vector column to query
This controls which column is compared to the query vector supplied in
the call to [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to].
This parameter must be specified if the table has more than one column
whose data type is a fixed-size-list of floats.
"""
self._inner.column(column)
return self
def nprobes(self, nprobes: int) -> Self:
"""
Set the number of partitions to search (probe)
This argument is only used when the vector column has an IVF-based index.
If there is no index then this value is ignored.
The IVF stage of IVF PQ divides the input into partitions (clusters) of
related values.
The partition whose centroids are closest to the query vector will be
exhaustiely searched to find matches. This parameter controls how many
partitions should be searched.
Increasing this value will increase the recall of your query but will
also increase the latency of your query. The default value is 20. This
default is good for many cases but the best value to use will depend on
your data and the recall that you need to achieve.
For best results we recommend tuning this parameter with a benchmark against
your actual data to find the smallest possible value that will still give
you the desired recall.
"""
self._inner.nprobes(nprobes)
return self
def minimum_nprobes(self, minimum_nprobes: int) -> Self:
"""Set the minimum number of probes to use.
See `nprobes` for more details.
These partitions will be searched on every indexed vector query and will
increase recall at the expense of latency.
"""
self._inner.minimum_nprobes(minimum_nprobes)
return self
def maximum_nprobes(self, maximum_nprobes: int) -> Self:
"""Set the maximum number of probes to use.
See `nprobes` for more details.
If this value is greater than `minimum_nprobes` then the excess partitions
will be searched only if we have not found enough results.
This can be useful when there is a narrow filter to allow these queries to
spend more time searching and avoid potential false negatives.
If this value is 0 then no limit will be applied and all partitions could be
searched if needed to satisfy the limit.
"""
self._inner.maximum_nprobes(maximum_nprobes)
return self
def distance_range(
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
) -> Self:
"""Set the distance range to use.
Only rows with distances within range [lower_bound, upper_bound)
will be returned.
Parameters
----------
lower_bound: Optional[float]
The lower bound of the distance range.
upper_bound: Optional[float]
The upper bound of the distance range.
Returns
-------
AsyncVectorQuery
The AsyncVectorQuery object.
"""
self._inner.distance_range(lower_bound, upper_bound)
return self
def ef(self, ef: int) -> Self:
"""
Set the number of candidates to consider during search
This argument is only used when the vector column has an HNSW index.
If there is no index then this value is ignored.
Increasing this value will increase the recall of your query but will also
increase the latency of your query. The default value is 1.5 * limit. This
default is good for many cases but the best value to use will depend on your
data and the recall that you need to achieve.
"""
self._inner.ef(ef)
return self
def refine_factor(self, refine_factor: int) -> Self:
"""
A multiplier to control how many additional rows are taken during the refine
step
This argument is only used when the vector column has an IVF PQ index.
If there is no index then this value is ignored.
An IVF PQ index stores compressed (quantized) values. They query vector is
compared against these values and, since they are compressed, the comparison is
inaccurate.
This parameter can be used to refine the results. It can improve both improve
recall and correct the ordering of the nearest results.
To refine results LanceDb will first perform an ANN search to find the nearest
`limit` * `refine_factor` results. In other words, if `refine_factor` is 3 and
`limit` is the default (10) then the first 30 results will be selected. LanceDb
then fetches the full, uncompressed, values for these 30 results. The results
are then reordered by the true distance and only the nearest 10 are kept.
Note: there is a difference between calling this method with a value of 1 and
never calling this method at all. Calling this method with any value will have
an impact on your search latency. When you call this method with a
`refine_factor` of 1 then LanceDb still needs to fetch the full, uncompressed,
values so that it can potentially reorder the results.
Note: if this method is NOT called then the distances returned in the _distance
column will be approximate distances based on the comparison of the quantized
query vector and the quantized result vectors. This can be considerably
different than the true distance between the query vector and the actual
uncompressed vector.
"""
self._inner.refine_factor(refine_factor)
return self
def distance_type(self, distance_type: str) -> Self:
"""
Set the distance metric to use
When performing a vector search we try and find the "nearest" vectors according
to some kind of distance metric. This parameter controls which distance metric
to use. See @see {@link IvfPqOptions.distanceType} for more details on the
different distance metrics available.
Note: if there is a vector index then the distance type used MUST match the
distance type used to train the vector index. If this is not done then the
results will be invalid.
By default "l2" is used.
"""
self._inner.distance_type(distance_type)
return self
def bypass_vector_index(self) -> Self:
"""
If this is called then any vector index is skipped
An exhaustive (flat) search will be performed. The query vector will
be compared to every vector in the table. At high scales this can be
expensive. However, this is often still useful. For example, skipping
the vector index can give you ground truth results which you can use to
calculate your recall to select an appropriate value for nprobes.
"""
self._inner.bypass_vector_index()
return self
class AsyncVectorQuery(AsyncStandardQuery, AsyncVectorQueryBase):
def __init__(self, inner: LanceVectorQuery):
"""
Construct an AsyncVectorQuery
This method is not intended to be called directly. Instead, create
a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to]] to convert to
a vector query. Or you can use
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
"""
super().__init__(inner)
self._inner = inner
self._reranker = None
self._query_string = None
def rerank(
self, reranker: Reranker = RRFReranker(), query_string: Optional[str] = None
) -> AsyncHybridQuery:
if reranker and not isinstance(reranker, Reranker):
raise ValueError("reranker must be an instance of Reranker class.")
self._reranker = reranker
if not self._query_string and not query_string:
raise ValueError("query_string must be provided to rerank the results.")
self._query_string = query_string
return self
def nearest_to_text(
self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
) -> AsyncHybridQuery:
"""
Find the documents that are most relevant to the given text query,
in addition to vector search.
This converts the vector query into a hybrid query.
This search will perform a full text search on the table and return
the most relevant documents, combined with the vector query results.
The text relevance is determined by BM25.
The columns to search must be with native FTS index
(Tantivy-based can't work with this method).
By default, all indexed columns are searched,
now only one column can be searched at a time.
Parameters
----------
query: str
The text query to search for.
columns: str or list of str, default None
The columns to search in. If None, all indexed columns are searched.
For now only one column can be searched at a time.
"""
if isinstance(columns, str):
columns = [columns]
if columns is None:
columns = []
if isinstance(query, str):
return AsyncHybridQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)
# FullTextQuery object
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query}))
async def to_batches(
self,
*,
max_batch_length: Optional[int] = None,
timeout: Optional[timedelta] = None,
) -> AsyncRecordBatchReader:
reader = await super().to_batches(timeout=timeout)
results = pa.Table.from_batches(await reader.read_all(), reader.schema)
if self._reranker:
results = self._reranker.rerank_vector(self._query_string, results)
return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
class AsyncHybridQuery(AsyncStandardQuery, AsyncVectorQueryBase):
"""
A query builder that performs hybrid vector and full text search.
Results are combined and reranked based on the specified reranker.
By default, the results are reranked using the RRFReranker, which
uses reciprocal rank fusion score for reranking.
To make the vector and fts results comparable, the scores are normalized.
Instead of normalizing scores, the `normalize` parameter can be set to "rank"
in the `rerank` method to convert the scores to ranks and then normalize them.
"""
def __init__(self, inner: LanceHybridQuery):
super().__init__(inner)
self._inner = inner
self._norm = "score"
self._reranker = RRFReranker()
def rerank(
self, reranker: Reranker = RRFReranker(), normalize: str = "score"
) -> AsyncHybridQuery:
"""
Rerank the hybrid search results using the specified reranker. The reranker
must be an instance of Reranker class.
Parameters
----------
reranker: Reranker, default RRFReranker()
The reranker to use. Must be an instance of Reranker class.
normalize: str, default "score"
The method to normalize the scores. Can be "rank" or "score". If "rank",
the scores are converted to ranks and then normalized. If "score", the
scores are normalized directly.
Returns
-------
AsyncHybridQuery
The AsyncHybridQuery object.
"""
if normalize not in ["rank", "score"]:
raise ValueError("normalize must be 'rank' or 'score'.")
if reranker and not isinstance(reranker, Reranker):
raise ValueError("reranker must be an instance of Reranker class.")
self._norm = normalize
self._reranker = reranker
return self
async def to_batches(
self,
*,
max_batch_length: Optional[int] = None,
timeout: Optional[timedelta] = None,
) -> AsyncRecordBatchReader:
fts_query = AsyncFTSQuery(self._inner.to_fts_query())
vec_query = AsyncVectorQuery(self._inner.to_vector_query())
# save the row ID choice that was made on the query builder and force it
# to actually fetch the row ids because we need this for reranking
with_row_ids = self._inner.get_with_row_id()
fts_query.with_row_id()
vec_query.with_row_id()
fts_results, vector_results = await asyncio.gather(
fts_query.to_arrow(timeout=timeout),
vec_query.to_arrow(timeout=timeout),
)
result = LanceHybridQueryBuilder._combine_hybrid_results(
fts_results=fts_results,
vector_results=vector_results,
norm=self._norm,
fts_query=fts_query.get_query(),
reranker=self._reranker,
limit=self._inner.get_limit(),
with_row_ids=with_row_ids,
)
return AsyncRecordBatchReader(result, max_batch_length=max_batch_length)
async def explain_plan(self, verbose: Optional[bool] = False):
"""Return the execution plan for this query.
The output includes both the vector and FTS search plans.
Examples
--------
>>> import asyncio
>>> from lancedb import connect_async
>>> from lancedb.index import FTS
>>> async def doctest_example():
... conn = await connect_async("./.lancedb")
... table = await conn.create_table("my_table", [{"vector": [99.0, 99.0], "text": "hello world"}])
... await table.create_index("text", config=FTS(with_position=False))
... plan = await table.query().nearest_to([1.0, 2.0]).nearest_to_text("hello").explain_plan(True)
... print(plan)
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
RRFReranker(K=60)
ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
Take: columns="vector, _rowid, _distance, (text)"
CoalesceBatchesExec: target_batch_size=1024
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...
ProjectionExec: expr=[vector@2 as vector, text@3 as text, _score@1 as _score]
Take: columns="_rowid, _score, (vector), (text)"
CoalesceBatchesExec: target_batch_size=1024
GlobalLimitExec: skip=0, fetch=10
MatchQuery: column=text, query=hello
Parameters
----------
verbose : bool, default False
Use a verbose output format.
Returns
-------
plan : str
""" # noqa: E501
vector_plan = await self._inner.to_vector_query().explain_plan(verbose)
fts_plan = await self._inner.to_fts_query().explain_plan(verbose)
# Indent sub-plans under the reranker
indented_vector = "\n".join(" " + line for line in vector_plan.splitlines())
indented_fts = "\n".join(" " + line for line in fts_plan.splitlines())
return f"{self._reranker}\n {indented_vector}\n {indented_fts}"
async def analyze_plan(self):
"""
Execute the query and return the physical execution plan with runtime metrics.
This runs both the vector and FTS (full-text search) queries and returns
detailed metrics for each step of execution—such as rows processed,
elapsed time, I/O stats, and more. Its useful for debugging and
performance analysis.
Returns
-------
plan : str
"""
results = ["Vector Search Query:"]
results.append(await self._inner.to_vector_query().analyze_plan())
results.append("FTS Search Query:")
results.append(await self._inner.to_fts_query().analyze_plan())
return "\n".join(results)
class AsyncTakeQuery(AsyncQueryBase):
"""
Builder for parameterizing and executing take queries.
"""
def __init__(self, inner: LanceTakeQuery):
super().__init__(inner)
class BaseQueryBuilder(object):
"""
Wraps AsyncQueryBase and provides a synchronous interface
"""
def __init__(self, inner: AsyncQueryBase):
self._inner = inner
def to_query_object(self) -> Query:
return self._inner.to_query_object()
def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
"""
Return only the specified columns.
By default a query will return all columns from the table. However, this can
have a very significant impact on latency. LanceDb stores data in a columnar
fashion. This
means we can finely tune our I/O to select exactly the columns we need.
As a best practice you should always limit queries to the columns that you need.
If you pass in a list of column names then only those columns will be
returned.
You can also use this method to create new "dynamic" columns based on your
existing columns. For example, you may not care about "a" or "b" but instead
simply want "a + b". This is often seen in the SELECT clause of an SQL query
(e.g. `SELECT a+b FROM my_table`).
To create dynamic columns you can pass in a dict[str, str]. A column will be
returned for each entry in the map. The key provides the name of the column.
The value is an SQL string used to specify how the column is calculated.
For example, an SQL query might state `SELECT a + b AS combined, c`. The
equivalent input to this method would be `{"combined": "a + b", "c": "c"}`.
Columns will always be returned in the order given, even if that order is
different than the order used when adding the data.
"""
self._inner.select(columns)
return self
def with_row_id(self) -> Self:
"""
Include the _rowid column in the results.
"""
self._inner.with_row_id()
return self
def output_schema(self) -> pa.Schema:
"""
Return the output schema for the query
This does not execute the query.
"""
return LOOP.run(self._inner.output_schema())
def to_batches(
self,
*,
max_batch_length: Optional[int] = None,
timeout: Optional[timedelta] = None,
) -> pa.RecordBatchReader:
"""
Execute the query and return the results as an Apache Arrow RecordBatchReader.
Parameters
----------
max_batch_length: Optional[int]
The maximum number of selected records in a single RecordBatch object.
If not specified, a default batch length is used.
It is possible for batches to be smaller than the provided length if the
underlying data is stored in smaller chunks.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
async_iter = LOOP.run(self._inner.execute(max_batch_length, timeout))
def iter_sync():
try:
while True:
yield LOOP.run(async_iter.__anext__())
except StopAsyncIteration:
return
return pa.RecordBatchReader.from_batches(async_iter.schema, iter_sync())
def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
"""
Execute the query and collect the results into an Apache Arrow Table.
This method will collect all results into memory before returning. If
you expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches]
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return LOOP.run(self._inner.to_arrow(timeout))
def to_list(self, timeout: Optional[timedelta] = None) -> List[dict]:
"""
Execute the query and return the results as a list of dictionaries.
Each list entry is a dictionary with the selected column names as keys,
or all table columns if `select` is not called. The vector and the "_distance"
fields are returned whether or not they're explicitly selected.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return LOOP.run(self._inner.to_list(timeout))
def to_pandas(
self,
flatten: Optional[Union[int, bool]] = None,
timeout: Optional[timedelta] = None,
) -> "pd.DataFrame":
"""
Execute the query and collect the results into a pandas DataFrame.
This method will collect all results into memory before returning. If you
expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
pandas separately.
Examples
--------
>>> import asyncio
>>> from lancedb import connect_async
>>> async def doctest_example():
... conn = await connect_async("./.lancedb")
... table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
... async for batch in await table.query().to_batches():
... batch_df = batch.to_pandas()
>>> asyncio.run(doctest_example())
Parameters
----------
flatten: Optional[Union[int, bool]]
If flatten is True, flatten all nested columns.
If flatten is an integer, flatten the nested columns up to the
specified depth.
If unspecified, do not flatten the nested columns.
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
"""
return LOOP.run(self._inner.to_pandas(flatten, timeout))
def to_polars(
self,
timeout: Optional[timedelta] = None,
) -> "pl.DataFrame":
"""
Execute the query and collect the results into a Polars DataFrame.
This method will collect all results into memory before returning. If you
expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
polars separately.
Parameters
----------
timeout: Optional[timedelta]
The maximum time to wait for the query to complete.
If not specified, no timeout is applied. If the query does not
complete within the specified time, an error will be raised.
Examples
--------
>>> import asyncio
>>> import polars as pl
>>> from lancedb import connect_async
>>> async def doctest_example():
... conn = await connect_async("./.lancedb")
... table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
... async for batch in await table.query().to_batches():
... batch_df = pl.from_arrow(batch)
>>> asyncio.run(doctest_example())
"""
return LOOP.run(self._inner.to_polars(timeout))
def explain_plan(self, verbose: Optional[bool] = False):
"""Return the execution plan for this query.
Examples
--------
>>> import asyncio
>>> from lancedb import connect_async
>>> async def doctest_example():
... conn = await connect_async("./.lancedb")
... table = await conn.create_table("my_table", [{"vector": [99.0, 99.0]}])
... plan = await table.query().nearest_to([1.0, 2.0]).explain_plan(True)
... print(plan)
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceRead: uri=..., projection=[vector], ...
<BLANKLINE>
Parameters
----------
verbose : bool, default False
Use a verbose output format.
Returns
-------
plan : str
""" # noqa: E501
return LOOP.run(self._inner.explain_plan(verbose))
def analyze_plan(self):
"""Execute the query and display with runtime metrics.
Returns
-------
plan : str
"""
return LOOP.run(self._inner.analyze_plan())
class LanceTakeQueryBuilder(BaseQueryBuilder):
"""
Builder for parameterizing and executing take queries.
"""
def __init__(self, inner: AsyncTakeQuery):
super().__init__(inner)